In [2]:
import gzip
import pickle

# Function to load the datasets

In [3]:
def load_and_unzip_txt_gz(file_path):
    filename ="subset.txt.gz"
    with gzip.open(file_path, 'rt', errors='ignore') as file:
        content = [line.strip() for line in file]
    return content

# Functions to do the Data Cleaning

In [6]:
# Count number of objects and links in the data
def count_objects_and_links(data):
    object_count = 0
    link_count = 0
    for element in data:
        if element.startswith('P'):
            object_count += 1
        elif element.startswith('L'):
            link_count += 1
    return object_count, link_count

#remove lines with more than n=40 links or without any text (Q):
def remove_objects_with_conditions(data):
    new_data = []
    i = 0
    removed_link_count = 0
    removed_no_q_count = 0

    while i < len(data):
        if data[i].startswith('P'):
            link_count = 0
            has_q = False
            j = i + 1
            while j < len(data) and not data[j].startswith('P'):
                if data[j].startswith('L'):
                    link_count += 1
                elif data[j].startswith('Q'):
                    has_q = True
                j += 1
            
            if link_count > 50:
                removed_link_count += 1
            elif not has_q:
                removed_no_q_count += 1
            else:
                new_data.extend(data[i:j])

            i = j
        else:
            i += 1

    return removed_link_count, removed_no_q_count, new_data

#remove links to pages which are not present as P
def remove_links_to_nonexistent_objects(data):
    new_data = []
    object_links = {line.split("\t")[1] for line in data if line.startswith("P")}
    n_removed_objects = 0

    i = 0
    while i < len(data):
        if data[i].startswith('P'):
            new_data.append(data[i])
            j = i + 1
            while j < len(data) and not data[j].startswith('P'):
                if data[j].startswith('L'):
                    link = data[j].split("\t")[1]
                    if link in object_links:
                        new_data.append(data[j])
                    else:
                        n_removed_objects += 1
                else:
                    new_data.append(data[j])
                j += 1
            i = j
        else:
            i += 1

    print(f"Number of removed links to pages outside of the dataset: {n_removed_objects}")
    return new_data

#remove objects without incoming or outgoing link
def remove_nolink_objects(data):
    linked_objects = set()

    for line in data:
        if line.startswith("L"):
            _, link = line.split("\t", 1)
            linked_objects.add(link)

    new_data = []
    current_object = []
    removed_objects_count = 0

    for line in data:
        if line.startswith("P"):
            if current_object:
                if current_object[0].split("\t", 1)[1] in linked_objects:
                    new_data.extend(current_object)
                else:
                    removed_objects_count += 1
            current_object = []
        current_object.append(line)

    if current_object:
        if current_object[0].split("\t", 1)[1] in linked_objects:
            new_data.extend(current_object)
        else:
            removed_objects_count += 1

    print(f"Removed objects without incoming or outgoing link: {removed_objects_count}")
    return new_data

In [7]:
def data_cleaning(data):
    object_count, link_count = count_objects_and_links(data)
    print(f"Number of objects in the list: {object_count}")
    print(f"Total number of links in the list: {link_count}")
    removed_link_count, removed_no_q_count, filtered_data = remove_objects_with_conditions(data)
    print(f"Removed objects with more than 40 links: {removed_link_count}")
    print(f"Removed objects with no 'Q': {removed_no_q_count}")
    filtered_data = remove_links_to_nonexistent_objects(filtered_data)
    filtered_data = remove_nolink_objects(filtered_data)
    return filtered_data

# Load and Clean Data

In [8]:
# Define a list of filenames
filenames = ['quotes_2008-08.txt.gz', 'quotes_2008-09.txt.gz', 'quotes_2008-10.txt.gz', 'quotes_2008-11.txt.gz', 'quotes_2008-12.txt.gz', 'quotes_2009-01.txt.gz', 'quotes_2009-02.txt.gz', 'quotes_2009-03.txt.gz']

# Define an empty list to store the results
content = []

# Loop through the filenames and process each file
for filename in filenames:
    print(filename)
    result = load_and_unzip_txt_gz(filename)
    # Append the result to the list of results
    result = data_cleaning(result)
    content.append(result)

quotes_2008-08.txt.gz
Number of objects in the list: 7340810
Total number of links in the list: 27819202
Removed objects with more than 40 links: 123658
Removed objects with no 'Q': 2512711
Number of removed links to pages outside of the dataset: 3304977
Removed objects without incoming or outgoing link: 4558956
quotes_2008-09.txt.gz
Number of objects in the list: 9464211
Total number of links in the list: 64387422
Removed objects with more than 40 links: 110099
Removed objects with no 'Q': 4812550
Number of removed links to pages outside of the dataset: 5570874
Removed objects without incoming or outgoing link: 4327657
quotes_2008-10.txt.gz
Number of objects in the list: 10024181
Total number of links in the list: 36700694
Removed objects with more than 40 links: 114201
Removed objects with no 'Q': 5355912
Number of removed links to pages outside of the dataset: 6198301
Removed objects without incoming or outgoing link: 4334483
quotes_2008-11.txt.gz
Number of objects in the list: 9012

# Save/load as pickle file

In [19]:
#flatten the list
content = [item for sublist in content for item in sublist]

# Save the list to a file
with open('content_full.pickle', 'wb') as f:
    pickle.dump(content, f)

In [11]:
# Load the list from the file
with open('content_full.pickle', 'rb') as f:
    content = pickle.load(f)

In [20]:
# Count number of objects and links in the data
count_objects_and_links(content)

(1672913, 2132525)