In [None]:
from process_twarc.util import concat_dataset, get_all_files
from modules.build_corpus import build_chunked_corpus 
import pandas as pd
import os

### Define the variables.

In [None]:
# Input and output locations.
paths = {
    # Folder holding the tokenized Tweets; change this to your own Tweets folder.
    "data": "../../data/tweets/3-tokenized",
    # Folder where the chunked subcorpus is saved. It will be generated for you if it doesn't exist.
    "corpus_output": "process",
}
# Look each path up by key so the assignment does not depend on dict ordering.
input_dir = paths["data"]
output_dir = paths["corpus_output"]

# Arguments for build_chunked_corpus: sample 0.1% of the dataset, 1 time,
# with the resulting corpus divided into 10 chunks.
parameters = {
    "sample_frac": 0.001,
    "num_epochs": 1,
    "num_chunks": 10,
}
sample_frac = parameters["sample_frac"]
num_epochs = parameters["num_epochs"]
num_chunks = parameters["num_chunks"]

### Execute the procedure.

In [None]:
# Gather the keyword arguments first, then build the chunked subcorpus from them.
corpus_kwargs = {
    "data_dir": input_dir,
    "output_dir": output_dir,
    "sample_frac": sample_frac,
    "num_epochs": num_epochs,
    "num_chunks": num_chunks,
}
build_chunked_corpus(**corpus_kwargs)

## Demonstrate that an error occurred.

### From the dataset used to construct the subcorpus, generate a set of all the tweet_ids.

In [None]:
# Get all the file paths from the input directory.
file_paths = get_all_files(input_dir)

# Concatenate a single-column dataset holding just the tweet_ids.
dataset = concat_dataset(
    file_paths=file_paths,
    output_type="Dataset",
    columns="tweet_id"
)

# Normalise every ID to str. check_tweet_ids casts the subcorpus IDs to str
# before comparing, so any non-string ID kept here would never match and
# would be counted as a copy error.
all_ids = {str(tweet_id) for tweet_id in dataset["tweet_id"]}
print()
print(f"Total Tweets: {len(all_ids)}")
print(f"Sample: {list(all_ids)[:5]}")

### Check the chunked subcorpus for tweet_ids that are absent from all_ids.

In [None]:
def get_chunks(output_dir):
    """Return the names of the chunk folders inside ``output_dir``.

    Only non-hidden sub-directories are returned, in sorted order. Plain
    files and hidden folders (e.g. ``.DS_Store``, ``.ipynb_checkpoints``)
    are skipped because they are not chunks and cannot be loaded as one.

    Args:
        output_dir: Folder that contains one sub-folder per chunk.

    Returns:
        Sorted list of sub-folder names (not full paths).
    """
    return sorted(
        name
        for name in os.listdir(output_dir)
        if not name.startswith(".") and os.path.isdir(os.path.join(output_dir, name))
    )

def load_subcorpus (chunk, base_dir=None):
    """Load one chunk's ``subcorpus.jsonl`` into a DataFrame.

    Args:
        chunk: Name of the chunk folder.
        base_dir: Folder that contains the chunk folders. Defaults to the
            notebook-level ``output_dir``.

    Returns:
        pandas.DataFrame with one row per JSON line; the ``tweet_id``
        column is read as str.
    """
    if base_dir is None:
        base_dir = output_dir
    # Force tweet_id to str. With the default dtype inference, pandas tries to
    # turn numeric-looking strings into numbers by way of float64, which cannot
    # represent 19-digit Tweet IDs exactly, so the IDs would come back altered.
    return pd.read_json(
        f"{base_dir}/{chunk}/subcorpus.jsonl",
        lines=True,
        encoding="utf-8",
        dtype={"tweet_id": str},
    )

def check_tweet_ids(subcorpus, reference_ids=None):
    """Print how many tweet_ids in a subcorpus are absent from the source IDs.

    Args:
        subcorpus: DataFrame with a ``tweet_id`` column.
        reference_ids: Set of source IDs, as str, to compare against.
            Defaults to the notebook-level ``all_ids``.

    Returns:
        None. The summary is printed.
    """
    if reference_ids is None:
        reference_ids = all_ids

    output_ids = set(subcorpus["tweet_id"].astype("str").to_list())
    error_ids = output_ids.difference(reference_ids)

    # Ratio is taken over the IDs in this subcorpus, and is left as a plain
    # fraction: the "%" format spec multiplies by 100 itself.
    error_ratio = len(error_ids) / len(output_ids) if output_ids else 0.0

    print()
    print("Total Tweets:", len(output_ids))
    print("Copied with error:", len(error_ids))
    print("Error Ratio: {:.2%}".format(error_ratio))
    # Sorted so the sample is the same on every run.
    print("Sample of Erroneous IDs", sorted(error_ids)[:5])
    print()
    return

# Run the tweet_id check on each chunk found in the output directory.
chunks = get_chunks(output_dir)
for chunk in chunks:
    print(f"Checking {chunk}")
    check_tweet_ids(load_subcorpus(chunk))

When I ran the procedure above, I got an error rate of more than 40%.

Rather than compiling the tweet_ids from all flagged duplicates, I compiled the text.

I suspect that there were errors in copying over the text as well. However, as most duplicates are identical rather than near-duplicates,
compiling text was still effective for filtering.

So, at the moment, this particular issue is not a high priority. However, I've got to iron this out before I can consider
publishing this code.