In [21]:
from process_twarc.util import load_parquet
from modules.build_corpus import build_chunked_corpus
import pandas as pd
import os

##Define the variables.

In [22]:
#Paths to inputs and outputs.
paths = {
    "data": "../../data/tweets/3-tokenized",
    "corpus_output": "process",
    "tweet_ids": "../../data/corpus-analysis/ids/tweet_ids.parquet"
}
input_dir, output_dir, path_to_tweet_ids = paths.values()

#To pass to build_chunked_corpus. Let's build a corpus with 0.1% of the dataset, 1 time, with the corpus divided into 10 chunks.
parameters = {
    "sample_frac": 0.001,
    "num_epochs": 1,
    "num_chunks": 10
}
sample_frac, num_epochs, num_chunks = parameters.values()

##Excecute the procedure.

In [23]:
build_chunked_corpus(
    data_directory = input_dir,
    save_directory = output_dir,
    sample_frac=sample_frac,
    num_epochs=num_epochs,
    num_chunks=num_chunks
)

Loading dataset:   0%|                                                                         | 0/343 [00:00<?, ?it/s]

Loading dataset: 100%|███████████████████████████████████████████████████████████████| 343/343 [00:27<00:00, 12.30it/s]


Initiating epoch 1

Sampling 59570 tweets.
Corpus compiled and shuffled.
Generated folder for chunk001


Building subcorpus: 100%|███████████████████████████████████████████████████████| 5958/5958 [00:00<00:00, 13594.76it/s]


Generated folder for chunk002


Building subcorpus: 100%|███████████████████████████████████████████████████████| 5958/5958 [00:00<00:00, 10777.68it/s]


Generated folder for chunk003


Building subcorpus: 100%|███████████████████████████████████████████████████████| 5958/5958 [00:00<00:00, 10471.16it/s]


Generated folder for chunk004


Building subcorpus: 100%|███████████████████████████████████████████████████████| 5958/5958 [00:00<00:00, 12000.68it/s]


Generated folder for chunk005


Building subcorpus: 100%|███████████████████████████████████████████████████████| 5958/5958 [00:00<00:00, 12623.66it/s]


Generated folder for chunk006


Building subcorpus: 100%|███████████████████████████████████████████████████████| 5958/5958 [00:00<00:00, 12948.80it/s]


Generated folder for chunk007


Building subcorpus: 100%|███████████████████████████████████████████████████████| 5958/5958 [00:00<00:00, 13313.42it/s]


Generated folder for chunk008


Building subcorpus: 100%|███████████████████████████████████████████████████████| 5958/5958 [00:00<00:00, 11774.91it/s]


Generated folder for chunk009


Building subcorpus: 100%|███████████████████████████████████████████████████████| 5958/5958 [00:00<00:00, 12464.47it/s]


Generated folder for chunk010


Building subcorpus: 100%|███████████████████████████████████████████████████████| 5948/5948 [00:00<00:00, 12696.92it/s]


##Demonstrate that an error occured.

###Load the Tweet IDs.

In [24]:
def load_tweet_ids (path):
    return set(load_parquet(path)["tweet_id"])

all_ids = load_tweet_ids(path_to_tweet_ids)

print("Tweet IDs loaded.")
print("Total IDs:", len(all_ids))

Tweet IDs loaded.
Total IDs: 51116358


###Check the chunked subcorpus for tweet_ids that are absent from all_ids.

In [25]:
def get_chunks(output_dir):
    return os.listdir(output_dir)

def load_subcorpus (chunk):
    return pd.read_json(f"{output_dir}/{chunk}/subcorpus.jsonl", lines = True, encoding="utf-8")

def check_tweet_ids(subcorpus):
    output_ids = set(subcorpus["tweet_id"].astype("str").to_list())
    error_ids = output_ids.difference(all_ids)

    print()
    print("Total Tweets:", len(output_ids))
    print("Copied with error:", len(error_ids))
    print("Error Ratio: {:.2%}".format(len(error_ids)/len(all_ids)*100))
    print()
    return

chunks = get_chunks(output_dir)
for chunk in chunks:
    print(f"Checking {chunk}")
    subcorpus = load_subcorpus(chunk)
    check_tweet_ids(subcorpus)

Checking chunk001

Total Tweets: 5958
Copied with error: 3201
Error Ratio: 0.63%

Checking chunk002

Total Tweets: 5958
Copied with error: 3132
Error Ratio: 0.61%

Checking chunk003

Total Tweets: 5958
Copied with error: 3225
Error Ratio: 0.63%

Checking chunk004

Total Tweets: 5958
Copied with error: 3206
Error Ratio: 0.63%

Checking chunk005

Total Tweets: 5958
Copied with error: 3174
Error Ratio: 0.62%

Checking chunk006

Total Tweets: 5958
Copied with error: 3205
Error Ratio: 0.63%

Checking chunk007

Total Tweets: 5958
Copied with error: 3170
Error Ratio: 0.62%

Checking chunk008

Total Tweets: 5958
Copied with error: 3219
Error Ratio: 0.63%

Checking chunk009

Total Tweets: 5958
Copied with error: 3183
Error Ratio: 0.62%

Checking chunk010

Total Tweets: 5948
Copied with error: 3223
Error Ratio: 0.63%



When I run the procedure above,  I got almost 2/3 Error rate.

Rather then compiling the tweet_ids from all flagged duplicates, I compiled the text.

I suspsect that there were errors in copying over the text as well. However, as most duplicates are identical rather than near,
compiling text was still effective for filtering.

So, at the moment, this particular issue is not a high priority. However, I've got this iron this out before I can considered
publishing this code.