In [1]:
import os
import numpy as np
import json
import pandas as pd

from tqdm.notebook import tqdm


DATA_DIR = "data" # This may need to be changed on different machines

# Make sure we're in the correct directory and make sure the data directory exists
if not os.path.exists(DATA_DIR):
    os.chdir("../..") # Move up two directories because we're in src/nb and the data directory/path should be in/start at the root directory 
    assert os.path.exists(DATA_DIR), f"ERROR: DATA_DIR={DATA_DIR} not found"  # If we still can't see the data directory something is wrong

# Import library things after changing directories
from src.lib.bpe_parser import read_bpe_data, read_int_to_token, decode_bpe_to_text

# Verify the functions to manipulate BPE data seem to work

In [2]:
bpe_data_path = os.path.join(DATA_DIR, "datasets", "cds", "tweets", "dev.input0.bpe")
vocab_path = os.path.join(DATA_DIR, "vocabs", "tweets.json")

In [3]:
bpe_data = read_bpe_data(bpe_data_path)
vocab_dict = read_int_to_token(vocab_path)
decoded_data = decode_bpe_to_text(bpe_data, vocab_dict)

In [4]:
print(type(decoded_data))
print(decoded_data[0])

<class 'list'>
All Star Classic Game 1 Orlando 09 Game 1 - West Coast vs East


In [5]:
# print first 10 decoded sentences
for i in range(10):
    print(decoded_data[i])

All Star Classic Game 1 Orlando 09 Game 1 - West Coast vs East
The official morning baddie is Oleana from Pokemon Sword and Shield
Very cute and also very, very sexy. Love your Heels too.
I drive all night to keep her warm
pastorobeds jesusdaily love.quotes ibphialeah #pray #prayer #f #hope #biblia #life
To all the girls out there who are genuinely nice & not bitches, ily
the only Disney princess
What do you mean our faves child rapists, Kemosabe?
Ffs Tommy, stop tryna make man cry
Sunny Side Up please... with extra Irene.


# Generate csv files for each category in the cds data

In [6]:
cds_dir = os.path.join(DATA_DIR, "datasets", "cds")
cds_types = [dir_name for dir_name in os.listdir(cds_dir) if os.path.isdir(os.path.join(cds_dir, dir_name))]
cds_types

['joyce',
 'poetry',
 'coha_1810',
 'tweets',
 'coha_1890',
 'coha_1990',
 'switchboard',
 'lyrics',
 'bible',
 'shakespeare',
 'aae']

In [8]:
# use this to specify which files we want to decode
filter_files = lambda file_name: file_name.endswith(".bpe") and file_name.startswith("train")

for cds_type in tqdm(cds_types):
    cds_path = os.path.join(cds_dir, cds_type)
    cds_files = [file for file in os.listdir(cds_path) if filter_files(file)]
    for cds_file in cds_files:
        cds_file_path = os.path.join(cds_path, cds_file)
        cds_data = read_bpe_data(cds_file_path)

        vocab_path = os.path.join(DATA_DIR, "vocabs", f"{cds_type}.json")
        vocab_dict = read_int_to_token(vocab_path)

        cds_data_decoded = decode_bpe_to_text(cds_data, vocab_dict)
        
        df = pd.DataFrame({
            "text": cds_data_decoded,
            "label": [cds_type] * len(cds_data_decoded)
        })
        # save to csv cds_path/cds_file.csv
        base_name = os.path.splitext(cds_file)[0]
        df.to_csv(os.path.join(cds_path, base_name + ".csv"), index=False)
        

  0%|          | 0/11 [00:00<?, ?it/s]

# Concatenate all the csv files into one big csv file

In [12]:
prefix = "dev" # Valid values: dev, train, test
filter_files = lambda file_name: file_name.startswith(prefix) and file_name.endswith(".csv")

df = None

for cds_type in tqdm(cds_types):
    cds_path = os.path.join(cds_dir, cds_type)
    csv_files = [file for file in os.listdir(cds_path) if filter_files(file)]
    assert len(csv_files) == 2, f"ERROR: Expected 2 csv files for {cds_type}, found {len(csv_files)}"

    input_file_name = [file for file in csv_files if ".input" in file][0]
    paraphrase_file_name = [file for file in csv_files if "paraphrase" in file][0]

    input_df = pd.read_csv(os.path.join(cds_path, input_file_name))

    paraphrase_df = pd.read_csv(os.path.join(cds_path, paraphrase_file_name))

    input_df["paraphrase"] = paraphrase_df["text"]

    # oder columns label, text, paraphrase
    input_df = input_df[["label", "text", "paraphrase"]]

    if df is None:
        df = input_df
    else:
        df = pd.concat([df, input_df])

# save df in cds_path/all_data.csv
df.to_csv(os.path.join(cds_dir, f"{prefix}.cds.csv"), index=False)
print(len(df))
df.head()

  0%|          | 0/11 [00:00<?, ?it/s]

393727


Unnamed: 0,label,text,paraphrase
0,joyce,in short circuit.,"in short, the circuit was broken."
1,joyce,Botlettle I thought sheâĢĻd act that loa.,I thought she would have done it.
2,joyce,Round him peered Lenehan.,he looked at Lenehan round.
3,joyce,Do you pay rent for this tower?,you're paying rent for the tower?
4,joyce,HeâĢĻs from beyant Boyne water.,he's from the water in the Boyne.
