In [1]:
import os
import numpy as np
import json
import pandas as pd

from tqdm.notebook import tqdm


DATA_DIR = "data" # This may need to be changed on different machines

# Make sure we're in the correct directory and make sure the data directory exists
if not os.path.exists(DATA_DIR):
    os.chdir("../..") # Move up two directories because we're in src/nb and the data directory/path should be in/start at the root directory 
    assert os.path.exists(DATA_DIR), f"ERROR: DATA_DIR={DATA_DIR} not found"  # If we still can't see the data directory something is wrong

# Import library things after changing directories
from src.lib.bpe_parser import read_bpe_data, read_int_to_token, decode_bpe_to_text

In [2]:
bpe_data_path = os.path.join(DATA_DIR, "datasets", "cds", "tweets", "dev.input0.bpe")
vocab_path = os.path.join(DATA_DIR, "vocabs", "tweets.json")

In [3]:
bpe_data = read_bpe_data(bpe_data_path)
vocab_dict = read_int_to_token(vocab_path)
decoded_data = decode_bpe_to_text(bpe_data, vocab_dict)

In [4]:
# print first 10 decoded sentences
for i in range(10):
    print(decoded_data[i])

All Star Classic Game 1 Orlando 09 Game 1 - West Coast vs East
The official morning baddie is Oleana from Pokemon Sword and Shield
Very cute and also very, very sexy. Love your Heels too.
I drive all night to keep her warm
pastorobeds jesusdaily love.quotes ibphialeah #pray #prayer #f #hope #biblia #life
To all the girls out there who are genuinely nice & not bitches, ily
the only Disney princess
What do you mean our faves child rapists, Kemosabe?
Ffs Tommy, stop tryna make man cry
Sunny Side Up please... with extra Irene.


In [5]:
cds_dir = os.path.join(DATA_DIR, "datasets", "cds")
cds_types = os.listdir(cds_dir)
cds_types

['joyce',
 'poetry',
 'coha_1810',
 'tweets',
 'coha_1890',
 'coha_1990',
 'switchboard',
 'lyrics',
 'bible',
 'shakespeare',
 'aae']

In [6]:
# use this to specify which files we want to decode
filter_files = lambda file_name: file_name.endswith(".bpe") and file_name.startswith("dev")

for cds_type in tqdm(cds_types):
    cds_path = os.path.join(cds_dir, cds_type)
    cds_files = [file for file in os.listdir(cds_path) if filter_files(file)]
    for cds_file in cds_files:
        cds_file_path = os.path.join(cds_path, cds_file)
        cds_data = read_bpe_data(cds_file_path)
        cds_data_decoded = decode_bpe_to_text(cds_data, vocab_dict)
        
        df = pd.DataFrame({
            "text": cds_data_decoded,
            "label": [cds_type] * len(cds_data_decoded)
        })
        # save to csv cds_path/cds_file.csv
        base_name = os.path.splitext(cds_file)[0]
        df.to_csv(os.path.join(cds_path, base_name + ".csv"), index=False)
        

  0%|          | 0/11 [00:00<?, ?it/s]

In [15]:
filter_files = lambda file_name: file_name.startswith("dev.input0") and file_name.endswith(".csv")

df = None

for cds_type in tqdm(cds_types):
    cds_path = os.path.join(cds_dir, cds_type)
    csv_files = [file for file in os.listdir(cds_path) if filter_files(file)]
    for csv_file in csv_files:
        sub_df = pd.read_csv(os.path.join(cds_path, csv_file))
        if df is None:
            df = sub_df
        else:
            df = pd.concat([df, sub_df])

# save df in cds_path/all_data.csv
df.to_csv(os.path.join(cds_dir, "dev.cds.csv"), index=False)
df.head()

  0%|          | 0/11 [00:00<?, ?it/s]

Unnamed: 0,text,label
0,I had something for you,lyrics
1,There were times we had it all,lyrics
2,"Chiefly important, however, is the fact that t...",coha_1890
3,You stay 5 minutes away though lol,tweets
4,I was on the look out for the wolf,lyrics
