In [1]:
import os
import numpy as np
import pandas as pd


# Location of the data folder, relative to the project root.
# This may need to be changed on different machines.
DATA_DIR = "data"

# The notebook is expected to live in src/nb while DATA_DIR is resolved from the
# root directory, so when the folder is not visible from the current working
# directory we climb two levels and look again.
_data_dir_found = os.path.exists(DATA_DIR)
if not _data_dir_found:
    os.chdir("../..")
    # Still not visible after moving up: the layout is not what we expect.
    assert os.path.exists(DATA_DIR), f"ERROR: DATA_DIR={DATA_DIR} not found"

# Import library things after changing directories
from src.lib.bpe_parser import read_bpe_data, read_int_to_token, decode_bpe_to_text

In [2]:
# Build the path to the CDS test split step by step, then load it with pandas.
cds_dir = os.path.join(DATA_DIR, "datasets", "cds")
data_file = os.path.join(cds_dir, "test.cds.csv")

df = pd.read_csv(data_file)
# Bare last expression so the notebook renders a preview of the loaded frame.
df

Unnamed: 0,label,text,paraphrase
0,joyce,"a mimograph at a time, numan bitter, with his ...","a mimograph, a numan bitter, with a false step..."
1,joyce,Swimming in my hindmoist.,I'm swimming in my butt.
2,joyce,selfprolonging tension of the thing proposed t...,the self-prolonging tension and self-abbreviat...
3,joyce,cod Im always getting enough for 3 forgetting ...,"I'm always getting enough for three of them, b..."
4,joyce,So I saw there was going to be a bit of a dust.,I've seen a little bit of dust.
...,...,...,...
393743,aae,yea the nigga from Romeo must die...he checked...,we had to kill Romeo when we were 10.
393744,aae,"okay now if he charge more that's on you, i wa...","okay, if he's going to charge more, I want to ..."
393745,aae,People out here trying to live double lives. K...,people who try to live a double life.
393746,aae,Even though me & her son are no longer an item...,even though we don't have anything to do with ...


In [3]:
# Count the occurrences of each unique label in df["label"] and keep the
# smallest count, i.e. the size of the rarest class.
label_counts = df["label"].value_counts()
smallest_class_size = label_counts.min()
smallest_class_size

1293

In [4]:
# Fixed seed so that Restart Kernel -> Run All reproduces the same balanced split.
RANDOM_SEED = 42

# Sample smallest_class_size random rows from each label.
# GroupBy.sample draws per group directly and keeps the "label" column, so there
# is no MultiIndex to remove and no reliance on groupby().apply() operating on
# the grouping column (deprecated in recent pandas releases).
balanced_df = df.groupby("label").sample(n=smallest_class_size, random_state=RANDOM_SEED)
# Shuffle the dataframe so the classes are interleaved, then replace the original
# row labels with a fresh 0..N-1 index.
balanced_df = balanced_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Invariant: every label present in the result has exactly smallest_class_size rows.
assert (balanced_df["label"].value_counts() == smallest_class_size).all(), "ERROR: Sampled classes are not the same size as the smallest class"
balanced_df

Unnamed: 0,label,text,paraphrase
0,coha_1890,The imprisonment of Grotius was not the worst ...,Grotius was not the worst of all.
1,poetry,The unfettered sun takes his unbounded reign,the unfettered sun is free to reign
2,aae,lol srry fun question but keep your head up do...,"excuse me, but I'm sorry, but I'm sorry, but I'm"
3,coha_1890,"I tried to speak, but could not . ""","I'm trying to talk, but I can't."
4,poetry,"Dancing upon the waves, as if to please","dancing on the waves, as if they were happy"
...,...,...,...
14218,coha_1810,"A high railing ran, rough and irregular, along...","just as we were, the high railing was rough an..."
14219,shakespeare,"Mercy but murders, pardoning those that kill.","mercy, mercy, mercy, mercy."
14220,coha_1810,The house at which I proposed to stop was upwa...,the house I'd like to stop is a mile away.
14221,coha_1990,"The only thing is, Grandma's going a bit batty.","the only thing is, Grandma's a little crazy."


In [5]:
# Save the balanced dataframe.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists first (a no-op when it is already there).
balanced_dir = os.path.join(DATA_DIR, "decoded_cds", "balanced")
os.makedirs(balanced_dir, exist_ok=True)

# NOTE(review): to_csv keeps its default index=True here, so the row index is
# written as an unnamed first column; pass index=False if readers of this file
# do not expect that column. Confirm against the consumers before changing.
balanced_df.to_csv(os.path.join(balanced_dir, "test.csv"))