In [81]:
# Import packages
import pandas as pd
import os

In [84]:
# Resolve the data directory relative to the current working directory.
DATA_DIR = "data"  # sub-directory name, joined onto the working directory below
WORKING_DIR = os.getcwd()
full_path = os.path.join(WORKING_DIR, DATA_DIR)
# Echo the resolved location so a wrong working directory is easy to spot.
print(full_path)

/Users/melinaplakidis/Documents/Uni/HA-DeepLearning/data


In [85]:
# Load the four train/test CSV files from the data directory.
train, test, binary_train, binary_test = (
    pd.read_csv(os.path.join(full_path, csv_name))
    for csv_name in (
        "train_dataset.csv",
        "test_dataset.csv",
        "binary_train_dataset.csv",
        "binary_test_dataset.csv",
    )
)

# Label name -> integer id, one mapping per label scheme.
labels_to_sa_ids = {'ASSERTIVE': 0, 'COMOTH': 1, 'DIRECTIVE': 2, 'EXPRESSIVE': 3, 'UNSURE': 4}
labels_to_binary_ids = {'offensive': 0, 'other': 1}
labels_to_hs_ids = {'abuse': 0, 'explicit': 1, 'implicit': 2, 'insult': 3, 'other': 4, 'profanity': 5}

# Integer id -> label name: the inverse of each mapping above.
ids_to_sa_labels = {label_id: label for label, label_id in labels_to_sa_ids.items()}
ids_to_hs_labels = {label_id: label for label, label_id in labels_to_hs_ids.items()}
ids_to_binary_labels = {label_id: label for label, label_id in labels_to_binary_ids.items()}

In [69]:
# Define methods to get frequencies

def count_hs_labels(df, binary=True):
    """Count how often each label of the selected scheme occurs in ``df["labels"]``.

    Args:
        df: DataFrame with a ``labels`` column holding label ids.
        binary: If truthy, ids are translated to names with
            ``ids_to_binary_labels``; otherwise with ``ids_to_hs_labels``.

    Returns:
        dict mapping every label name of the selected scheme to its count,
        in ascending id order. Labels that never occur in ``df`` are
        reported with a count of 0.

    Raises:
        KeyError: if ``df["labels"]`` contains an id that the selected
            scheme does not define (e.g. ``binary`` does not match the data).
    """
    id_to_label = ids_to_binary_labels if binary else ids_to_hs_labels
    # value_counts() is ordered by frequency and only lists ids that actually
    # occur, so counts are looked up per id rather than assuming that the ids
    # 0..k-1 are all present.
    observed = df["labels"].value_counts().to_dict()
    unknown = sorted(set(observed) - set(id_to_label))
    if unknown:
        raise KeyError(f"label ids not defined for this scheme: {unknown}")
    return {
        label: int(observed.get(label_id, 0))
        for label_id, label in sorted(id_to_label.items())
    }

def count_sa_labels(df):
    """Tally the speech-act labels embedded in ``df["texts"]``.

    Each text is split on the literal ``[SEP]`` marker; every segment after
    the first one that exactly matches a key of ``labels_to_sa_ids`` is
    counted.

    Returns:
        dict mapping label -> number of occurrences. Only labels that occur
        at least once are present, in order of first occurrence.
    """
    tally = {}
    for entry in df["texts"].tolist():
        # The segment before the first "[SEP]" is never inspected.
        for segment in entry.split("[SEP]")[1:]:
            if segment not in labels_to_sa_ids:
                continue
            tally[segment] = tally.get(segment, 0) + 1
    return tally

In [78]:
# Create two dataframes for hs labels and sa labels 

def create_table(train, test, binary=True):
    """Build the label-frequency tables for the train and test splits.

    Args:
        train: Training split; passed to ``count_hs_labels`` and
            ``count_sa_labels``.
        test: Test split; passed to the same two functions.
        binary: Forwarded to ``count_hs_labels`` to select the label scheme.

    Returns:
        Tuple ``(hs_df, sa_df)``. ``hs_df`` has the columns
        ``Offensiveness``, ``Test`` and ``Train``; ``sa_df`` has the columns
        ``Speech Acts``, ``Test`` and ``Train``. Each row holds one label
        with its count in either split; a label missing from one split gets
        a count of 0 there.
    """
    def _frame(label_column, test_counts, train_counts):
        # Align both splits by label name. The two count dicts may list
        # their labels in different orders (or hold different labels), so
        # pairing their values by position would put counts on wrong rows.
        # Rows follow the test split's order; train-only labels come last.
        labels = list(test_counts)
        labels += [label for label in train_counts if label not in test_counts]
        return pd.DataFrame({
            label_column: labels,
            "Test": [test_counts.get(label, 0) for label in labels],
            "Train": [train_counts.get(label, 0) for label in labels],
        })

    # Count each split once instead of once per column.
    hs_df = _frame(
        "Offensiveness",
        count_hs_labels(test, binary=binary),
        count_hs_labels(train, binary=binary),
    )
    sa_df = _frame("Speech Acts", count_sa_labels(test), count_sa_labels(train))
    return hs_df, sa_df

In [79]:
# Build both frequency tables for the non-binary label scheme and print
# each one as LaTeX source without the index column.
hs_df, sa_df = create_table(train=train, test=test, binary=False)
for latex_table_df in (hs_df, sa_df):
    print(latex_table_df.to_latex(index=False))

\begin{tabular}{lrr}
\toprule
Offensiveness &  Test &  Train \\
\midrule
        abuse &    20 &     80 \\
     explicit &    20 &     80 \\
     implicit &    20 &     80 \\
       insult &    20 &     80 \\
        other &    20 &     80 \\
    profanity &    20 &     80 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrr}
\toprule
Speech Acts &  Test &  Train \\
\midrule
  DIRECTIVE &   123 &    507 \\
     UNSURE &    17 &    133 \\
 EXPRESSIVE &    82 &     71 \\
  ASSERTIVE &   144 &    520 \\
     COMOTH &    17 &    310 \\
\bottomrule
\end{tabular}



  print(hs_df.to_latex(index=False))
  print(sa_df.to_latex(index=False))
