In [1]:
# Import packages
import pandas as pd
import os
import collections

In [2]:
# Resolve the data folder relative to the directory the notebook is run from
DATA_DIR = "data"          # name of the folder that holds the CSV files
WORKING_DIR = os.getcwd()  # directory the kernel was started in
full_path = os.path.join(WORKING_DIR, DATA_DIR)
# Show the resolved location so a wrong working directory is noticed early
print(full_path)

/Users/melinaplakidis/Documents/Uni/HA-DeepLearning/data


In [3]:
# Load the train and test splits (comment out the datasets that are not needed)
def _load_csv(filename):
    """Read one CSV file from the data folder (`full_path`) into a DataFrame."""
    return pd.read_csv(os.path.join(full_path, filename))

# Splits with fine-grained (multi-class) labels
multi_train = _load_csv("multi_train_v1_dataset.csv")
multi_test = _load_csv("multi_test_v1_dataset.csv")
# Splits with binary labels
binary_train = _load_csv("binary_train_v1_dataset.csv")
binary_test = _load_csv("binary_test_v1_dataset.csv")

In [None]:
# Label <-> id mappings (comment out the ones that are not needed).
# The label -> id tables are written out by hand; each id -> label table is
# derived by inverting its counterpart so the two directions cannot diverge.
labels_to_sa_ids = {'ASSERTIVE': 0, 'COMOTH': 1, 'DIRECTIVE': 2, 'EXPRESSIVE': 3, 'UNSURE': 4}
labels_to_binary_ids = {'offensive': 0, 'other': 1}
labels_to_hs_ids = {'abuse': 0, 'explicit': 1, 'implicit': 2, 'insult': 3, 'other': 4, 'profanity': 5}
ids_to_sa_labels = {label_id: label for label, label_id in labels_to_sa_ids.items()}
ids_to_hs_labels = {label_id: label for label, label_id in labels_to_hs_ids.items()}
ids_to_binary_labels = {label_id: label for label, label_id in labels_to_binary_ids.items()}

In [4]:
# Methods to get frequencies

# Count hate speech labels
def count_hs_labels(df, binary=True): # if fine-grained labels are evaluated, set binary=False
    """Count how often each hate speech label occurs in ``df["labels"]``.

    Label ids are translated to label names with ``ids_to_binary_labels``
    (``binary=True``) or ``ids_to_hs_labels`` (``binary=False``).

    Args:
        df: DataFrame with a ``"labels"`` column.
            NOTE(review): assumes the column holds the integer ids used as
            keys of the id -> label mappings -- confirm against the CSV files.
        binary: Select the binary mapping (default) or the fine-grained one.

    Returns:
        dict mapping every label name of the selected mapping to its count,
        in mapping order. Labels that do not occur in ``df`` get a count of 0,
        so the results for two different splits always line up.

    Raises:
        KeyError: If ``df["labels"]`` contains an id that is not part of the
            selected mapping (e.g. fine-grained data evaluated with
            ``binary=True``).
    """
    id_to_label = ids_to_binary_labels if binary else ids_to_hs_labels
    label_counts = df["labels"].value_counts()

    # Fail loudly instead of silently dropping ids the mapping does not know.
    unknown_ids = [label_id for label_id in label_counts.index if label_id not in id_to_label]
    if unknown_ids:
        raise KeyError(f"label ids {unknown_ids} are not part of the selected label mapping")

    # Walk the mapping rather than range(len(label_counts)): the ids present in
    # the data need not be the contiguous range 0..k-1 when a class is missing.
    return {
        label: int(label_counts.get(label_id, 0))
        for label_id, label in id_to_label.items()
    }

# Count speech act labels
def count_sa_labels(df):
    """Count the speech act labels embedded in ``df["texts"]``.

    Each text is split on the literal ``"[SEP]"``. The first segment is
    skipped; every later segment that is exactly a key of ``labels_to_sa_ids``
    is counted, all other segments are ignored.

    Returns:
        collections.OrderedDict mapping each label that occurred at least once
        to its count, sorted by label name.
    """
    tally = collections.Counter(
        segment
        for text in df["texts"].tolist()
        for segment in text.split("[SEP]")[1:]  # segment 0 is never counted
        if segment in labels_to_sa_ids
    )
    return collections.OrderedDict(sorted(tally.items()))


In [5]:
# Methods to create final tables

# Create two dataframes for hate speech and speech act labels
def create_table(train, test, binary=True): # if fine-grained labels are evaluated, set binary=False
    """Build the absolute-frequency tables for hate speech and speech act labels.

    Args:
        train: Training split, passed to ``count_hs_labels`` and
            ``count_sa_labels``.
        test: Test split, passed to the same two functions.
        binary: Forwarded to ``count_hs_labels`` to select the binary or the
            fine-grained hate speech labels.

    Returns:
        Tuple ``(hs_df, sa_df)``:
        * ``hs_df`` has the columns ``"Offensiveness"``, ``"Test"``, ``"Train"``
          with one row per hate speech label found in either split.
        * ``sa_df`` has the columns ``"Speech Acts"``, ``"Test"``, ``"Train"``
          with one row per key of ``labels_to_sa_ids``.
        A label that is missing from one split is reported with a count of 0.
    """
    # Count each split once and reuse the result for every column.
    hs_test = count_hs_labels(test, binary=binary)
    hs_train = count_hs_labels(train, binary=binary)
    sa_test = count_sa_labels(test)
    sa_train = count_sa_labels(train)

    # Align both splits by label name instead of relying on the two count
    # dicts having the same keys in the same order.
    hs_labels = list(hs_test) + [label for label in hs_train if label not in hs_test]
    hs_df = pd.DataFrame({
        "Offensiveness": hs_labels,
        "Test": [hs_test.get(label, 0) for label in hs_labels],
        "Train": [hs_train.get(label, 0) for label in hs_labels],
    })

    sa_labels = list(labels_to_sa_ids.keys())
    sa_df = pd.DataFrame({
        "Speech Acts": sa_labels,
        "Test": [sa_test.get(label, 0) for label in sa_labels],
        "Train": [sa_train.get(label, 0) for label in sa_labels],
    })
    return hs_df, sa_df

# Add relative frequencies to dataframe
def add_relative_frequencies(df):
    """Add each row's train/test share (in percent) to a frequency table.

    For every row the two new values are that split's count divided by
    ``Train + Test`` of the same row, times 100.

    Args:
        df: DataFrame with numeric ``"Train"`` and ``"Test"`` columns.

    Returns:
        The same DataFrame object, modified in place, with the additional
        columns ``"Rel_Test"`` and ``"Rel_Train"``. Rows whose ``Train`` and
        ``Test`` counts are both 0 get NaN in both new columns instead of
        raising ``ZeroDivisionError``.
    """
    total = df["Train"] + df["Test"]
    # Mask zero totals with NaN so an empty class cannot trigger a division by zero.
    nonzero_total = total.where(total != 0)
    df["Rel_Test"] = (df["Test"] / nonzero_total) * 100
    df["Rel_Train"] = (df["Train"] / nonzero_total) * 100
    return df

In [6]:
# Build the hate speech and speech act frequency tables for the fine-grained
# dataset and extend each one with its relative frequencies
hs_df, sa_df = (
    add_relative_frequencies(table)
    for table in create_table(multi_train, multi_test, binary=False)
)
# Print both tables as LaTeX, hate speech first
for table in (hs_df, sa_df):
    print(table.to_latex(index=False))

\begin{tabular}{lrrrr}
\toprule
Offensiveness &  Test &  Train &  Rel\_Test &  Rel\_Train \\
\midrule
        abuse &    20 &     80 &      20.0 &       80.0 \\
     explicit &    20 &     80 &      20.0 &       80.0 \\
     implicit &    20 &     80 &      20.0 &       80.0 \\
       insult &    20 &     80 &      20.0 &       80.0 \\
        other &    20 &     80 &      20.0 &       80.0 \\
    profanity &    20 &     80 &      20.0 &       80.0 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrrrr}
\toprule
Speech Acts &  Test &  Train &  Rel\_Test &  Rel\_Train \\
\midrule
  ASSERTIVE &   144 &    520 & 21.686747 &  78.313253 \\
     COMOTH &    17 &     71 & 19.318182 &  80.681818 \\
  DIRECTIVE &   123 &    507 & 19.523810 &  80.476190 \\
 EXPRESSIVE &    82 &    310 & 20.918367 &  79.081633 \\
     UNSURE &    17 &    133 & 11.333333 &  88.666667 \\
\bottomrule
\end{tabular}



  print(hs_df.to_latex(index=False))
  print(sa_df.to_latex(index=False))
