In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, accuracy_score
from sklearn.neural_network import MLPClassifier

In [2]:
# Read the three saved HateVal splits; any row containing a NaN is dropped on load.
train_df, val_df, test_df = (
    pd.read_csv(csv_path).dropna()
    for csv_path in (
        "./Saved_Predict_and_Proba/HateVal/prob_train_hateval.csv",
        "./Saved_Predict_and_Proba/HateVal/prob_val_hateval.csv",
        "./Saved_Predict_and_Proba/HateVal/prob_test_hateval.csv",
    )
)

# "HS" is the target column; everything except "HS" and "Unnamed: 0" is kept as a
# stacking feature.
# NOTE(review): "Unnamed: 0" is presumably the index written by an earlier
# DataFrame.to_csv call, and the remaining columns presumably hold base-model
# probabilities -- confirm against the CSV files.
labels_train = train_df["HS"]
probas_train = train_df.drop(columns=["Unnamed: 0", "HS"])

labels_val = val_df["HS"]
probas_val = val_df.drop(columns=["Unnamed: 0", "HS"])

labels_test = test_df["HS"]
probas_test = test_df.drop(columns=["Unnamed: 0", "HS"])

In [36]:
def filter_df_train_test(train_df, test_df, name, filter_first=True):
    """Select the columns matching ``name`` from two frames and return them as arrays.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to select columns from; the same pattern is applied to both.
    name : str
        Regular expression handed to ``DataFrame.filter(regex=...)``. A column is
        kept when the pattern is found anywhere in its label, so alternatives can
        be combined with ``|``.
    filter_first : bool, default True
        Accepted for call compatibility; this function does not read it.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_array, test_array)`` containing only the selected columns.
    """
    selected_train, selected_test = (
        frame.filter(regex=name, axis=1).to_numpy() for frame in (train_df, test_df)
    )
    return selected_train, selected_test

def filter_collinearity(X_train, X_test):
    """Keep every second column (indices 0, 2, 4, ...) of both 2-D arrays.

    NOTE(review): presumably each base model contributes a pair of complementary
    probability columns, so dropping one of each pair removes a redundant
    feature -- confirm against the column layout of the saved CSVs.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray
        Two-dimensional arrays indexed as ``[row, column]``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test)`` restricted to the even-indexed columns.
    """
    even_columns = slice(None, None, 2)
    return X_train[:, even_columns], X_test[:, even_columns]
# Default logistic-regression meta-classifier. The Group A cell rebinds
# `stacking` to a new instance before this one is ever fitted.
stacking = LogisticRegression()


## Stacking group A - Varying Classification methods

In [137]:
# Group A: one stacker per base classifier. Each run uses only the columns whose
# name contains that classifier's tag, keeps every second one of them, fits on the
# validation split and reports F1 on the test split.
algorithms_list = ['SVM', 'MLP', 'KNN', 'RF', 'EXTRA', 'CNN', 'LR', 'NB']
stacking = LogisticRegression(class_weight='balanced')

results_A = np.zeros(len(algorithms_list))

for slot, tag in enumerate(algorithms_list):
    X_val, X_test = filter_collinearity(
        *filter_df_train_test(probas_val, probas_test, tag)
    )
    # fit() returns the estimator itself, so fitting and predicting can be chained.
    y_pred = stacking.fit(X_val, labels_val).predict(X_test)
    results_A[slot] = f1_score(labels_test, y_pred)

In [138]:
results_A

array([0.61454545, 0.62395242, 0.56133942, 0.58724249, 0.592245  ,
       0.59079284, 0.60772922, 0.60150376])

## Stacking group B - Varying Feature Representation methods

In [135]:
# Group B: one stacker per feature representation. Each run uses every column whose
# name contains the representation's tag, fits on the validation split and reports
# F1 on the test split.
fe_list = ['CV', 'TFIDF', 'W2V', 'GLOVE', 'FAST']
stacking = LogisticRegression(class_weight='balanced')

results_B = np.zeros(len(fe_list))
for slot, representation in enumerate(fe_list):
    X_val, X_test = filter_df_train_test(probas_val, probas_test, representation)
    # fit() returns the estimator itself, so fitting and predicting can be chained.
    y_pred = stacking.fit(X_val, labels_val).predict(X_test)
    results_B[slot] = f1_score(labels_test, y_pred)

In [136]:
results_B

array([0.59364133, 0.59231905, 0.62061463, 0.60233593, 0.61457233])

## Stacking group C - All techniques

In [145]:
# Group C: stack every available column at once. Fit on the validation split and
# report F1 on the test split.
# NOTE(review): this stacker uses the default class weights, whereas Groups A and B
# use class_weight='balanced' -- confirm the difference is intentional.
# (The former `results_C = 0` placeholder was a dead store and has been removed: it
# was always overwritten below, and on failure it left an int that the summary
# cell's `.reshape` call cannot handle.)
stacking = LogisticRegression()
stacking.fit(probas_val.to_numpy(), labels_val)
y_pred = stacking.predict(probas_test.to_numpy())
results_C = f1_score(labels_test, y_pred)

In [146]:
results_C

0.5952747817154597

## Stacking group D - Proposed Selection scheme

In [147]:
# Group D: stack only the selected classifier/representation pairs. The pattern is
# an alternation, so a column is kept when its name contains any one of the pairs.
names = 'MLP-GLOVE|MLP-FAST|SVM-CV|MLP-W2V|SVM-W2V|LR-GLOVE|RF-W2V|CNN-W2V'

# Build the meta-classifier in this cell rather than reusing whichever `stacking`
# the most recently executed cell left behind. A default LogisticRegression() is
# what the preceding Group C cell provides when the notebook is run top to bottom,
# so in-order results are unchanged while out-of-order re-runs no longer silently
# pick up the class_weight='balanced' model from Groups A/B.
stacking = LogisticRegression()

X_val, X_test = filter_df_train_test(probas_val, probas_test, names)
stacking.fit(X_val, labels_val)
y_pred = stacking.predict(X_test)
results_D = f1_score(labels_test, y_pred)

In [148]:
results_D

0.6211318650683022

## Summarizing results

In [149]:
# Build one single-column frame per group so they can be concatenated into a
# single results table (rows: configuration, column: stacking model).
all_stacking_names = ['Logistic Regression']
algorithms_list_A = [alg + ' (Group A)' for alg in algorithms_list]
group_A_df = pd.DataFrame(results_A, columns=all_stacking_names, index=algorithms_list_A)

fe_list_B = [fe + ' (Group B)' for fe in fe_list]
group_B_df = pd.DataFrame(results_B, columns=all_stacking_names, index=fe_list_B)

# Groups C and D are single scores. np.reshape turns them into 1x1 arrays and
# accepts plain Python floats as well as NumPy scalars, whereas the `.reshape`
# method exists only on NumPy scalars.
group_C_df = pd.DataFrame(np.reshape(results_C, (1, 1)), columns=all_stacking_names, index=['Group C'])
group_D_df = pd.DataFrame(np.reshape(results_D, (1, 1)), columns=all_stacking_names, index=['Group D'])

In [150]:
from pprint import pprint

# Stack the per-group frames in A, B, C, D order, round to four decimals and render
# the table as LaTeX source. pprint shows the long string as one quoted fragment
# per line.
summary_table = pd.concat([group_A_df, group_B_df, group_C_df, group_D_df]).round(4)
pprint(summary_table.to_latex())

('\\begin{tabular}{lr}\n'
 '\\toprule\n'
 '{} &  Logistic Regression \\\\\n'
 '\\midrule\n'
 'SVM (Group A)   &               0.6145 \\\\\n'
 'MLP (Group A)   &               0.6240 \\\\\n'
 'KNN (Group A)   &               0.5613 \\\\\n'
 'RF (Group A)    &               0.5872 \\\\\n'
 'EXTRA (Group A) &               0.5922 \\\\\n'
 'CNN (Group A)   &               0.5908 \\\\\n'
 'LR (Group A)    &               0.6077 \\\\\n'
 'NB (Group A)    &               0.6015 \\\\\n'
 'CV (Group B)    &               0.5936 \\\\\n'
 'TFIDF (Group B) &               0.5923 \\\\\n'
 'W2V (Group B)   &               0.6206 \\\\\n'
 'GLOVE (Group B) &               0.6023 \\\\\n'
 'FAST (Group B)  &               0.6146 \\\\\n'
 'Group C         &               0.5953 \\\\\n'
 'Group D         &               0.6211 \\\\\n'
 '\\bottomrule\n'
 '\\end{tabular}\n')


## Sanity check: best single model


In [None]:
from sklearn.metrics import accuracy_score

# Baseline without stacking: take the test-split columns whose name contains
# 'LR-W2V' and predict, for each row, the position of the largest value.
# NOTE(review): this assumes the selected columns are ordered so that the column
# position equals the class label (0, 1) -- confirm against the CSV layout.
probas = probas_test.filter(regex='LR-W2V')
y_pred = probas.to_numpy().argmax(axis=1)

f1_score(labels_test, y_pred)