In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, accuracy_score
from sklearn.neural_network import MLPClassifier

In [2]:
val_df = pd.read_csv("./Saved_Predict_and_Proba/HateVal/prob_val_hateval.csv").dropna()
test_df = pd.read_csv("./Saved_Predict_and_Proba/HateVal/prob_test_hateval.csv").dropna()
probas_val, labels_val = val_df.drop(columns=["Unnamed: 0", "HS"]), val_df["HS"]
probas_test, labels_test = test_df.drop(columns=["Unnamed: 0", "HS"]), test_df["HS"]

In [3]:
def filter_df_train_test(train_df, test_df, name, filter_first=True):
    train = train_df.filter(regex=name, axis=1)
    test = test_df.filter(regex=name, axis=1)
    return train.to_numpy(), test.to_numpy()

def filter_collinearity(X_train, X_test):
    X_train = X_train[:, ::2]
    X_test = X_test[:, ::2]
    return X_train, X_test
stacking = LogisticRegression()

## Stacking group A - Varying Classification methods

In [4]:
algorithms_list = ['SVM', 'MLP', 'KNN', 'RF', 'EXTRA', 'CNN', 'LR', 'NB']
results_A = np.zeros(len(algorithms_list))

for idx_alg, algorithm in enumerate(algorithms_list):
    X_val, X_test = filter_df_train_test(probas_val, probas_test, algorithm)
    X_val, X_test = filter_collinearity(X_val, X_test)
    stacking.fit(X_val, labels_val)
    y_pred = stacking.predict(X_test)
    score = f1_score(labels_test, y_pred)
    results_A[idx_alg] = score

# Stacking group B - Varying Feature Representation methods

In [6]:
stacking = MLPClassifier(random_state=42, batch_size=128, verbose=100, activation='relu', solver='lbfgs', max_iter=1000)

fe_list = ['CV', 'TFIDF', 'W2V', 'GLOVE', 'FAST']
results_B = np.zeros(len(fe_list))
for idx_alg, fe in enumerate(fe_list):
    X_val, X_test = filter_df_train_test(probas_val, probas_test, fe)
    stacking.fit(X_val, labels_val)
    y_pred = stacking.predict(X_test)
    results_B[idx_alg] = f1_score(labels_test, y_pred)

# Stacking group C -  ALL techniques

In [8]:
results_C = 0
stacking.fit(probas_val.to_numpy(), labels_val)
y_pred = stacking.predict(probas_test.to_numpy())
results_C = f1_score(labels_test, y_pred)

# Stacking group D - Proposed Selection scheme

In [10]:
names = 'CNN-CV|CNN-FAST|RF-GLOVE|NB-GLOVE|SVM-W2V|MLP-GLOVE|CNN-W2V|SVM-FAST'

X_val, X_test = filter_df_train_test(probas_val, probas_test, names)
X_val, X_test = filter_collinearity(X_val, X_test)

stacking.fit(X_val, labels_val)
y_pred = stacking.predict(X_test)
results_D = f1_score(labels_test, y_pred)

## Summarizing results

In [12]:
algorithms_list_A = [alg + ' (Group A)' for alg in algorithms_list]
group_A_df = pd.DataFrame(results_A, columns=all_stacking_names, index=algorithms_list_A)

fe_list_B = [fe + ' (Group B)' for fe in fe_list]
group_B_df = pd.DataFrame(results_B, columns=all_stacking_names, index=fe_list_B)

group_B_df = pd.DataFrame(results_B.reshape(1, 1), columns=all_stacking_names, index=['Group B'])
group_C_df = pd.DataFrame(results_D.reshape(1, 1), columns=all_stacking_names, index=['Group C'])
group_D_df = pd.DataFrame(results_D.reshape(1, 1), columns=all_stacking_names, index=['Group D'])

NameError: name 'all_stacking_names' is not defined

In [None]:
from pprint import pprint
pprint(pd.concat([group_A_df, group_B_df, group_C_df, group_D_df]).round(4).to_latex(caption='dataset '+ dataset_name.upper()))

# best single for sanity check

failed

In [None]:
from sklearn.metrics import accuracy_score

probas = probas_test.filter(regex='LR-W2V')
y_pred = np.argmax(probas.to_numpy(), axis=1)

f1_score(labels_test, y_pred, average='macro')