In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, accuracy_score

In [3]:
def load_predictions(dataset_name, partition):
    path = './Saved Predict and Proba/' + dataset_name.upper() + '/prob_' + partition + '_' + dataset_name.lower() + '.csv'
    table_pred = pd.read_csv(path)
    if dataset_name == 'zw':
        label = table_pred['norm']
        methods = table_pred.drop('norm', axis=1)
    else: 
        label = table_pred['class']
        methods = table_pred.drop('class', axis=1)
    methods = methods.drop('Unnamed: 0', axis=1)
    return label, methods

def load_dataset(dataset_name):
    label_train, probas_train = load_predictions(dataset_name, 'train')
    label_val, probas_val = load_predictions(dataset_name, 'val')
    label_test, probas_test = load_predictions(dataset_name, 'test')
    return label_train, probas_train, label_test, probas_test, label_val, probas_val

def filter_df_train_test(train_df, test_df, name, filter_first=True):
    train = train_df.filter(regex=name, axis=1)
    test = test_df.filter(regex=name, axis=1)
    return train, test

# Get dataset

In [4]:
dataset_name = 'zw'
_, _, label_test, probas_test, label_val, probas_val = load_dataset(dataset_name)
all_stacking = [LogisticRegressionCV(class_weight='balanced', cv=10, scoring='f1_macro', n_jobs=5)]
all_stacking_names = ['Stacking LR']

names_B = {}
names_B['zw'] = 'CNN-W2V|MLP-CV|SVM-TFIDF|KNN-GLOVE|LR-FAST|MLP-GLOVE|SVM-CV|CNN-CV|KNN-CV|MLP-FAST|RF-W2V|CNN-FAST|RF-FAST'
names_B['td'] = 'MLP-FAST|KNN-GLOVE|KNN-FAST|KNN-CV|MLP-W2V|NB-CV|CNN-TFIDF|NB-FAST|RF-CV|LR-CV|CNN-CV|KNN-W2V|CNN-GLOVE'
names_B ['td_zw']= 'EXTRA-CV|CNN-FAST|MLP-W2V|LR-CV|MLP-FAST|NB-CV|CNN-CV|LR-GLOVE|SVM-GLOVE|RF-FAST|RF-TFIDF|SVM-CV|KNN-CV'

## Stacking group A

In [1]:
algorithms_list = ['SVM', 'MLP', 'KNN', 'RF', 'EXTRA', 'CNN', 'LR', 'NB']
results_A = np.zeros((len(algorithms_list), len(all_stacking)))
for idx_alg, algorithm in enumerate(algorithms_list):
    X_val, X_test = filter_df_train_test(probas_val, probas_test, algorithm)
    for idx_clf, clf in enumerate(all_stacking):
        clf.fit(X_val, label_val)
        y_pred = clf.predict(X_test)
        score = f1_score(label_test, y_pred, average='macro')
        results_A[idx_alg, idx_clf] = score

NameError: name 'np' is not defined

# Stacking group E

In [6]:
fe_list = ['CV', 'TFIDF', 'W2V', 'GLOVE', 'FAST']
results_E = np.zeros((len(fe_list), len(all_stacking)))
for idx_alg, fe in enumerate(fe_list):
    X_val, X_test = filter_df_train_test(probas_val, probas_test, fe)
    for idx_clf, clf in enumerate(all_stacking):
        clf.fit(X_val, label_val)
        y_pred = clf.predict(X_test)
        results_E[idx_alg, idx_clf] = f1_score(label_test, y_pred, average='macro')

# Stacking group D

In [7]:
results_D = np.zeros(len(all_stacking))
for idx_clf, clf in enumerate(all_stacking):
    clf.fit(probas_val, label_val)
    y_pred = clf.predict(probas_test)
    results_D[idx_clf] = f1_score(label_test, y_pred, average='macro')
print(results_D[idx_clf])

0.7862534534627416


# Stacking group B

In [8]:
names = names_B[dataset_name]
results_B = np.zeros(len(all_stacking))
X_val, X_test = filter_df_train_test(probas_val, probas_test, names)
for idx_clf, clf in enumerate(all_stacking):
    clf.fit(X_val, label_val)
    y_pred = clf.predict(X_test)
    results_B[idx_clf] = f1_score(label_test, y_pred, average='macro')
print(results_B[idx_clf])

0.790878623420137


In [9]:
algorithms_list_A = [alg + ' (Group A)' for alg in algorithms_list]
group_A_df = pd.DataFrame(results_A, columns=all_stacking_names, index=algorithms_list_A)

fe_list_E = [fe + ' (Group E)' for fe in fe_list]
group_E_df = pd.DataFrame(results_E, columns=all_stacking_names, index=fe_list_E)

group_B_df = pd.DataFrame(results_B.reshape(1, 1), columns=all_stacking_names, index=['Group B'])
group_D_df = pd.DataFrame(results_D.reshape(1, 1), columns=all_stacking_names, index=['Group D'])

In [10]:
from pprint import pprint
pprint(pd.concat([group_A_df, group_B_df, group_D_df, group_E_df]).round(4).to_latex(caption='dataset '+ dataset_name.upper()))

('\\begin{table}\n'
 '\\centering\n'
 '\\caption{dataset ZW}\n'
 '\\begin{tabular}{lr}\n'
 '\\toprule\n'
 '{} &  Stacking LR \\\\\n'
 '\\midrule\n'
 'SVM (Group A)   &       0.7923 \\\\\n'
 'MLP (Group A)   &       0.7755 \\\\\n'
 'KNN (Group A)   &       0.6803 \\\\\n'
 'RF (Group A)    &       0.7664 \\\\\n'
 'EXTRA (Group A) &       0.7652 \\\\\n'
 'CNN (Group A)   &       0.7557 \\\\\n'
 'LR (Group A)    &       0.7790 \\\\\n'
 'NB (Group A)    &       0.7456 \\\\\n'
 'Group B         &       0.7909 \\\\\n'
 'Group D         &       0.7863 \\\\\n'
 'CV (Group E)    &       0.7807 \\\\\n'
 'TFIDF (Group E) &       0.7799 \\\\\n'
 'W2V (Group E)   &       0.7337 \\\\\n'
 'GLOVE (Group E) &       0.7208 \\\\\n'
 'FAST (Group E)  &       0.7429 \\\\\n'
 '\\bottomrule\n'
 '\\end{tabular}\n'
 '\\end{table}\n')


# best single for sanity check

In [11]:
from sklearn.metrics import accuracy_score

probas = probas_test.filter(regex='SVM-TF')
y_pred = np.argmax(probas.to_numpy(), axis=1)

f1_score(label_test, y_pred, average='macro')

0.7555745522584977