In [1]:
import warnings
# Globally suppress every warning for the rest of the session so that cell
# outputs stay clean. NOTE(review): this also hides any warnings raised by the
# pandas / scikit-learn calls further down - re-enable when debugging.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, accuracy_score

In [3]:
def load_predictions(dataset_name, partition, fold_number=1):
    """Load the saved per-method probabilities for one partition of one fold.

    Reads
    ``./Saved_Predict_and_Proba/Folds/<DATASET_NAME>/F<fold_number>/prob_<partition>.csv``
    and splits the table into the ground-truth column and the remaining
    columns.

    Parameters
    ----------
    dataset_name : str
        Dataset key. It is upper-cased to build the folder name and compared
        case-insensitively to pick the label column, so ``'zw'`` and ``'ZW'``
        behave the same.
    partition : str
        Partition suffix of the CSV file name (the notebook uses ``'train'``,
        ``'val'`` and ``'test'``).
    fold_number : int, default 1
        Fold index used in the ``F<fold_number>`` folder name.

    Returns
    -------
    label : pandas.Series
        The ``'norm'`` column for the ``zw`` dataset, ``'class'`` otherwise.
    methods : pandas.DataFrame
        Every other column, with the ``'Unnamed: 0'`` column removed when
        it is present.

    Raises
    ------
    FileNotFoundError
        If the CSV file does not exist.
    KeyError
        If the expected label column is missing from the file.
    """
    path = (
        './Saved_Predict_and_Proba/Folds/' + dataset_name.upper()
        + '/F' + str(fold_number) + '/prob_' + partition + '.csv'
    )
    table_pred = pd.read_csv(path)

    # The path is built case-insensitively (via upper()), so the label-column
    # choice must be case-insensitive too; otherwise 'ZW' would locate the
    # right file and then look for the wrong column.
    label_column = 'norm' if dataset_name.lower() == 'zw' else 'class'
    label = table_pred[label_column]

    # 'Unnamed: 0' is presumably the index column left behind by to_csv();
    # errors='ignore' tolerates files that were written without it.
    methods = table_pred.drop(columns=[label_column, 'Unnamed: 0'], errors='ignore')
    return label, methods

def load_dataset(dataset_name, fold_number):
    """Load the train, validation and test partitions of one fold.

    Each partition is read with ``load_predictions``. Note the return order:
    the test pair comes BEFORE the validation pair.

    Returns
    -------
    tuple
        ``(label_train, probas_train, label_test, probas_test,
        label_val, probas_val)``.
    """
    # Read in the order train -> val -> test.
    partitions = {
        split: load_predictions(dataset_name, split, fold_number)
        for split in ('train', 'val', 'test')
    }
    # Flatten the (label, probas) pairs in the order callers unpack them.
    return partitions['train'] + partitions['test'] + partitions['val']

def filter_df_train_test(train_df, test_df, name, filter_first=True):
    """Restrict two frames to the columns whose label matches ``name``.

    ``name`` is passed to ``DataFrame.filter(regex=...)``, so it is treated as
    a regular expression searched within each column label: a plain token
    selects every column containing it, and ``'A|B'`` selects columns
    containing either alternative.

    ``filter_first`` is accepted but not used by this function.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` - the filtered versions of ``train_df`` and
        ``test_df``, in that order.
    """
    selected_train, selected_test = (
        frame.filter(regex=name, axis=1) for frame in (train_df, test_df)
    )
    return selected_train, selected_test

# Get dataset

In [4]:
# Dataset key: selects the data folder and label column in load_predictions
# and the per-fold column selections in names_D (keys there: 'zw', 'td', 'union').
dataset_name = 'zw'
# Number of folds evaluated; fold k is loaded from the folder 'F<k+1>'.
folds = 5
# Meta-classifier shared by every stacking group below. The same instance is
# re-fitted before each prediction, so no state is carried between runs.
stacking = LogisticRegression(class_weight='balanced')

# Stacking group A - Varying Classification methods

In [5]:
# Group A: one stacker per base algorithm, fed only the columns whose label
# contains that algorithm's token.
algorithms_list = ['SVM', 'MLP', 'KNN', 'RF', 'EXTRA', 'CNN', 'LR', 'NB']
# Rows follow algorithms_list, columns are folds; entries are test macro-F1.
results_A = np.zeros((len(algorithms_list), folds))

for fold_idx in range(folds):
    # load_dataset returns (train labels, train probas, test labels,
    # test probas, val labels, val probas); the train pair is not needed here.
    label_test, probas_test, label_val, probas_val = load_dataset(dataset_name, fold_idx + 1)[2:]
    for row, algorithm in enumerate(algorithms_list):
        meta_val, meta_test = filter_df_train_test(probas_val, probas_test, algorithm)
        # Fit the meta-classifier on the validation split, evaluate on test.
        stacking.fit(meta_val, label_val)
        results_A[row, fold_idx] = f1_score(
            label_test, stacking.predict(meta_test), average='macro'
        )

In [6]:
# Macro-F1 of each Group A stacker averaged over the folds
# (one value per entry of algorithms_list, in the same order).
np.mean(results_A, axis=1)

array([0.78522593, 0.75878708, 0.58487545, 0.77322025, 0.75855527,
       0.7592998 , 0.77574546, 0.71447546])

# Stacking group B - Varying Feature Representation methods

In [7]:
# Group B: one stacker per feature representation, fed only the columns whose
# label contains that representation's token.
fe_list = ['CV', 'TFIDF', 'W2V', 'GLOVE', 'FAST']
# Rows follow fe_list, columns are folds; entries are test macro-F1.
results_B = np.zeros((len(fe_list), folds))

for fold_number in range(1, folds + 1):
    _, _, label_test, probas_test, label_val, probas_val = load_dataset(dataset_name, fold_number)

    fold_scores = []
    for fe in fe_list:
        meta_val, meta_test = filter_df_train_test(probas_val, probas_test, fe)
        # Fit the meta-classifier on the validation split, evaluate on test.
        stacking.fit(meta_val, label_val)
        fold_predictions = stacking.predict(meta_test)
        fold_scores.append(f1_score(label_test, fold_predictions, average='macro'))

    # Folder numbers are 1-based, result columns 0-based.
    results_B[:, fold_number - 1] = fold_scores

In [8]:
# Macro-F1 of each Group B stacker averaged over the folds
# (one value per entry of fe_list, in the same order).
np.mean(results_B, axis=1)

array([0.74545394, 0.77126745, 0.74028171, 0.66948325, 0.70326888])

# Stacking group C - All techniques

In [9]:
# Group C: a single stacker fed every available probability column.
# One test macro-F1 value per fold.
results_C = np.zeros(folds)

for fold_idx in range(folds):
    # Only the test and validation pairs (positions 2..5) are used.
    label_test, probas_test, label_val, probas_val = load_dataset(dataset_name, fold_idx + 1)[2:]
    # Fit the meta-classifier on the validation split, evaluate on test.
    stacking.fit(probas_val, label_val)
    fold_predictions = stacking.predict(probas_test)
    results_C[fold_idx] = f1_score(label_test, fold_predictions, average='macro')

In [10]:
# Group C macro-F1 averaged over the folds.
np.mean(results_C)

0.7664351699131046

# Stacking group D - Proposed Selection scheme

In [19]:
# Group D: per-dataset, per-fold column selections of the proposed scheme.
# Each entry is a '|'-separated alternation passed as a regex to
# DataFrame.filter, so every token is searched as a substring of the column
# labels.
# NOTE(review): several tokens look truncated ('NB-TF', 'SVM-TF', 'CNN-TF',
# 'SM-W2V', 'SM-FAST', 'NN-W2V', 'NN-FAST', 'XTRA-TFIDF'). As substring
# patterns they may select more columns than intended, or none at all -
# confirm against the real column names.
names_D = {
    'zw': [
        'EXTRA-TFIDF|NB-W2V|NB-TF|KNN-GLOVE|MLP-W2V|CNN-W2V',
        'NB-FAST|CNN-W2V|MLP-GLOVE|SVM-TF|KNN-W2V|LR-TFIDF|EXTRA-TFIDF',
        'SVM-TF|RF-TFIDF|NB-GLOVE|KNN-FAST|NB-TFIDF|NB-W2V',
        'LR-W2V|NB-W2V|SVM-TFIDF|LR-GLOVE|LR-W2V',
        'LR-GLOVE|NB-W2V|SVM-TFIDF|EXTRA-TFIDF|MLP-FAST|LR-W2V',
    ],
    'td': [
        'NB-GLOVE|NB-CV|SM-W2V|NB-FAST|SVM-TFIDF',
        'CNN-FAST|NB-FAST|LR-FAST|SVM-CV|MLP-GLOVE',
        'MLP-W2V|LR-FAST|SVM-CV|CNN-W2V|NB-TFIDF|NB-W2V',
        'SVM-W2V|RF-CV|SVM-TFIDF|NB-FAST|NB-CV',
        'RF-CV|CNN-CV|MLP-FAST|CNN-FAST|NB-TFIDF',
    ],
    'union': [
        'MLP-W2V|NB-TFIDF|RF-TFIDF|KNN-TFIDF|SVM-CV|NN-W2V|NN-FAST|CNN-W2V|SM-FAST',
        'MLP-TFIDF|NB-CV|SVM-CV|CNN-TF|KNN-TFIDF|CNN-FAST|XTRA-TFIDF',
        'KNN-TFIDF|RF-CV|SVM-W2V|LR-GLOVE|NB-TFIDF|KNN-FAST|KNN-GLOVE|SVM-CV',
        'SVM-TFIDF|KNN-TFIDF|CNN-W2V|KNN-FAST|SVM-FAST|EXTRA-CV|KNN-W2V|NB-TFIDF|SVM-GLOVE|LR-W2V|MLP-FAST',
        'CNN-W2V|RF-TFIDF|NB-W2V|NB-TFIDF|MLP-TFIDF|MLP-W2V',
    ],
}

# Selections for the dataset chosen in the configuration cell, one per fold.
names = names_D[dataset_name]
# One test macro-F1 value per fold.
results_D = np.zeros(folds)

for fold_idx in range(folds):
    # Only the train and test pairs (positions 0..3) are used by this group.
    labels_train, probas_train, label_test, probas_test = load_dataset(dataset_name, fold_idx + 1)[:4]
    meta_train, meta_test = filter_df_train_test(probas_train, probas_test, names[fold_idx])
    # NOTE(review): unlike groups A-C, which fit the meta-classifier on the
    # validation split, this group fits it on the TRAIN split. Confirm this is
    # intentional before comparing the groups side by side.
    stacking.fit(meta_train, labels_train)
    fold_predictions = stacking.predict(meta_test)
    results_D[fold_idx] = f1_score(label_test, fold_predictions, average='macro')

In [20]:
# Group D macro-F1 averaged over the folds.
np.mean(results_D)

0.7753397725251462

## Summarizing results

In [13]:
# Single result column: the only meta-classifier evaluated is Logistic Regression.
all_stacking_names = ['Logistic Regression']

# Groups A and B contribute one row per base method (fold-averaged macro-F1),
# labelled with the method name plus its group.
algorithms_list_A = ['{} (Group A)'.format(alg) for alg in algorithms_list]
group_A_df = pd.DataFrame(
    results_A.mean(axis=1),
    index=algorithms_list_A,
    columns=all_stacking_names,
)

fe_list_B = ['{} (Group B)'.format(fe) for fe in fe_list]
group_B_df = pd.DataFrame(
    results_B.mean(axis=1),
    index=fe_list_B,
    columns=all_stacking_names,
)

# Groups C and D contribute a single row each: the fold-averaged macro-F1
# wrapped as a 1x1 table.
group_C_df = pd.DataFrame(
    [[results_C.mean(axis=0)]],
    index=['Group C'],
    columns=all_stacking_names,
)
group_D_df = pd.DataFrame(
    [[results_D.mean(axis=0)]],
    index=['Group D'],
    columns=all_stacking_names,
)

In [14]:
from pprint import pprint

# Stack the four group summaries into one table, rounded to 4 decimals.
summary_table = pd.concat([group_A_df, group_B_df, group_C_df, group_D_df]).round(4)
latex_caption = 'dataset ' + dataset_name.upper()
# NOTE(review): pprint displays the string's repr (quoted fragments, doubled
# backslashes); print() would emit LaTeX that can be pasted directly.
pprint(summary_table.to_latex(caption=latex_caption))

('\\begin{table}\n'
 '\\centering\n'
 '\\caption{dataset ZW}\n'
 '\\begin{tabular}{lr}\n'
 '\\toprule\n'
 '{} &  Logistic Regression \\\\\n'
 '\\midrule\n'
 'SVM (Group A)   &               0.7852 \\\\\n'
 'MLP (Group A)   &               0.7588 \\\\\n'
 'KNN (Group A)   &               0.5849 \\\\\n'
 'RF (Group A)    &               0.7732 \\\\\n'
 'EXTRA (Group A) &               0.7586 \\\\\n'
 'CNN (Group A)   &               0.7593 \\\\\n'
 'LR (Group A)    &               0.7757 \\\\\n'
 'NB (Group A)    &               0.7145 \\\\\n'
 'CV (Group B)    &               0.7455 \\\\\n'
 'TFIDF (Group B) &               0.7713 \\\\\n'
 'W2V (Group B)   &               0.7403 \\\\\n'
 'GLOVE (Group B) &               0.6695 \\\\\n'
 'FAST (Group B)  &               0.7033 \\\\\n'
 'Group C         &               0.7664 \\\\\n'
 'Group D         &               0.7753 \\\\\n'
 '\\bottomrule\n'
 '\\end{tabular}\n'
 '\\end{table}\n')
