In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, accuracy_score

In [3]:
def load_predictions(dataset_name, partition, fold_number=1):
    """Load the saved per-model probabilities for one partition of one fold.

    Reads ``./Saved_Predict_and_Proba/Folds/<DATASET>/F<fold>/prob_<partition>.csv``
    (the dataset name is upper-cased for the directory) and separates the
    label column from the remaining columns.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier. The label column is ``'norm'`` when this is
        exactly ``'zw'`` and ``'class'`` for any other value.
    partition : str
        Partition suffix of the file name (the callers in this notebook use
        ``'train'``, ``'val'`` and ``'test'``).
    fold_number : int, default 1
        Fold index used to build the ``F<fold_number>`` directory name.

    Returns
    -------
    label : pandas.Series
        The label column of the file.
    methods : pandas.DataFrame
        Every other column, minus the ``'Unnamed: 0'`` index artefact when
        it is present.

    Raises
    ------
    FileNotFoundError
        If the CSV file does not exist.
    KeyError
        If the expected label column is missing from the file.
    """
    path = ('./Saved_Predict_and_Proba/Folds/' + dataset_name.upper()
            + '/F' + str(fold_number) + '/prob_' + partition + '.csv')
    table_pred = pd.read_csv(path)

    label_column = 'norm' if dataset_name == 'zw' else 'class'
    # Indexing first keeps the KeyError for a missing label column explicit.
    label = table_pred[label_column]

    # 'Unnamed: 0' only exists when the CSV was written together with its
    # index; errors='ignore' keeps files saved with index=False loadable
    # instead of failing with a KeyError.
    methods = table_pred.drop(columns=[label_column, 'Unnamed: 0'], errors='ignore')
    return label, methods

def load_dataset(dataset_name, fold_number):
    """Load the train, validation and test predictions of one fold.

    Each partition is read through ``load_predictions``. Note the return
    order: train, then test, then validation.

    Returns
    -------
    tuple
        ``(label_train, probas_train, label_test, probas_test,
        label_val, probas_val)``.
    """
    loaded = {}
    # Files are read in train -> val -> test order.
    for partition in ('train', 'val', 'test'):
        loaded[partition] = load_predictions(dataset_name, partition, fold_number)
    return (*loaded['train'], *loaded['test'], *loaded['val'])

def filter_df_train_test(train_df, test_df, name, filter_first=True):
    """Keep, in both frames, only the columns whose label matches ``name``.

    ``name`` is handed to ``DataFrame.filter`` as a regular expression, so a
    column is kept when the pattern is found anywhere in its label and
    alternatives can be combined with ``|``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to be filtered with the same pattern.
    name : str
        Regular expression applied to the column labels.
    filter_first : bool, default True
        Accepted but not used by this function.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(filtered_train, filtered_test)``.
    """
    selected = tuple(
        frame.filter(regex=name, axis=1) for frame in (train_df, test_df)
    )
    return selected

# Get dataset

In [4]:
# Dataset whose saved fold predictions are evaluated; it is passed to
# load_dataset and also selects the group-D column patterns (names_D).
dataset_name = 'td'
# Number of folds iterated by every experiment loop (files F1 .. F<folds>).
folds = 5
# NOTE(review): l1_ratios is not referenced by any cell visible in this
# notebook - presumably a leftover from an elastic-net experiment; confirm
# before removing.
l1_ratios = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# Meta-learner shared by all stacking groups; every experiment re-fits this
# same instance in place.
# NOTE(review): the first cell silences all warnings, so a convergence
# warning from this default-configured model would not be shown - verify.
stacking = LogisticRegression()

# Stacking group A - Varying Classification methods

In [5]:
# Group A: for every algorithm tag, fit the meta-learner on the validation
# probabilities of the columns matching that tag and score it on the test
# partition of the same fold (macro F1). Rows = algorithms, columns = folds.
algorithms_list = ['SVM', 'MLP', 'KNN', 'RF', 'EXTRA', 'CNN', 'LR', 'NB']
results_A = np.zeros((len(algorithms_list), folds))

for fold in range(folds):
    # The train partition is loaded but not used here.
    _, _, label_test, probas_test, label_val, probas_val = load_dataset(
        dataset_name, fold + 1
    )
    for idx_alg, algorithm in enumerate(algorithms_list):
        X_val, X_test = filter_df_train_test(probas_val, probas_test, algorithm)
        stacking.fit(X_val, label_val)
        y_pred = stacking.predict(X_test)
        score = f1_score(label_test, y_pred, average='macro')
        results_A[idx_alg, fold] = score

In [6]:
results_A

array([[0.72499785, 0.72738305, 0.72896891, 0.72817418, 0.71753791],
       [0.71712056, 0.70790836, 0.70759637, 0.6932822 , 0.70336251],
       [0.66245114, 0.63696903, 0.64712885, 0.65361636, 0.65255526],
       [0.64351756, 0.62109717, 0.6448892 , 0.646823  , 0.64578097],
       [0.62030735, 0.59478799, 0.61959701, 0.61052197, 0.62355493],
       [0.70989446, 0.72922028, 0.73669174, 0.70863952, 0.71167033],
       [0.71533912, 0.72193981, 0.72196937, 0.72927139, 0.72938323],
       [0.68561465, 0.67902618, 0.68002023, 0.68178027, 0.68183796]])

# Stacking group B - Varying Feature Representation methods

In [7]:
# Group B: same protocol as the per-algorithm experiment, but the columns
# are selected by feature-representation tag. Rows = representations,
# columns = folds.
fe_list = ['CV', 'TFIDF', 'W2V', 'GLOVE', 'FAST']
results_B = np.zeros((len(fe_list), folds))

for fold in range(folds):
    # Only the validation and test partitions are used.
    _, _, label_test, probas_test, label_val, probas_val = load_dataset(
        dataset_name, fold + 1
    )
    for idx_alg, fe in enumerate(fe_list):
        X_val, X_test = filter_df_train_test(probas_val, probas_test, fe)
        # Fit on validation probabilities, evaluate on test probabilities.
        stacking.fit(X_val, label_val)
        y_pred = stacking.predict(X_test)
        results_B[idx_alg, fold] = f1_score(
            label_test, y_pred, average='macro'
        )

In [8]:
results_B

array([[0.71125274, 0.70869284, 0.7080715 , 0.69872621, 0.68131803],
       [0.66675735, 0.62997971, 0.65816042, 0.6565835 , 0.68335896],
       [0.61562245, 0.60806151, 0.61622149, 0.60488158, 0.61466371],
       [0.57021727, 0.55503841, 0.56580129, 0.56029737, 0.55898808],
       [0.57539992, 0.5956165 , 0.62581251, 0.60126891, 0.596084  ]])

# Stacking group C - All techniques

In [9]:
# Group C: the meta-learner sees every probability column at once
# (no column filtering). One macro-F1 score per fold.
results_C = np.zeros(folds)

for fold in range(folds):
    _, _, label_test, probas_test, label_val, probas_val = load_dataset(
        dataset_name, fold + 1
    )
    # Fit on the full validation table, then predict the full test table.
    stacking.fit(probas_val, label_val)
    y_pred = stacking.predict(probas_test)
    results_C[fold] = f1_score(label_test, y_pred, average='macro')

In [10]:
results_C

array([0.63682819, 0.63148139, 0.6392035 , 0.63272825, 0.65272228])

# Stacking group D - Proposed Selection scheme

In [19]:
# Per-dataset column selections for stacking group D: one entry per fold
# (index 0 -> fold 1, ...). Each entry is later used as a regular expression
# over the probability column names, with '|' separating the alternatives.
#
# NOTE(review): several alternatives use tags that are in neither
# algorithms_list nor fe_list: 'SM-W2V' (td, fold 1), 'NN-W2V' / 'NN-FAST' /
# 'SM-FAST' (union, fold 1), 'XTRA-TFIDF' (union, fold 2), and the '-TF'
# suffix ('NB-TF', 'SVM-TF', 'CNN-TF'). They look like truncated names;
# because matching is a regex search, a truncated tag may match several
# columns or none at all. Confirm against the actual CSV column names
# before changing them, since the reported scores depend on these strings.
names_D = {}
names_D['zw'] = [
     'EXTRA-TFIDF|NB-W2V|NB-TF|KNN-GLOVE|MLP-W2V|CNN-W2V',
     'NB-FAST|CNN-W2V|MLP-GLOVE|SVM-TF|KNN-W2V|LR-TFIDF|EXTRA-TFIDF',
     'SVM-TF|RF-TFIDF|NB-GLOVE|KNN-FAST|NB-TFIDF|NB-W2V',
     # NOTE(review): 'LR-W2V' appears twice in the next pattern (redundant).
     'LR-W2V|NB-W2V|SVM-TFIDF|LR-GLOVE|LR-W2V',
     'LR-GLOVE|NB-W2V|SVM-TFIDF|EXTRA-TFIDF|MLP-FAST|LR-W2V',
                 ]


names_D['td'] = [
     'NB-GLOVE|NB-CV|SM-W2V|NB-FAST|SVM-TFIDF',
     'CNN-FAST|NB-FAST|LR-FAST|SVM-CV|MLP-GLOVE',
     'MLP-W2V|LR-FAST|SVM-CV|CNN-W2V|NB-TFIDF|NB-W2V',
     'SVM-W2V|RF-CV|SVM-TFIDF|NB-FAST|NB-CV',
     'RF-CV|CNN-TF|MLP-FAST|CNN-FAST|NB-TFIDF',
 ]
names_D ['union']= [
     'MLP-W2V|NB-TFIDF|RF-TFIDF|KNN-TFIDF|SVM-CV|NN-W2V|NN-FAST|CNN-W2V|SM-FAST',
     'MLP-TFIDF|NB-CV|SVM-CV|CNN-TF|KNN-TFIDF|CNN-FAST|XTRA-TFIDF',
     'KNN-TFIDF|RF-CV|SVM-W2V|LR-GLOVE|NB-TFIDF|KNN-FAST|KNN-GLOVE|SVM-CV',
     'SVM-TFIDF|KNN-TFIDF|CNN-W2V|KNN-FAST|SVM-FAST|EXTRA-CV|KNN-W2V|NB-TFIDF|SVM-GLOVE|LR-W2V|MLP-FAST',
     'CNN-W2V|RF-TFIDF|NB-W2V|NB-TFIDF|MLP-TFIDF|MLP-W2V',
]

# Group D: per fold, keep only the columns matched by that fold's selection
# pattern, fit the meta-learner on validation and score macro F1 on test.
names = names_D[dataset_name]
results_D = np.zeros(folds)

for fold in range(folds):
    _, _, label_test, probas_test, label_val, probas_val = load_dataset(
        dataset_name, fold + 1
    )
    # names[fold] is the regex selecting this fold's column subset.
    X_val, X_test = filter_df_train_test(probas_val, probas_test, names[fold])
    stacking.fit(X_val, label_val)
    y_pred = stacking.predict(X_test)
    results_D[fold] = f1_score(label_test, y_pred, average='macro')

In [20]:
results_D.mean()

0.7293440073486361

## Summarizing results

In [13]:
# algorithms_list_A = [alg + ' (Group A)' for alg in algorithms_list]
# group_A_df = pd.DataFrame(results_A, columns=all_stacking_names, index=algorithms_list_A)

# fe_list_B = [fe + ' (Group B)' for fe in fe_list]
# group_B_df = pd.DataFrame(results_B, columns=all_stacking_names, index=fe_list_B)

# group_B_df = pd.DataFrame(results_B.reshape(1, 1), columns=all_stacking_names, index=['Group B'])
# group_C_df = pd.DataFrame(results_D.reshape(1, 1), columns=all_stacking_names, index=['Group C'])
# group_D_df = pd.DataFrame(results_D.reshape(1, 1), columns=all_stacking_names, index=['Group D'])

In [14]:
# from pprint import pprint
# pprint(pd.concat([group_A_df, group_B_df, group_C_df, group_D_df]).round(4).to_latex(caption='dataset '+ dataset_name.upper()))