In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, accuracy_score

In [3]:
def load_predictions(dataset_name, partition, fold_number=1):
    """Load the saved base-model probabilities of one partition of one fold.

    The file read is
    ``./Saved_Predict_and_Proba/Folds/<DATASET>/F<fold_number>/prob_<partition>.csv``
    where ``<DATASET>`` is ``dataset_name`` upper-cased.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier. It is upper-cased to build the path and compared
        case-insensitively to pick the label column, so 'zw' and 'ZW' behave
        the same.
    partition : str
        Partition suffix of the file name (the notebook uses 'train', 'val'
        and 'test').
    fold_number : int, default 1
        Fold directory number (``F1``, ``F2``, ...).

    Returns
    -------
    label : pandas.Series
        The label column: 'norm' for the 'zw' dataset, 'class' otherwise.
    methods : pandas.DataFrame
        Every remaining column, minus the saved index column 'Unnamed: 0'
        when the CSV contains one.

    Raises
    ------
    FileNotFoundError
        If the CSV does not exist.
    KeyError
        If the expected label column is missing from the CSV.
    """
    path = f'./Saved_Predict_and_Proba/Folds/{dataset_name.upper()}/F{fold_number}/prob_{partition}.csv'
    table_pred = pd.read_csv(path)

    # The path is case-insensitive w.r.t. dataset_name, so the label-column
    # choice must be too; otherwise 'ZW' reads the right file but then looks
    # for a 'class' column that does not exist.
    label_column = 'norm' if dataset_name.lower() == 'zw' else 'class'
    label = table_pred[label_column]

    # 'Unnamed: 0' only appears when the CSV was written with its index;
    # errors='ignore' keeps files saved with index=False loadable.
    methods = table_pred.drop(columns=[label_column, 'Unnamed: 0'], errors='ignore')
    return label, methods

def load_dataset(dataset_name, fold_number):
    """Load the train, validation and test predictions of a single fold.

    Each partition is read through ``load_predictions`` in the order
    'train', 'val', 'test'.

    Returns
    -------
    tuple
        ``(label_train, probas_train, label_test, probas_test,
        label_val, probas_val)`` -- note that the test pair comes before
        the validation pair.
    """
    by_partition = {
        partition: load_predictions(dataset_name, partition, fold_number)
        for partition in ('train', 'val', 'test')
    }
    # Flatten the (label, probas) pairs in the order callers unpack them.
    return (*by_partition['train'], *by_partition['test'], *by_partition['val'])

def filter_df_train_test(train_df, test_df, name, filter_first=True):
    """Select, from both frames, the columns whose label matches ``name``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to filter; the same pattern is applied to each.
    name : str
        Regular expression matched against the column labels
        (``DataFrame.filter(regex=...)``), so 'A|B' keeps columns matching
        either alternative.
    filter_first : bool, default True
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` restricted to the matching columns.
    """
    selected = [frame.filter(regex=name, axis=1) for frame in (train_df, test_df)]
    return selected[0], selected[1]

# Get dataset

In [4]:
# Experiment configuration shared by every stacking group below.
dataset_name = 'union'   # upper-cased by load_predictions to locate the fold folders
folds = 5                # fold directories F1..F5 are read
seed = 42                # fixed so that repeated runs give the same scores

# Elastic-net mixing values searched by cross-validation
# (0 = pure L2 penalty, 1 = pure L1 penalty).
l1_ratios = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Meta-learner fitted on the validation probabilities in each group.
# The 'saga' solver shuffles the data, so without random_state the reported
# F1 scores change from one run to the next.
stacking = LogisticRegressionCV(class_weight='balanced', l1_ratios=l1_ratios,
                                scoring='f1_macro', cv=10, n_jobs=-1, penalty='elasticnet', solver='saga',
                                random_state=seed)

## Stacking group A - Varying Classification methods

In [5]:
algorithms_list = ['SVM', 'MLP', 'KNN', 'RF', 'EXTRA', 'CNN', 'LR', 'NB']
# Rows follow algorithms_list, columns are folds; each entry is a macro-F1 on the test split.
results_A = np.zeros((len(algorithms_list), folds))

for fold_idx in range(folds):
    # Fold directories are 1-based; the train pair (first two items) is not used here.
    fold_data = load_dataset(dataset_name, fold_idx + 1)
    label_test, probas_test, label_val, probas_val = fold_data[2:]
    for row, algorithm in enumerate(algorithms_list):
        # Keep only the probability columns whose name matches this algorithm.
        X_val, X_test = filter_df_train_test(probas_val, probas_test, algorithm)
        # The meta-learner is fitted on the validation split and scored on the test split.
        y_pred = stacking.fit(X_val, label_val).predict(X_test)
        results_A[row, fold_idx] = f1_score(label_test, y_pred, average='macro')

In [6]:
# Persist the group A scores (np.save appends '.npy' to the file name), then
# leave the array as the cell's last expression so the notebook displays it.
np.save('results_A_'+dataset_name, results_A)
results_A

## Stacking group B - Varying Feature Representation methods

In [7]:
fe_list = ['CV', 'TFIDF', 'W2V', 'GLOVE', 'FAST']
# Rows follow fe_list, columns are folds; each entry is a macro-F1 on the test split.
results_B = np.zeros((len(fe_list), folds))

for fold_idx in range(folds):
    # Fold directories are 1-based; the train pair (first two items) is not used here.
    fold_data = load_dataset(dataset_name, fold_idx + 1)
    label_test, probas_test, label_val, probas_val = fold_data[2:]

    for row, fe in enumerate(fe_list):
        # Keep only the probability columns whose name matches this feature representation.
        X_val, X_test = filter_df_train_test(probas_val, probas_test, fe)
        y_pred = stacking.fit(X_val, label_val).predict(X_test)
        results_B[row, fold_idx] = f1_score(label_test, y_pred, average='macro')

In [8]:
# Persist the group B scores (np.save appends '.npy' to the file name), then
# leave the array as the cell's last expression so the notebook displays it.
np.save('results_B_'+dataset_name, results_B)
results_B

## Stacking group C - All techniques

In [9]:
# One macro-F1 per fold, using every available probability column.
results_C = np.zeros(folds)

for fold_idx, fold_number in enumerate(range(1, folds + 1)):
    # The train pair (first two items) is not used here.
    fold_data = load_dataset(dataset_name, fold_number)
    label_test, probas_test, label_val, probas_val = fold_data[2:]
    # Fit on the full validation frame, score on the full test frame.
    y_pred = stacking.fit(probas_val, label_val).predict(probas_test)
    results_C[fold_idx] = f1_score(label_test, y_pred, average='macro')

In [10]:
# Persist the group C scores (np.save appends '.npy' to the file name), then
# leave the array as the cell's last expression so the notebook displays it.
np.save('results_C_'+dataset_name, results_C)
results_C

## Stacking group D - Proposed Selection scheme

In [11]:
# NOTE(review): this whole cell is disabled (commented out).
# `names_B` maps a dataset name to one regex string per fold; names[fold] is the
# pattern passed to filter_df_train_test. Within each dataset the five strings
# are identical.
# Only 'zw', 'td' and 'td_zw' have entries, so with dataset_name = 'union' the
# lookup `names_B[dataset_name]` would raise KeyError if this were re-enabled as-is.
# names_B = {}
# names_B['zw'] = [
#     'CNN-W2V|MLP-CV|SVM-TFIDF|KNN-GLOVE|LR-FAST|MLP-GLOVE|SVM-CV|CNN-CV|KNN-CV|MLP-FAST|RF-W2V|CNN-FAST|RF-FAST',
#     'CNN-W2V|MLP-CV|SVM-TFIDF|KNN-GLOVE|LR-FAST|MLP-GLOVE|SVM-CV|CNN-CV|KNN-CV|MLP-FAST|RF-W2V|CNN-FAST|RF-FAST',
#     'CNN-W2V|MLP-CV|SVM-TFIDF|KNN-GLOVE|LR-FAST|MLP-GLOVE|SVM-CV|CNN-CV|KNN-CV|MLP-FAST|RF-W2V|CNN-FAST|RF-FAST',
#     'CNN-W2V|MLP-CV|SVM-TFIDF|KNN-GLOVE|LR-FAST|MLP-GLOVE|SVM-CV|CNN-CV|KNN-CV|MLP-FAST|RF-W2V|CNN-FAST|RF-FAST',
#     'CNN-W2V|MLP-CV|SVM-TFIDF|KNN-GLOVE|LR-FAST|MLP-GLOVE|SVM-CV|CNN-CV|KNN-CV|MLP-FAST|RF-W2V|CNN-FAST|RF-FAST',
#                 ]


# names_B['td'] = [
#     'MLP-FAST|KNN-GLOVE|KNN-FAST|KNN-CV|MLP-W2V|NB-CV|CNN-TFIDF|NB-FAST|RF-CV|LR-CV|CNN-CV|KNN-W2V|CNN-GLOVE',
#     'MLP-FAST|KNN-GLOVE|KNN-FAST|KNN-CV|MLP-W2V|NB-CV|CNN-TFIDF|NB-FAST|RF-CV|LR-CV|CNN-CV|KNN-W2V|CNN-GLOVE',
#     'MLP-FAST|KNN-GLOVE|KNN-FAST|KNN-CV|MLP-W2V|NB-CV|CNN-TFIDF|NB-FAST|RF-CV|LR-CV|CNN-CV|KNN-W2V|CNN-GLOVE',
#     'MLP-FAST|KNN-GLOVE|KNN-FAST|KNN-CV|MLP-W2V|NB-CV|CNN-TFIDF|NB-FAST|RF-CV|LR-CV|CNN-CV|KNN-W2V|CNN-GLOVE',
#     'MLP-FAST|KNN-GLOVE|KNN-FAST|KNN-CV|MLP-W2V|NB-CV|CNN-TFIDF|NB-FAST|RF-CV|LR-CV|CNN-CV|KNN-W2V|CNN-GLOVE',
# ]
# names_B ['td_zw']= [
#     'EXTRA-CV|CNN-FAST|MLP-W2V|LR-CV|MLP-FAST|NB-CV|CNN-CV|LR-GLOVE|SVM-GLOVE|RF-FAST|RF-TFIDF|SVM-CV|KNN-CV',
#     'EXTRA-CV|CNN-FAST|MLP-W2V|LR-CV|MLP-FAST|NB-CV|CNN-CV|LR-GLOVE|SVM-GLOVE|RF-FAST|RF-TFIDF|SVM-CV|KNN-CV',
#     'EXTRA-CV|CNN-FAST|MLP-W2V|LR-CV|MLP-FAST|NB-CV|CNN-CV|LR-GLOVE|SVM-GLOVE|RF-FAST|RF-TFIDF|SVM-CV|KNN-CV',
#     'EXTRA-CV|CNN-FAST|MLP-W2V|LR-CV|MLP-FAST|NB-CV|CNN-CV|LR-GLOVE|SVM-GLOVE|RF-FAST|RF-TFIDF|SVM-CV|KNN-CV',
#     'EXTRA-CV|CNN-FAST|MLP-W2V|LR-CV|MLP-FAST|NB-CV|CNN-CV|LR-GLOVE|SVM-GLOVE|RF-FAST|RF-TFIDF|SVM-CV|KNN-CV',
#                    ]

# names = names_B[dataset_name]
# results_D = np.zeros(folds)

# for fold in range(folds):
#     _, _, label_test, probas_test, label_val, probas_val = load_dataset(dataset_name, fold+1)
#     X_val, X_test = filter_df_train_test(probas_val, probas_test, names[fold])
#     stacking.fit(X_val, label_val)
#     y_pred = stacking.predict(X_test)
#     results_D[fold] = f1_score(label_test, y_pred, average='macro')

## Summarizing results

In [12]:
# NOTE(review): disabled summary cell. Issues to resolve before re-enabling:
#   - `all_stacking_names` is not defined anywhere in the visible notebook;
#   - `group_B_df` is assigned twice (the second assignment would overwrite the first);
#   - `group_C_df` is built from `results_D`, not `results_C` -- looks like a copy/paste slip, confirm;
#   - `.reshape(1, 1)` only works on a single value, while results_B / results_D hold one score per fold;
#   - `results_D` only exists if the (also disabled) group D cell has been run.
# algorithms_list_A = [alg + ' (Group A)' for alg in algorithms_list]
# group_A_df = pd.DataFrame(results_A, columns=all_stacking_names, index=algorithms_list_A)

# fe_list_B = [fe + ' (Group B)' for fe in fe_list]
# group_B_df = pd.DataFrame(results_B, columns=all_stacking_names, index=fe_list_B)

# group_B_df = pd.DataFrame(results_B.reshape(1, 1), columns=all_stacking_names, index=['Group B'])
# group_C_df = pd.DataFrame(results_D.reshape(1, 1), columns=all_stacking_names, index=['Group C'])
# group_D_df = pd.DataFrame(results_D.reshape(1, 1), columns=all_stacking_names, index=['Group D'])

In [13]:
# NOTE(review): disabled LaTeX export. It concatenates group_A_df .. group_D_df,
# which are only created by the commented-out summary cell, so it cannot run on its own.
# from pprint import pprint
# pprint(pd.concat([group_A_df, group_B_df, group_C_df, group_D_df]).round(4).to_latex(caption='dataset '+ dataset_name.upper()))