In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import spacy
import gensim
import nltk
import scipy
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    recall_score,
    precision_score,
    roc_auc_score, confusion_matrix, roc_curve, classification_report
)
from sklearn.model_selection import cross_validate
from sklearn.base import clone as sklearn_clone
from sklearn.model_selection import KFold

from tqdm import tqdm
tqdm.pandas()

import random

import sys
sys.path.append('..')

from helpers.model import (
    balance_data
)

## Load data

In [2]:
# Tabular feature sets stored as parquet files.
df_styl = pd.read_parquet(
    '../datasets/used_data/02_classical_ml/03_01_statements_styllometric_features.parquet'
)
df_pos = pd.read_parquet(
    '../datasets/used_data/02_classical_ml/03_02_statements_POS_ngrams.parquet'
)
df_ngram = pd.read_parquet(
    '../datasets/used_data/02_classical_ml/03_03_statements_words_ngrams.parquet'
)

# The HerBERT / RoBERTa arrays are stored as raw .npy files (presumably
# sentence embeddings -- confirm). Each is wrapped in a DataFrame so it can be
# handled like the tabular sets above.
with open('../datasets/used_data/02_classical_ml/03_04_statements_herbert.npy', 'rb') as f:
    herbert_array = np.load(f)
df_herbert = pd.DataFrame(herbert_array)

with open('../datasets/used_data/02_classical_ml/03_05_statements_roberta.npy', 'rb') as f:
    roberta_array = np.load(f)
df_roberta = pd.DataFrame(roberta_array)

In [3]:
# Quick look at the (rows, columns) shape of the loaded feature sets.
# NOTE(review): df_roberta is loaded as well but its shape is not printed here.
print(df_styl.shape, df_pos.shape, df_ngram.shape, df_herbert.shape)

(6529, 30) (6529, 2647) (6529, 2771) (6529, 1024)


In [4]:
# Target labels, taken from the stylometric table; the column is spelled
# 'assestment' in the data, so the name must be used verbatim.
y_train = df_styl['assestment']

In [5]:
# Class distribution of the target (number of rows per label).
y_train.value_counts()

assestment
1    3434
0    3095
Name: count, dtype: int64

In [6]:
# Remove the label column from every tabular feature set so it cannot leak
# into the model inputs. `drop(..., errors='ignore')` is used instead of the
# in-place `pop` so that re-running this cell is a no-op rather than a KeyError.
df_styl = df_styl.drop(columns=['assestment'], errors='ignore')
df_pos = df_pos.drop(columns=['assestment'], errors='ignore')
df_ngram = df_ngram.drop(columns=['assestment'], errors='ignore')

## Prepare datasets (no balancing is applied)

In [7]:
# Working names used by the experiments below: one feature matrix per
# representation plus the shared target. These are plain references to the
# loaded objects, not copies.
X_train_u, X_pos_u, X_ngram_u = df_styl, df_pos, df_ngram
X_herbert_u, X_roberta_u = df_herbert, df_roberta

y_train_u = y_train

In [8]:
# Pull the string column 'TEXT_WORD' out of the n-gram table and keep its
# space-separated tokens in `words`. `pop` removes the column in place, so the
# guard makes the cell safe to re-run (an unguarded second run raises KeyError).
if 'TEXT_WORD' in X_ngram_u.columns:
    words = X_ngram_u.pop('TEXT_WORD').str.split(' ').values

## CV creation
### Topics

In [12]:
# Load the topic assignment of every row.
with open('../datasets/used_data/03_bert_like_models/02_topics.npy', 'rb') as f:
    topics = np.load(f).tolist()

# Frame with the label (copied from the target), its topic and a constant
# counter column `n`.
y_train_u_topics = y_train_u.copy().to_frame().assign(topic=topics, n=1)

### Kfold

In [13]:
# Topic-grouped folds: the distinct topics are partitioned into 10 groups and
# each group in turn becomes the test set, so the rows of a topic never appear
# in both the train and the test side of the same fold.
cv_fold = []
cv_fold_i = []  # kept for compatibility; the positional-index variant is not built

# np.array_split (unlike reshape(10, -1)) also works when the number of
# distinct topics is not a multiple of 10; when it is, the groups are identical.
topic_groups = np.array_split(y_train_u_topics['topic'].unique(), 10)

for i in topic_groups:
    if len(i) == 0:
        # Fewer than 10 distinct topics: skip folds that would have no test rows.
        continue

    in_fold = np.isin(y_train_u_topics["topic"], i)
    train_cv = X_train_u.index[~in_fold].values
    test_cv = X_train_u.index[in_fold].values

    cv_fold.append([train_cv, test_cv])

In [14]:
# Plain shuffled 10-fold split (topic-agnostic).
# random_state pins the shuffle so the folds -- and therefore the reported
# scores -- are reproducible across kernel restarts; 111 matches the seed used
# elsewhere in this notebook.
kf = KFold(n_splits=10, shuffle=True, random_state=111)

cv_Kfold = []
cv_Kfold_i = []  # kept for compatibility; the positional-index variant is not built

for train_index, test_index in kf.split(X_train_u):
    # kf.split yields row positions; convert them to index labels, which is
    # what run_experiment selects rows by.
    train_cv = X_train_u.index[train_index].values
    test_cv = X_train_u.index[test_index].values

    cv_Kfold.append([train_cv, test_cv])

## Run experiments

In [15]:
def run_experiment(X, y, cv, clf_org, r_min=0.05):
    """Cross-validate a classifier with optional per-fold correlation filtering.

    For every ``(train_labels, test_labels)`` pair in ``cv`` a fresh clone of
    ``clf_org`` is fitted on the training rows and scored on the test rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Rows are selected by index label.
    y : pandas.Series
        Binary target. Rows are selected by index label.
    cv : iterable of (train_labels, test_labels)
        Index labels (not positions) of the rows belonging to each split.
    clf_org : estimator
        Unfitted scikit-learn compatible classifier; it is cloned per fold and
        never fitted itself.
    r_min : float or None, default 0.05
        When truthy, only non-constant columns whose Pearson correlation with
        the target (computed on the training rows of the fold) is strictly
        greater than ``r_min`` are used. ``None`` (or ``0``) disables the
        filter and all columns are used.

    Returns
    -------
    dict
        ``"Accuracy"`` and ``"F1 Score"``: arrays with one value per fold.
        ``"Precision"`` and ``"Recall"``: mean over folds (scalars).
        ``"Cols used"``: columns used in the LAST fold only (empty list when
        ``cv`` is empty).
        ``"Confusion matrix"``: confusion matrices summed over all folds, with
        rows/columns ordered as ``np.unique(y)``.
    """
    results = {
        'test_accuracy': [],
        'test_precision': [],
        'test_recall': [],
        'test_f1': [],
    }

    # Pin the label order so every fold produces a confusion matrix of the same
    # shape. Without it, a fold whose truth and predictions contain a single
    # class yields a 1x1 matrix that silently broadcasts into the accumulator.
    labels = np.unique(y)
    c_matrix = np.zeros((len(labels), len(labels)))

    # Bound up front so the returned dict is well-formed even for an empty `cv`.
    col_keep = []

    for train_cv, test_cv in cv:
        clf = sklearn_clone(clf_org)

        X_train_t = X[X.index.isin(train_cv)]
        y_train_t = y[y.index.isin(train_cv)]
        X_test_t = X[X.index.isin(test_cv)]
        y_test_t = y[y.index.isin(test_cv)]

        if r_min:
            # Feature selection uses the training rows only, so nothing from
            # the test fold influences which columns are kept.
            # NOTE(review): the test is `r > r_min`, not `abs(r) > r_min`, so
            # negatively correlated columns are dropped too -- confirm intended.
            col_keep = []
            for c in X_train_t.columns:
                col_values = X_train_t[c].values

                # Constant columns are skipped before calling pearsonr.
                if col_values.min() < col_values.max():
                    r = scipy.stats.pearsonr(col_values, y_train_t)[0]
                    if not np.isnan(r) and r > r_min:
                        col_keep.append(c)

            if len(col_keep) == 0:
                print('No values returned')

            X_train_t = X_train_t[col_keep]
            X_test_t = X_test_t[col_keep]
        else:
            col_keep = X_train_t.columns.values.tolist()

        clf.fit(X_train_t, y_train_t)
        y_pred = clf.predict(X_test_t)

        c_matrix += confusion_matrix(y_test_t, y_pred, labels=labels)

        results['test_accuracy'].append(accuracy_score(y_test_t, y_pred))
        results['test_precision'].append(precision_score(y_test_t, y_pred))
        results['test_recall'].append(recall_score(y_test_t, y_pred))
        results['test_f1'].append(f1_score(y_test_t, y_pred))

    # Precision/Recall stay aggregated to their mean (as before) so existing
    # callers keep receiving scalars; Accuracy/F1 are returned per fold.
    metrics = {
        "Accuracy": np.array(results['test_accuracy']),
        "Precision": np.array(results['test_precision']).mean(),
        "Recall": np.array(results['test_recall']).mean(),
        "F1 Score": np.array(results['test_f1']),
        "Cols used": col_keep,
        "Confusion matrix": c_matrix,
    }

    return metrics

## Topics Kfold

In [16]:
# Seed both global RNGs: `random.seed` only affects Python's stdlib generator,
# while NumPy-based code that does not receive an explicit random_state draws
# from NumPy's global generator.
random.seed(111)
np.random.seed(111)

In [22]:
# XGBoost configuration shared by every experiment below.
params_xgb = dict(
    n_estimators=300,       # number of boosting rounds
    learning_rate=0.05,     # shrinkage applied to each boosting step
    max_depth=5,            # depth limit of a single tree
    min_child_weight=10,    # minimum summed instance weight required in a child
    subsample=0.9,          # fraction of rows sampled for each tree
    colsample_bytree=0.9,   # fraction of columns sampled for each tree
    gamma=1,                # minimum loss reduction needed to split a leaf further
    random_state=111,       # fixed seed for reproducibility
)

clf_xgb = xgb.XGBClassifier(**params_xgb)

In [23]:
# Topic-grouped CV: score every feature representation with each classifier.
# Each feature-set row is (feature matrix, fixed-width report label,
# correlation threshold passed to run_experiment, or None for no filtering).
feature_sets = [
    (X_ngram_u, 'ngrams  ', 0.05),
    (X_train_u, 'features', None),
    (X_pos_u, 'pos     ', 0.05),
    (X_herbert_u, 'herbert ', None),
    (X_roberta_u, 'roberta ', None),
]
classifiers = [(clf_xgb, 'xgb  ')]

for clf_used, clf_name in classifiers:
    for X_used, x_name, r_min in feature_sets:
        out = run_experiment(X_used, y_train_u, cv_fold, clf_used, r_min)

        # Mean and standard deviation across folds.
        accuracy_report = f'Accuracy {out["Accuracy"].mean():.2f}+-{out["Accuracy"].std():.2f}'
        f1_report = f'F1 Score {out["F1 Score"].mean():.2f}+-{out["F1 Score"].std():.2f}'
        print(x_name, clf_name, accuracy_report, f1_report)

    # Blank line between classifiers.
    print()

ngrams   xgb   Accuracy 0.49+-0.03 F1 Score 0.38+-0.17
features xgb   Accuracy 0.55+-0.02 F1 Score 0.60+-0.04
pos      xgb   Accuracy 0.52+-0.02 F1 Score 0.54+-0.03
herbert  xgb   Accuracy 0.69+-0.02 F1 Score 0.69+-0.07
roberta  xgb   Accuracy 0.70+-0.02 F1 Score 0.70+-0.07



## Random Kfold

In [24]:
# Random shuffled K-fold CV: score every feature representation with each
# classifier. Each feature-set row is (feature matrix, fixed-width report
# label, correlation threshold passed to run_experiment, or None for no
# filtering).
feature_sets = [
    (X_ngram_u, 'ngrams  ', 0.05),
    (X_train_u, 'features', None),
    (X_pos_u, 'pos     ', 0.05),
    (X_herbert_u, 'herbert ', None),
    (X_roberta_u, 'roberta ', None),
]
classifiers = [(clf_xgb, 'xgb  ')]

for clf_used, clf_name in classifiers:
    for X_used, x_name, r_min in feature_sets:
        out = run_experiment(X_used, y_train_u, cv_Kfold, clf_used, r_min)

        # Mean and standard deviation across folds.
        accuracy_report = f'Accuracy {out["Accuracy"].mean():.2f}+-{out["Accuracy"].std():.2f}'
        f1_report = f'F1 Score {out["F1 Score"].mean():.2f}+-{out["F1 Score"].std():.2f}'
        print(x_name, clf_name, accuracy_report, f1_report)

    # Blank line between classifiers.
    print()

ngrams   xgb   Accuracy 0.52+-0.02 F1 Score 0.36+-0.16
features xgb   Accuracy 0.56+-0.01 F1 Score 0.60+-0.02
pos      xgb   Accuracy 0.54+-0.01 F1 Score 0.55+-0.03
herbert  xgb   Accuracy 0.70+-0.02 F1 Score 0.71+-0.02
roberta  xgb   Accuracy 0.71+-0.02 F1 Score 0.72+-0.02

