In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import spacy
import gensim
import nltk
import scipy
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    recall_score,
    precision_score,
    roc_auc_score, confusion_matrix, roc_curve, classification_report
)
from sklearn.model_selection import cross_validate
from sklearn.base import clone as sklearn_clone
from sklearn.model_selection import KFold

from tqdm import tqdm
tqdm.pandas()

import random

import sys
sys.path.append('..')

from helpers.model import (
    balance_data
)

## Load data

In [2]:
# Three tabular feature sets stored as parquet. At this point each of them
# still contains the 'assestment' label column (it is removed further down).
df_styl = pd.read_parquet(
    '../datasets/used_data/02_classical_ml/04_01_political_polarization_10p_styllometric_features.parquet'
)
df_pos = pd.read_parquet(
    '../datasets/used_data/02_classical_ml/04_02_political_polarization_10p_POS_ngrams.parquet'
)
df_ngram = pd.read_parquet(
    '../datasets/used_data/02_classical_ml/04_03_political_polarization_10p_words_ngrams.parquet'
)

# The fourth feature set is a raw NumPy array on disk (presumably HerBERT
# embeddings, judging by the file name); it is wrapped in a DataFrame so it
# can be handled like the other tables.
with open('../datasets/used_data/02_classical_ml/04_05_political_polarization_10p_herbert.npy', 'rb') as fh:
    herbert_array = np.load(fh)
df_herbert = pd.DataFrame(herbert_array)

# Disabled: the analogous RoBERTa array.
# with open('../datasets/used_data/02_classical_ml/04_05_political_polarization_10p_roberta.npy', 'rb') as f:
#     df_roberta = pd.DataFrame(np.load(f))

In [3]:
# Sanity check: (rows, columns) of every feature table, printed on one line.
print(*(frame.shape for frame in (df_styl, df_pos, df_ngram, df_herbert)))

(20000, 28) (20000, 4999) (20000, 6537) (20000, 1024)


In [4]:
# Target labels: the 'assestment' column of the stylometric table, taken here
# before that column is removed from the feature frames.
y_train = df_styl['assestment']

In [5]:
# Class distribution of the target labels.
y_train.value_counts()

assestment
0    12096
1     7904
Name: count, dtype: int64

In [6]:
# Remove the label column from every parquet-backed feature table, in place,
# so it cannot leak into the model inputs. The membership check makes the cell
# safe to re-run: a bare .pop() raises KeyError once the column is gone.
for frame in (df_styl, df_pos, df_ngram):
    if 'assestment' in frame.columns:
        frame.pop('assestment')

## Make balanced datasets

In [7]:
# The "_u" names are plain references to the loaded tables (no copy and no
# re-balancing happens here), so in-place changes to one name are visible
# through the other.
X_train_u, X_pos_u, X_ngram_u, X_herbert_u = df_styl, df_pos, df_ngram, df_herbert
# X_roberta_u = df_roberta

# Labels under the matching "_u" name.
y_train_u = y_train

In [8]:
# Detach the 'TEXT_WORD' column from the n-gram table (in place) and keep it
# as an array of token lists, split on single spaces.
# The guard keeps the cell re-runnable: once the column has been removed a
# second run would otherwise fail with KeyError; `words` then simply keeps the
# value computed by the first run.
if 'TEXT_WORD' in X_ngram_u.columns:
    words = X_ngram_u.pop('TEXT_WORD').str.split(' ').values

## CV creation
### Topics

In [10]:
# One topic id per sample, stored as a NumPy array on disk.
with open('../datasets/used_data/03_bert_like_models/03_topics_plytical_polarization.npy', 'rb') as fh:
    topics = np.load(fh).tolist()

# Label frame extended with the topic of every row and a constant helper
# column 'n' (always 1). Built from a copy, so y_train_u itself is untouched.
y_train_u_topics = pd.DataFrame(y_train_u.copy()).assign(topic=topics, n=1)

### Kfold

In [11]:
# Topic-grouped cross-validation: the distinct topics are divided into
# `n_topic_folds` groups and each group in turn is held out as the test set,
# so no topic appears on both sides of a fold.
n_topic_folds = 10

cv_fold = []
cv_fold_i = []  # positional-index variant is not built; name kept for compatibility

unique_topics = y_train_u_topics['topic'].unique()

# np.array_split returns the same groups as reshape(n_topic_folds, -1) when the
# number of topics is a multiple of the fold count, and still works (with
# slightly uneven groups) when it is not, where reshape raises ValueError.
for topic_group in np.array_split(unique_topics, n_topic_folds):
    if topic_group.size == 0:
        # Fewer topics than folds: an empty group would give an empty test set.
        continue

    in_test = np.isin(y_train_u_topics["topic"], topic_group)

    # Folds are stored as index *labels* of X_train_u, not positions.
    train_cv = X_train_u.index[~in_test].values
    test_cv = X_train_u.index[in_test].values

    cv_fold.append([train_cv, test_cv])

In [12]:
# Plain shuffled 10-fold split used as the baseline against the topic-grouped
# folds. random_state is fixed so the folds are identical on every run; with
# shuffle=True and no random_state the split depends on NumPy's global RNG
# state, which this notebook never seeds.
kf = KFold(n_splits=10, shuffle=True, random_state=111)

cv_Kfold = []
cv_Kfold_i = []  # positional-index variant is not built; name kept for compatibility

for train_index, test_index in kf.split(X_train_u):
    # KFold yields positions; translate them to index labels so the folds have
    # the same form as the topic-based ones. Indexing the Index directly avoids
    # materialising a copy of the feature frame for every fold.
    train_cv = X_train_u.index[train_index].values
    test_cv = X_train_u.index[test_index].values

    cv_Kfold.append([train_cv, test_cv])

## Run experiments

In [13]:
def run_experiment(X, y, cv, clf_org, r_min=0.05):
    """Cross-validate a classifier, optionally selecting features per fold.

    For every ``(train_ids, test_ids)`` pair in ``cv`` a fresh clone of
    ``clf_org`` is fitted on the training rows and evaluated on the test rows.
    Rows are selected by index *label* (``index.isin``), not by position.

    When ``r_min`` is truthy, feature selection is done inside each fold using
    the training rows only: a column is kept when it is not constant and its
    Pearson correlation with the target is strictly greater than ``r_min``.
    Only positive correlations pass this test, so columns that are strongly
    *negatively* correlated with the target are discarded as well.
    NOTE(review): confirm this is intended; compare ``abs(r)`` otherwise.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary target with labels 0 and 1, sharing index labels with ``X``.
    cv : iterable of (train_ids, test_ids)
        Index labels of the training and test rows of every fold.
    clf_org : estimator
        Unfitted estimator; it is cloned for every fold and never fitted itself.
    r_min : float or None, default 0.05
        Correlation threshold. ``None`` (or any falsy value, including 0)
        disables feature selection and uses all columns.

    Returns
    -------
    dict
        ``"Accuracy"``, ``"F1 Score"`` and ``"Cols used"`` are arrays with one
        value per fold; ``"Precision"`` and ``"Recall"`` are scalar means over
        the folds; ``"Confusion matrix"`` is the 2x2 confusion matrix summed
        over all folds (rows: true label 0/1, columns: predicted label 0/1).
    """
    results = {
        'test_accuracy': [],
        'test_precision': [],
        'test_recall': [],
        'test_f1': [],
        'col_keep': [],
    }

    # Summed over folds. The label set is pinned when each fold's matrix is
    # computed, so a fold containing a single class cannot produce a 1x1
    # matrix that would silently broadcast into all four cells.
    c_matrix = np.zeros((2, 2))

    for train_cv, test_cv in cv:
        clf = sklearn_clone(clf_org)

        X_train_t = X[X.index.isin(train_cv)]
        y_train_t = y[y.index.isin(train_cv)]
        X_test_t = X[X.index.isin(test_cv)]
        y_test_t = y[y.index.isin(test_cv)]

        if r_min:
            # Keep only non-constant columns whose correlation with the target
            # exceeds r_min; computed on the training rows to avoid leakage.
            y_values = np.asarray(y_train_t)
            col_keep = []
            for c in X_train_t.columns:
                col_values = X_train_t[c].values

                # A constant column has no defined correlation; skip it.
                if col_values.min() < col_values.max():
                    r = scipy.stats.pearsonr(col_values, y_values)[0]
                    if not np.isnan(r) and r > r_min:
                        col_keep.append(c)

            if len(col_keep) == 0:
                print('No values returned')

            # The test rows get exactly the columns chosen on the training rows.
            X_train_t = X_train_t[col_keep]
            X_test_t = X_test_t[col_keep]
        else:
            col_keep = X_train_t.columns.values.tolist()

        clf.fit(X_train_t, y_train_t)
        y_pred = clf.predict(X_test_t)

        c_matrix += confusion_matrix(y_test_t, y_pred, labels=[0, 1])

        results['test_accuracy'].append(accuracy_score(y_test_t, y_pred))
        results['test_precision'].append(precision_score(y_test_t, y_pred))
        results['test_recall'].append(recall_score(y_test_t, y_pred))
        results['test_f1'].append(f1_score(y_test_t, y_pred))
        results['col_keep'].append(len(col_keep))

    metrics = {
        "Accuracy": np.array(results['test_accuracy']),
        "Precision": np.array(results['test_precision']).mean(),
        "Recall": np.array(results['test_recall']).mean(),
        "F1 Score": np.array(results['test_f1']),
        "Cols used": np.array(results['col_keep']),
        "Confusion matrix": c_matrix,
    }

    return metrics

## Topics Kfold

In [14]:
# Seed Python's built-in RNG. This affects the standard-library `random`
# module only; NumPy has its own global generator.
RANDOM_SEED = 111
random.seed(RANDOM_SEED)

In [17]:
# Hyper-parameters of the gradient-boosted tree classifier used by the
# experiment cells below.
params_xgb = dict(
    n_estimators=100,        # number of boosting rounds
    learning_rate=0.05,      # step size shrinkage to prevent overfitting
    max_depth=3,             # maximum depth of a tree
    min_child_weight=1,      # minimum sum of instance weight needed in a child
    subsample=0.9,           # fraction of samples used for fitting the trees
    colsample_bytree=0.9,    # fraction of features used for fitting the trees
    gamma=1,                 # minimum loss reduction required for a further split
    random_state=111,        # seed for reproducibility
    reg_alpha=0.1,           # L1 regularisation on leaf weights
    reg_lambda=0.1,          # L2 regularisation on leaf weights
)

clf_xgb = xgb.XGBClassifier(**params_xgb)

In [18]:
# Topic-grouped CV: evaluate every (classifier, feature set) combination.
# Each entry keeps the things that belong together in one place, so the
# feature table, its printed label and its threshold cannot get out of step.
classifiers = [
    (clf_xgb, 'xgb  '),
]

# (feature table, printed label, r_min passed to run_experiment)
feature_sets = [
    (X_ngram_u, 'ngrams  None', None),
    # (X_ngram_u, 'ngrams  0.01', 0.01),
    (X_ngram_u, 'ngrams  0.03', 0.03),
    # (X_ngram_u, 'ngrams  0.05', 0.05),
    (X_train_u, 'features    ', None),
    (X_pos_u, 'pos     None', None),
    # (X_pos_u, 'pos     0.01', 0.01),
    (X_pos_u, 'pos     0.03', 0.03),
    # (X_pos_u, 'pos     0.05', 0.05),
    (X_herbert_u, 'herbert     ', None),
]

for clf_used, clf_name in classifiers:
    for X_used, x_name, r_min in feature_sets:
        out = run_experiment(X_used, y_train_u, cv_fold, clf_used, r_min)

        accuracy = out["Accuracy"]
        f1 = out["F1 Score"]
        cols_used = out["Cols used"]

        # One line per feature set: mean +- std across the folds.
        print(
            x_name,
            clf_name,
            f'Accuracy {accuracy.mean():.2f}+-{accuracy.std():.2f}',
            f'F1 Score {f1.mean():.2f}+-{f1.std():.2f}',
            f'Cols used {cols_used.mean().round(0):.0f}+-{cols_used.std().round(0):.0f}',
        )

    # Blank line after each classifier's results.
    print()

ngrams  None xgb   Accuracy 0.79+-0.15 F1 Score 0.76+-0.17 Cols used 6535+-0
ngrams  0.03 xgb   Accuracy 0.75+-0.07 F1 Score 0.62+-0.14 Cols used 540+-63
features     xgb   Accuracy 0.69+-0.04 F1 Score 0.62+-0.11 Cols used 27+-0
pos     None xgb   Accuracy 0.61+-0.06 F1 Score 0.38+-0.09 Cols used 4998+-0
pos     0.03 xgb   Accuracy 0.60+-0.06 F1 Score 0.28+-0.07 Cols used 96+-23
herbert      xgb   Accuracy 0.90+-0.02 F1 Score 0.86+-0.07 Cols used 1024+-0



## Random Kfold

In [19]:
# Random (shuffled) K-fold CV: same grid of classifiers and feature sets as
# the topic-grouped run, only the fold definition (cv_Kfold) differs.
classifiers = [
    (clf_xgb, 'xgb  '),
]

# (feature table, printed label, r_min passed to run_experiment)
feature_sets = [
    (X_ngram_u, 'ngrams  None', None),
    # (X_ngram_u, 'ngrams  0.01', 0.01),
    (X_ngram_u, 'ngrams  0.03', 0.03),
    # (X_ngram_u, 'ngrams  0.05', 0.05),
    (X_train_u, 'features    ', None),
    (X_pos_u, 'pos     None', None),
    # (X_pos_u, 'pos     0.01', 0.01),
    (X_pos_u, 'pos     0.03', 0.03),
    # (X_pos_u, 'pos     0.05', 0.05),
    (X_herbert_u, 'herbert     ', None),
]

for clf_used, clf_name in classifiers:
    for X_used, x_name, r_min in feature_sets:
        out = run_experiment(X_used, y_train_u, cv_Kfold, clf_used, r_min)

        accuracy = out["Accuracy"]
        f1 = out["F1 Score"]
        cols_used = out["Cols used"]

        # One line per feature set: mean +- std across the folds.
        print(
            x_name,
            clf_name,
            f'Accuracy {accuracy.mean():.2f}+-{accuracy.std():.2f}',
            f'F1 Score {f1.mean():.2f}+-{f1.std():.2f}',
            f'Cols used {cols_used.mean().round(0):.0f}+-{cols_used.std().round(0):.0f}',
        )

    # Blank line after each classifier's results.
    print()

ngrams  None xgb   Accuracy 0.86+-0.01 F1 Score 0.83+-0.01 Cols used 6535+-0
ngrams  0.03 xgb   Accuracy 0.79+-0.01 F1 Score 0.68+-0.01 Cols used 532+-12
features     xgb   Accuracy 0.72+-0.01 F1 Score 0.68+-0.02 Cols used 27+-0
pos     None xgb   Accuracy 0.67+-0.01 F1 Score 0.43+-0.02 Cols used 4998+-0
pos     0.03 xgb   Accuracy 0.65+-0.01 F1 Score 0.31+-0.02 Cols used 82+-3
herbert      xgb   Accuracy 0.92+-0.01 F1 Score 0.90+-0.01 Cols used 1024+-0

