In [27]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import spacy

import gensim
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    recall_score,
    precision_score,
    roc_auc_score, confusion_matrix, roc_curve, classification_report
)

import nltk

import scipy
from sklearn.model_selection import cross_validate


from tqdm import tqdm
tqdm.pandas()


from sklearn.base import clone as sklearn_clone
from sklearn.model_selection import KFold


import random

In [28]:
UNDERSAMPLING = True

## Load data

In [29]:
df_styl = pd.read_parquet('../datasets/used_data/02_classical_ml/02_01_benchmark_styllometric_features.parquet')
df_pos = pd.read_parquet('../datasets/used_data/02_classical_ml/02_02_benchmark_POS_ngrams.parquet')
df_ngram = pd.read_parquet('../datasets/used_data/02_classical_ml/02_03_benchmark_words_ngrams.parquet')

In [30]:
print(df_styl.shape, df_pos.shape, df_ngram.shape)

(2409, 28) (2409, 4650) (2409, 1174)


In [31]:
y_train = df_styl['assestment']

In [32]:
df_styl.pop('assestment');
df_pos.pop('assestment');
df_ngram.pop('assestment');

## Make balanced datasets

In [33]:
n_0 = y_train.value_counts()[0]
n_1 = y_train.value_counts()[1]

In [34]:
n_lower = y_train.value_counts().min()
n_upper = y_train.value_counts().max()

np.random.seed(111)

if UNDERSAMPLING:
    # undersampling    
    index_0 = np.random.choice(y_train[y_train==0].index, n_lower, replace=False)
    index_1 = np.random.choice(y_train[y_train==1].index, n_lower, replace=False)

    y_train_u = y_train.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()
    
    X_train_u = df_styl.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()
    X_pos_u = df_pos.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()
    X_ngram_u = df_ngram.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()
else:
    # oversampling
    if n_0 < n_1:
        index_0 = np.random.choice(y_train[y_train==0].index, n_1, replace=True)
        index_1 = np.random.choice(y_train[y_train==1].index, n_1, replace=False)
    else:
        index_0 = np.random.choice(y_train[y_train==0].index, n_0, replace=False)
        index_1 = np.random.choice(y_train[y_train==1].index, n_0, replace=True)

    y_train_u = y_train.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()
    
    X_train_u = df_styl.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()
    X_pos_u = df_pos.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()
    X_ngram_u = df_ngram.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()

## CV creation
### LDA

In [25]:
ideal_topic_num = 10

In [37]:
words = X_ngram_u['TEXT_WORD'].str.split(' ').values

In [57]:
X_ngram_u.pop('TEXT_WORD');

In [39]:
dictionary = gensim.corpora.Dictionary(words)

bow_corpus = [dictionary.doc2bow(doc) for doc in words]


lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = ideal_topic_num, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   random_state=111,
                                   workers = 7)

topics = []

for line in tqdm(words):
    line_bow = dictionary.doc2bow(line)
    doc_lda = lda_model[line_bow]
    
    topics.append( max(doc_lda, key=lambda x:x[1])[0] )

# X_train_u['topic'] = topics

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1296/1296 [00:00<00:00, 9512.89it/s]


In [40]:
x = lda_model.show_topics(num_topics=ideal_topic_num, num_words=5)

for topic,word in x:
    print(topic, word)

0 0.020*"milion" + 0.016*"być" + 0.013*"polska" + 0.008*"rok" + 0.007*"budżet"
1 0.016*"być" + 0.009*"rok" + 0.009*"prezydent" + 0.007*"kraj" + 0.007*"europa"
2 0.022*"miliard" + 0.021*"polska" + 0.011*"pkb" + 0.010*"rok" + 0.010*"wzrost"
3 0.012*"polska" + 0.010*"miejsce" + 0.009*"dziecko" + 0.007*"europa" + 0.006*"milion"
4 0.009*"polski" + 0.008*"kaczyński" + 0.008*"rok" + 0.007*"ustawa" + 0.007*"rząd"
5 0.010*"the" + 0.009*"kraj" + 0.008*"polska" + 0.007*"osoba" + 0.007*"europejski"
6 0.015*"rok" + 0.009*"polska" + 0.008*"europejski" + 0.007*"rząd" + 0.007*"procent"
7 0.022*"być" + 0.012*"rok" + 0.012*"procent" + 0.009*"milion" + 0.007*"złoty"
8 0.013*"rok" + 0.013*"ustawa" + 0.012*"milion" + 0.011*"polska" + 0.009*"projekt"
9 0.016*"europejski" + 0.016*"unia" + 0.013*"tysiąc" + 0.010*"polska" + 0.009*"państwo"


In [42]:
y_train_u_topics = pd.DataFrame(y_train_u.copy())
y_train_u_topics['topic'] = topics
y_train_u_topics['n'] = 1
y_train_u_topics.groupby(['topic', 'assestment']).sum().reset_index().pivot(index='topic',columns='assestment',values='n')

assestment,0,1
topic,Unnamed: 1_level_1,Unnamed: 2_level_1
0,60,79
1,64,49
2,89,80
3,42,41
4,70,54
5,50,66
6,73,64
7,60,74
8,94,82
9,46,59


### Kfold

In [44]:
cv_fold = []
cv_fold_i = []

for i in y_train_u_topics['topic'].unique().reshape(10,-1):
    train_cv = X_train_u.index[ ~np.isin(y_train_u_topics["topic"], i) ].values
    test_cv = X_train_u.index[ np.isin(y_train_u_topics["topic"], i) ].values
    
    # train_cv_i = X_train_u.reset_index().index[ ~np.isin(X_train_u["topic"], i) ].values
    # test_cv_i = X_train_u.reset_index().index[ np.isin(X_train_u["topic"], i) ].values
    
    cv_fold.append( [train_cv, test_cv])
    # cv_fold_i.append( [train_cv_i, test_cv_i])

In [45]:
kf = KFold(n_splits=10, shuffle=True)
kf.get_n_splits(X_train_u)

cv_Kfold = []
cv_Kfold_i = []

for train_index, test_index in kf.split(X_train_u):
    train_cv = X_train_u.iloc[ train_index, : ].index.values
    test_cv = X_train_u.iloc[ test_index, : ].index.values

    # train_cv_i= X_train_u.reset_index().iloc[ train_index, : ].index.values
    # test_cv_i = X_train_u.reset_index().iloc[ test_index, : ].index.values
    
    cv_Kfold.append( [train_cv, test_cv])
    # cv_Kfold_i.append( [train_cv_i, test_cv_i])

## Run experiments

In [73]:
def run_experiment(X, y, cv, clf_org, r_min=0.05):

    results = {
        'test_accuracy' : [],
        'test_precision' : [],
        'test_recall' : [],
        'test_f1' : []
    }

    c_matrix = np.zeros((2,2))

    for train_cv, test_cv in cv:
        clf = sklearn_clone(clf_org)
        
        X_train_t = X[X.index.isin(train_cv)]
        y_train_t = y[y.index.isin(train_cv)]

        # keep only columns with corr > 0.05
        if r_min>0:
            col_keep = []
            for c in X_train_t.columns:
                min_v =X_train_t[c].values.min()
                max_v = X_train_t[c].values.max()
    
                if min_v < max_v:
                    r = scipy.stats.pearsonr(X_train_t[c].values, y_train_t)[0]
                    if ~np.isnan(r) and r > r_min:
                        col_keep.append(c)
            
            if len(col_keep) == 0:
                print('No values returned')
        
            X_train_t = X_train_t[col_keep]


        X_test_t = X[X.index.isin(test_cv)]
        y_test_t = y[y.index.isin(test_cv)]
        
        if r_min>0:
            X_test_t = X_test_t[col_keep]

        clf.fit(X_train_t, y_train_t)

        y_pred = clf.predict(X_test_t)

        confusion = confusion_matrix(y_test_t, y_pred)
        c_matrix += confusion

    #     TN, FP = confusion[0, 0], confusion[0, 1]
    #     FN, TP = confusion[1, 0], confusion[1, 1]

        results['test_accuracy'].append( accuracy_score(y_test_t, y_pred) ) 
        results['test_precision'].append( precision_score(y_test_t, y_pred) ) 
        results['test_recall'].append( recall_score(y_test_t, y_pred) ) 
        results['test_f1'].append( f1_score(y_test_t, y_pred) ) 

    metrics = {
        "Accuracy": np.array(results['test_accuracy']),
    #     "Precision": np.array(results['test_precision']).mean(),
    #     "Recall": np.array(results['test_recall']).mean(),
        "F1 Score":  np.array(results['test_f1']),
        }

#     print(c_matrix)

    return metrics

## Topics Kfold

In [67]:
random.seed(111)

In [68]:
clf_lr_1 = LogisticRegression(max_iter=5000, C=1, penalty='l2', solver='liblinear')
clf_lr_01 = LogisticRegression(max_iter=5000, C=0.1, penalty='l2', solver='liblinear')
clf_rf = RandomForestClassifier(random_state=111, max_depth=5)
clf_xgb = xgb.XGBClassifier(objective='binary:logistic')

In [69]:
for clf_used, clf_name in zip(
    [clf_lr_1, clf_rf, clf_xgb],['lr C1', 'rf d5', 'xgb  ']
):
        
    for X_used, x_name in zip(
        [X_ngram_u, X_train_u, X_pos_u],
        ['ngrams  ', 'features', 'pos     ']
    ):
        out = run_experiment(X_used, y_train_u, cv_fold, clf_used, 0.05)
        print(
            x_name, 
            clf_name,
            f'Accuracy {out["Accuracy"].mean():.3f}+-{out["Accuracy"].std():.3f}',
            f'F1 Score {out["F1 Score"].mean():.3f}+-{out["F1 Score"].std():.3f}',
            f' {out["Accuracy"].mean():.3f}+-{out["Accuracy"].std():.3f} | {out["F1 Score"].mean():.3f}+-{out["F1 Score"].std():.3f}'
        )
        
    print()

ngrams   lr C1 Accuracy 0.540+-0.038 F1 Score 0.432+-0.082  0.540+-0.038 | 0.432+-0.082
features lr C1 Accuracy 0.509+-0.042 F1 Score 0.512+-0.054  0.509+-0.042 | 0.512+-0.054
pos      lr C1 Accuracy 0.544+-0.051 F1 Score 0.526+-0.048  0.544+-0.051 | 0.526+-0.048

ngrams   rf d5 Accuracy 0.522+-0.039 F1 Score 0.379+-0.122  0.522+-0.039 | 0.379+-0.122
features rf d5 Accuracy 0.466+-0.035 F1 Score 0.492+-0.055  0.466+-0.035 | 0.492+-0.055
pos      rf d5 Accuracy 0.528+-0.044 F1 Score 0.452+-0.037  0.528+-0.044 | 0.452+-0.037

ngrams   xgb   Accuracy 0.516+-0.025 F1 Score 0.384+-0.108  0.516+-0.025 | 0.384+-0.108
features xgb   Accuracy 0.479+-0.039 F1 Score 0.474+-0.039  0.479+-0.039 | 0.474+-0.039
pos      xgb   Accuracy 0.494+-0.052 F1 Score 0.454+-0.048  0.494+-0.052 | 0.454+-0.048



## Random Kfold

In [70]:
for clf_used, clf_name in zip(
    [clf_lr_1, clf_rf, clf_xgb],['lr C1', 'rf d5', 'xgb  ']
):
        
        
    for X_used, x_name in zip(
        [X_ngram_u, X_train_u, X_pos_u],
        ['ngrams  ', 'features', 'pos     ']
    ):
        out = run_experiment(X_used, y_train_u, cv_Kfold, clf_used, 0.03)
        print(
            x_name, 
            clf_name,
            f'Accuracy {out["Accuracy"].mean():.3f}+-{out["Accuracy"].std():.3f}',
            f'F1 Score {out["F1 Score"].mean():.3f}+-{out["F1 Score"].std():.3f}',
            f' {out["Accuracy"].mean():.3f}+-{out["Accuracy"].std():.3f} | {out["F1 Score"].mean():.3f}+-{out["F1 Score"].std():.3f}'
        )

    print()

ngrams   lr C1 Accuracy 0.560+-0.045 F1 Score 0.488+-0.055  0.560+-0.045 | 0.488+-0.055
features lr C1 Accuracy 0.524+-0.052 F1 Score 0.519+-0.059  0.524+-0.052 | 0.519+-0.059
pos      lr C1 Accuracy 0.532+-0.037 F1 Score 0.508+-0.047  0.532+-0.037 | 0.508+-0.047

ngrams   rf d5 Accuracy 0.540+-0.030 F1 Score 0.438+-0.048  0.540+-0.030 | 0.438+-0.048
features rf d5 Accuracy 0.508+-0.069 F1 Score 0.528+-0.057  0.508+-0.069 | 0.528+-0.057
pos      rf d5 Accuracy 0.532+-0.045 F1 Score 0.443+-0.047  0.532+-0.045 | 0.443+-0.047

ngrams   xgb   Accuracy 0.511+-0.051 F1 Score 0.463+-0.060  0.511+-0.051 | 0.463+-0.060
features xgb   Accuracy 0.505+-0.049 F1 Score 0.498+-0.039  0.505+-0.049 | 0.498+-0.039
pos      xgb   Accuracy 0.531+-0.035 F1 Score 0.496+-0.047  0.531+-0.035 | 0.496+-0.047

