In [28]:
from polyglot.text import Text

import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=7,progress_bar=True)

from parallelbar import progress_map

from utils import *

from nltk import ngrams
import nltk

import scipy
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
import xgboost as xgb

from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    recall_score,
    precision_score,
    roc_auc_score, confusion_matrix, roc_curve, classification_report
)

from sentence_transformers import SentenceTransformer
# https://huggingface.co/sentence-transformers/LaBSE

from sklearn.decomposition import PCA

from sklearn import clone

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import random

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Read training data

In [38]:
X_emb = pd.read_hdf(r'../datasets/ready2use/pos_en_cz_embeddings.h5', key='stage', mode='r').iloc[19151:,:]

In [40]:
X_pos = pd.read_hdf('../datasets/ready2use/pos_en_cz_train.h5', key='stage', mode='r').iloc[19151:,:].reset_index(drop=True)
# header = pd.read_csv('../datasets/ready2use/pos_en_cz_train_header.csv', header=None, sep=';').values
# X_pos = pd.read_csv('../datasets/ready2use/pos_en_cz_train_data.csv', header=None, sep=';')
# X_pos.columns = header
y_train = X_pos['assestment']
X_pos = X_pos.drop('assestment', axis=1)

X_emb.index = X_pos.index

In [41]:
col_keep = []
for c in tqdm(X_pos.columns, position=0, leave=True):
    min_v = X_pos[c].values.min()
    max_v = X_pos[c].values.max()

    if min_v < max_v:
        r = scipy.stats.pearsonr(X_pos[c].values, y_train)[0]
        if ~np.isnan(r) and np.abs(r) > 0.03:
            col_keep.append(c)
            
len(col_keep)

100%|█████████████████████████████████████| 9219/9219 [00:01<00:00, 6481.96it/s]


108

In [42]:
y_train.value_counts()

1    15974
0     3340
Name: assestment, dtype: int64

### Load test data

In [43]:
X_emb_test = pd.read_csv('../datasets/ready2use/embeddings_pl_dataset.csv', sep=';', 
                         low_memory=False, header=None)

X_test = pd.read_csv('../datasets/ready2use/pos_pl_dataset.csv', sep=';')
y_test = X_test['assestment']
X_test = X_test.drop('assestment', axis=1)

X_emb_test.index = X_test.index

### Keep cols

In [44]:
col_keep_test = np.array(col_keep)[np.isin(col_keep, X_test.columns.values)]
col_keep_test.shape

(33,)

## Oversampling

In [46]:
n_0 = y_train.value_counts()[0]
n_1 = y_train.value_counts()[1]

n_lower = y_train.value_counts().min()
n_upper = y_train.value_counts().max()

np.random.seed(111)

# oversampling
if n_0 < n_1:
    index_0 = np.random.choice(y_train[y_train==0].index, n_1, replace=True)
    index_1 = np.random.choice(y_train[y_train==1].index, n_1, replace=False)
else:
    index_0 = np.random.choice(y_train[y_train==0].index, n_0, replace=False)
    index_1 = np.random.choice(y_train[y_train==1].index, n_0, replace=True)

# undersampling
# index_0 = np.random.choice(y_train[y_train==0].index, n_lower, replace=False)
# index_1 = np.random.choice(y_train[y_train==1].index, n_lower, replace=False)

y_train_u = y_train.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()

X_pos_u = X_pos.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()

In [49]:
# select columns from the undersamplet daatset
col_keep_u = []
for c in tqdm(X_pos_u.columns, position=0, leave=True):
    min_v = X_pos_u[c].values.min()
    max_v = X_pos_u[c].values.max()

    if min_v < max_v:
        r = scipy.stats.pearsonr(X_pos_u[c].values, y_train_u)[0]
        if ~np.isnan(r) and np.abs(r) > 0.03:
            col_keep_u.append(c)
            
print('Train', len(col_keep_u))

col_keep_test_u = np.array(col_keep_u)[np.isin(col_keep_u, X_test.columns.values)]
print('Test', col_keep_test_u.shape)

100%|█████████████████████████████████████| 9219/9219 [00:01<00:00, 5589.85it/s]


Train 207
Test (84,)


In [50]:
X_emb_u = X_emb.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()

In [51]:
pca = PCA(n_components=100)
pca.fit(X_emb_u)

X_emb_pca = pca.transform(X_emb_u)
X_emb_test_pca = pca.transform(X_emb_test)

np.cumsum( pca.explained_variance_ )[-1]

0.4529606

In [54]:
y_train_u.value_counts()

1    15974
0    15974
Name: assestment, dtype: int64

## Cross validation of models

In [55]:
random.seed(111)

In [56]:
k = int( np.sqrt(y_train_u.shape[0]) )
k

178

In [58]:
import warnings
warnings.filterwarnings("ignore")

scoring = ['accuracy', 
           'precision', 'recall', 
           'f1']

clf_lr = LogisticRegression(max_iter=5000, C=1, penalty='l2', solver='liblinear')
clf_gnb = GaussianNB()
clf_knn = KNeighborsClassifier(n_neighbors=k)
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=111)
clf_svm = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', verbosity=0, silent=True, n_jobs=-1)

for c, n in zip(
    [clf_lr, 
     clf_gnb, 
     clf_knn,
     clf_rf, 
     clf_svm,
     clf_xgb, 
    ], 
    ['log_regC1', 
     'gauss_nb ', 
     f'knn_{k}  ', 
     'rand_frst',
     'svm_gamma', 
     'xgboost  ', 
    ]
):
    results = cross_validate(estimator=c,
                           X=X_pos_u[col_keep_test_u],
                           y=y_train_u,
                           cv=5,
                           scoring=scoring,
                           return_train_score=True)
    print(
        n,
        f'Accuracy {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f}',
#         f'Precision {results["test_precision"].mean():.3f}',
#         f'Recall {results["test_recall"].mean():.3f}',
        f'F1 Score {results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}',
        f'          {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f} |',
        f'{results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}'
    )

log_regC1 Accuracy 0.572+-0.003 F1 Score 0.599+-0.008           0.572+-0.003 | 0.599+-0.008
gauss_nb  Accuracy 0.538+-0.005 F1 Score 0.220+-0.004           0.538+-0.005 | 0.220+-0.004
knn_178   Accuracy 0.562+-0.008 F1 Score 0.598+-0.005           0.562+-0.008 | 0.598+-0.005
rand_frst Accuracy 0.579+-0.004 F1 Score 0.619+-0.009           0.579+-0.004 | 0.619+-0.009
svm_gamma Accuracy 0.687+-0.011 F1 Score 0.709+-0.006           0.687+-0.011 | 0.709+-0.006
xgboost   Accuracy 0.821+-0.011 F1 Score 0.826+-0.009           0.821+-0.011 | 0.826+-0.009


In [60]:
for c, n in zip(
    [clf_lr, 
     clf_gnb, 
     clf_knn,
     clf_rf, 
     clf_svm,
     clf_xgb, 
    ], 
    ['log_regC1', 
     'gauss_nb ', 
     f'knn_{k}  ', 
     'rand_frst',
     'svm_gamma', 
     'xgboost  ', 
    ]
):
    results = cross_validate(estimator=c,
                           X=X_emb_pca,
                           y=y_train_u,
                           cv=5,
                           scoring=scoring,
                           return_train_score=True)
    print(
        n,
        f'Accuracy {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f}',
#         f'Precision {results["test_precision"].mean():.3f}',
#         f'Recall {results["test_recall"].mean():.3f}',
        f'F1 Score {results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}',
        f'          {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f} |',
        f'{results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}'
    )

log_regC1 Accuracy 0.615+-0.005 F1 Score 0.614+-0.010           0.615+-0.005 | 0.614+-0.010
gauss_nb  Accuracy 0.610+-0.011 F1 Score 0.615+-0.010           0.610+-0.011 | 0.615+-0.010
knn_178   Accuracy 0.622+-0.004 F1 Score 0.628+-0.009           0.622+-0.004 | 0.628+-0.009
rand_frst Accuracy 0.631+-0.006 F1 Score 0.642+-0.017           0.631+-0.006 | 0.642+-0.017
svm_gamma Accuracy 0.966+-0.002 F1 Score 0.967+-0.001           0.966+-0.002 | 0.967+-0.001
xgboost   Accuracy 0.987+-0.001 F1 Score 0.987+-0.001           0.987+-0.001 | 0.987+-0.001


In [61]:
clf_v = VotingClassifier(estimators=[
    ('lr', clf_lr), 
    ('gnb', clf_gnb), 
    ('knn', clf_knn), 
    ('rf', clf_rf),
    ('svm', clf_svm),
    ('xgb', clf_xgb)
], voting='hard')

results = cross_validate(estimator=clf_v,
                       X=X_pos_u[col_keep_test_u],
                       y=y_train_u,
                       cv=5,
                       scoring=scoring,
                       return_train_score=True)
print(
    'voting',
    f'Accuracy {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f}',
    f'F1 Score {results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}',
    f'          {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f} |',
    f'{results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}'
)

voting Accuracy 0.663+-0.009 F1 Score 0.647+-0.010           0.663+-0.009 | 0.647+-0.010


In [62]:
results = cross_validate(estimator=clf_v,
                       X=X_emb_pca,
                       y=y_train_u,
                       cv=5,
                       scoring=scoring,
                       return_train_score=True)

print(
    'voting',
    f'Accuracy {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f}',
    f'F1 Score {results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}',
    f'          {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f} |',
    f'{results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}'
)

voting Accuracy 0.753+-0.006 F1 Score 0.742+-0.010           0.753+-0.006 | 0.742+-0.010


## Test on PL data

## Voting

In [63]:
clf_out = clone(clf_xgb)
clf_emb = clone(clf_xgb)

### train -> test

In [64]:
clf_out.fit(X_pos_u[col_keep_test_u], y_train_u)

In [65]:
clf_emb.fit(X_emb_pca, y_train_u)

In [66]:
y_pred = clf_out.predict(X_test[col_keep_test_u])
y_pred_e = clf_emb.predict(X_emb_test_pca)

In [67]:
confusion_matrix(y_test, y_pred)

array([[ 173, 3265],
       [ 207, 2896]])

In [68]:
confusion_matrix(y_test, y_pred_e)

array([[ 194, 3244],
       [ 242, 2861]])

In [69]:
print(
    n, '\n\t',
    f'Accuracy  {accuracy_score(y_test, y_pred):.3f}\n\t',
    f'Precision {precision_score(y_test, y_pred):.3f}\n\t',
    f'Recall    {recall_score(y_test, y_pred):.3f}\n\t',
    f'F1 Score  {f1_score(y_test, y_pred):.3f}\n\t',
)

xgboost   
	 Accuracy  0.469
	 Precision 0.470
	 Recall    0.933
	 F1 Score  0.625
	


In [70]:
print(
    n, '\n\t',
    f'Accuracy  {accuracy_score(y_test, y_pred_e):.3f}\n\t',
    f'Precision {precision_score(y_test, y_pred_e):.3f}\n\t',
    f'Recall    {recall_score(y_test, y_pred_e):.3f}\n\t',
    f'F1 Score  {f1_score(y_test, y_pred_e):.3f}\n\t',
)

xgboost   
	 Accuracy  0.467
	 Precision 0.469
	 Recall    0.922
	 F1 Score  0.621
	
