In [1]:
from polyglot.text import Text

import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=7,progress_bar=True)

from parallelbar import progress_map

from utils import *

from nltk import ngrams
import nltk

import scipy
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
import xgboost as xgb

from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    recall_score,
    precision_score,
    roc_auc_score, confusion_matrix, roc_curve, classification_report
)

from sentence_transformers import SentenceTransformer
# https://huggingface.co/sentence-transformers/LaBSE

from sklearn.decomposition import PCA

from sklearn import clone

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import random

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.




## POS CZ data

In [2]:
def extract_features_cz(txt):
    
    doc = Text(txt, hint_language_code='cs')

    txt_pos = []
    
    for token in doc.pos_tags:
        if token[1] not in ['SPACE', 'PUNCT']:
            txt_pos.append(token[1])            
    
    return txt_pos

In [3]:
if False:
    df_cz_cz = pd.read_csv('../datasets/demagog_nlp_cz/converted-exp-CZ.tsv', sep='\t')

    df_cz_cz['text_clean'] = df_cz_cz['statementText'].apply(lambda x: clean_przyp(x))

    tasks = df_cz_cz['text_clean'].values.tolist()
    result = progress_map(extract_features_cz, tasks, n_cpu=7, chunk_size=1, core_progress=True)
    
    df_cz_cz['TEXT_POS'] = result

    df_cz_cz['TEXT_POS'] = df_cz_cz['TEXT_POS'].str.join(" ")
    
    df_cz_cz.to_csv('../datasets/ready2use/fake_news_features_cz_CZ.csv', sep=';', index=False, encoding='utf8')
else:
    df_cz_cz = pd.read_csv('../datasets/ready2use/fake_news_features_cz_CZ.csv', sep=';')

In [4]:
df_cz_cz.shape

(9082, 10)

### The same code for SK data

In [5]:
if False:
    df_cz_sk = pd.read_csv('../datasets/demagog_nlp_cz/converted-exp-SK.tsv', sep='\t')

    df_cz_sk['text_clean'] = df_cz_sk['statementText'].apply(lambda x: clean_przyp(x))

    tasks = df_cz_sk['text_clean'].values.tolist()
    result = progress_map(extract_features_cz, tasks, n_cpu=7, chunk_size=1, core_progress=True)
    
    df_cz_sk['TEXT_POS'] = result

    df_cz_sk['TEXT_POS'] = df_cz_sk['TEXT_POS'].str.join(" ")
    
    df_cz_sk.to_csv('../datasets/ready2use/fake_news_features_cz_SK.csv', sep=';', index=False, encoding='utf8')
else:
    df_cz_sk = pd.read_csv('../datasets/ready2use/fake_news_features_cz_SK.csv', sep=';')

In [6]:
df_cz_sk.shape

(12554, 10)

In [7]:
df_cz = pd.concat([df_cz_sk, df_cz_sk])
df_cz.shape

(25108, 10)

In [8]:
df_cz['statementState'] = df_cz['statementState'].str.strip()

df_cz = df_cz[ df_cz['statementState'] != 'MISLEADING' ]
df_cz = df_cz[ df_cz['statementState'] != 'UNVERIFIABLE' ]
df_cz = df_cz[ df_cz['statementState'] != 'null' ]

df_cz = df_cz.reset_index(drop=True)

df_cz['assestment'] = df_cz['statementState'].replace({
    'FALSE' : 0,
#     'Manipulacja' : 1,
    'TRUE' : 1
}).astype(int)

In [9]:
df_cz['assestment'].value_counts()

1    15974
0     3340
Name: assestment, dtype: int64

## POS ENG data

In [10]:
nlp_core_en = spacy.load("en_core_web_lg")
def extract_features_en(txt, nlp_core=nlp_core_en):
    
    doc = nlp_core(txt)
    
    txt_pos = []
    for token in doc:
        if token.pos_ not in ['SPACE', 'PUNCT']:
            txt_pos.append(token.pos_)            
    
    return txt_pos

In [11]:
df_en = pd.read_csv('../datasets/politifact/politifact.csv', sep=',', index_col=0)

df_en.loc[:, 'fact'] = df_en['fact'].replace({
    'half-true' : 'true',
    'mostly-true' : 'true',
    'barely-true' : 'false',
    'pants-fire' : 'false',
})

df_en = df_en[df_en['fact'].isin(['true', 'false'])]

In [12]:
df_en.shape

(19151, 11)

In [13]:
df_en = df_en[['sources_quote', 'fact']]
df_en.columns  = [
    'statement',
    'label'
]

df_en['text_clean'] = df_en['statement'].apply(lambda x: clean_przyp(x))

tasks = df_en['text_clean'].values.tolist()
result = progress_map(extract_features_en, tasks, n_cpu=7, chunk_size=1, core_progress=True)

df_en['TEXT_POS'] = result

df_en['TEXT_POS'] = df_en['TEXT_POS'].str.join(" ")

df_en['assestment'] = df_en['label'].replace({
    'false' : 0,
#     'Manipulacja' : 1,
    'true' : 1
}).astype(int)

Core 1:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 2:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 3:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 4:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 5:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 6:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 7:   0%|          | 0/2736 [00:00<?, ?it/s]

In [14]:
# df_en = pd.concat(
# [
#     pd.read_csv('../datasets/liar/train.tsv', sep='\t', header=None),
#     pd.read_csv('../datasets/liar/valid.tsv', sep='\t', header=None),
#     pd.read_csv('../datasets/liar/test.tsv', sep='\t', header=None)
# ]
# ).iloc[:,:3]

# df_en.columns = [
#     'id',
#     'label',
#     'statement'
# ]

# df_en.loc[:, 'label'] = df_en['label'].replace({
#     'half-true' : 'true',
#     'mostly-true' : 'true',
#     'barely-true' : 'false',
#     'pants-fire' : 'false',
# })

## Use EN/CZ data as training

In [15]:
df_en.columns

Index(['statement', 'label', 'text_clean', 'TEXT_POS', 'assestment'], dtype='object')

In [16]:
df_cz.columns

Index(['politicianID', 'name', 'party', 'statementID', 'statementText',
       'statementState', 'statementExplanClean', 'statementExplan    ',
       'text_clean', 'TEXT_POS', 'assestment'],
      dtype='object')

In [17]:
df_all = pd.concat([df_en[['assestment', 'TEXT_POS', 'text_clean']], 
                    df_cz[['assestment', 'TEXT_POS', 'text_clean']]])

In [18]:
df_all['assestment'].value_counts()

1    25033
0    13432
Name: assestment, dtype: int64

In [19]:
y_train = df_all.copy()['assestment']
X_train = df_all.copy()[['TEXT_POS']]

In [20]:
n_grams = 5
min_pos = 5 

In [25]:
X_pos = X_train[['TEXT_POS']].copy()

words =  sum(X_pos['TEXT_POS'].str.split(' ').values.tolist(), [])

n_list = []
for n in range(n_grams):
    n_i = pd.Series(nltk.ngrams(words, n+1)).value_counts()
    n_i = n_i[n_i>min_pos]
    n_list.append(n_i)

n_iterator = []
for n_i in n_list:
    n_iterator += n_i.index.tolist()

In [27]:
def get_pos_features_para(n):
    x = X_pos['TEXT_POS'].str.count(' '.join(n)) / X_pos['TEXT_POS'].str.split(' ').str.len()
    min_v = x.min()
    max_v = x.max()

    if min_v < max_v:
        r = scipy.stats.pearsonr(x, y_train)[0]
        if ~np.isnan(r) and np.abs(r) > 0.01: 
            return x
    
    return None

result = progress_map(get_pos_features_para, n_iterator, n_cpu=7, chunk_size=1, core_progress=True)

Core 1:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 2:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 3:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 4:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 5:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 6:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 7:   0%|          | 0/3911 [00:00<?, ?it/s]

In [28]:
col = {}
    
for x, n in tqdm(zip(result, n_iterator)):
    if x is not None:
        col[' '.join(n)] = x
        col[' '.join(n)].name = ' '.join(n)








27373it [00:00, 880686.41it/s][A[A[A


In [29]:
X_pos = pd.concat( [X_pos] + list( col.values() ), axis=1 ).drop('TEXT_POS', axis=1)

In [30]:
y_train.reset_index(drop=True, inplace=True)
X_pos.reset_index(drop=True, inplace=True)

In [31]:
out_pos = X_pos.join(y_train)

In [32]:
X_pos.shape

(38465, 9219)

In [33]:
out_pos.to_hdf(r'../datasets/ready2use/pos_en_cz_train.h5', key='stage', mode='w')

## Create sentences embeddings

In [26]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
#'sentence-transformers/distiluse-base-multilingual-cased-v2')
#'sentence-transformers/LaBSE')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [27]:
def get_emebeddings(x):
    embedding = model.encode(x)
    return embedding

In [29]:
result = [ get_emebeddings(x) for x in tqdm(df_all['text_clean'].values, position=0, leave=True) ]

100%|█████████████████████████████████████| 38465/38465 [09:06<00:00, 70.37it/s]


In [30]:
# result = progress_map(get_emebeddings, df_all['text_clean'].values, 
#                       n_cpu=7, chunk_size=1, core_progress=True)

In [42]:
df_emb = df_all[['text_clean']].reset_index(drop=True)
df_emb['emb'] = result #f_all['text_clean'].progress_apply(lambda x: get_emebeddings(x))
# [ get_emebeddings(x) for x in tqdm(df_all['text_clean'].values, position=0, leave=True) ]

In [43]:
df_emb_full = pd.DataFrame(df_emb['emb'].tolist(), index= df_emb.index)

In [44]:
df_emb_full.to_hdf(r'../datasets/ready2use/pos_en_cz_embeddings_2.h5', key='stage', mode='w')

In [35]:
# np.savetxt('../datasets/ready2use/pos_en_cz_train_header.csv', out_pos.columns.values, delimiter=";", fmt="%s")
# np.savetxt('../datasets/ready2use/pos_en_cz_train_data.csv', out_pos.values, delimiter=";")

### Read training data

In [46]:
X_emb = pd.read_hdf(r'../datasets/ready2use/pos_en_cz_embeddings_2.h5', key='stage', mode='r')

In [47]:
X_pos = pd.read_hdf('../datasets/ready2use/pos_en_cz_train.h5', key='stage', mode='r')
# header = pd.read_csv('../datasets/ready2use/pos_en_cz_train_header.csv', header=None, sep=';').values
# X_pos = pd.read_csv('../datasets/ready2use/pos_en_cz_train_data.csv', header=None, sep=';')
# X_pos.columns = header
y_train = X_pos['assestment']
X_pos = X_pos.drop('assestment', axis=1)

X_emb.index = X_pos.index

In [48]:
# col_keep = []
# for c in tqdm(X_pos.columns, position=0, leave=True):
#     min_v = X_pos[c].values.min()
#     max_v = X_pos[c].values.max()

#     if min_v < max_v:
#         r = scipy.stats.pearsonr(X_pos[c].values, y_train)[0]
#         if ~np.isnan(r) and np.abs(r) > 0.1:
#             col_keep.append(c)
            
# len(col_keep)

### Load test data

In [53]:
if True:
    df_test_emb = pd.read_csv('../datasets/ready2use/text_celan_pl_dataset.csv', sep=';', header=None)
    df_test_emb.columns = ['index', 'text_clean', 'label']

    result = [ get_emebeddings(x) for x in tqdm(df_test_emb['text_clean'].values, position=0, leave=True) ]
    
    X_emb_test = pd.DataFrame(result, index = df_test_emb.index)
    
    X_emb_test.to_csv('../datasets/ready2use/embeddings_pl_dataset_2.csv', sep=';', header=None, index=False)
    
    y_test = df_test_emb['label'].values

100%|███████████████████████████████████████| 6542/6542 [01:17<00:00, 84.28it/s]


In [54]:
# X_emb_test = pd.read_csv('../datasets/ready2use/embeddings_pl_dataset_2.csv', sep=';', 
#                          low_memory=False, header=None)

# X_test = pd.read_csv('../datasets/ready2use/pos_pl_dataset.csv', sep=';')
# y_test = X_test['assestment']
# X_test = X_test.drop('assestment', axis=1)

# # X_emb_test.index = X_test.index

### Keep cols

In [55]:
# col_keep_test = np.array(col_keep)[np.isin(col_keep, X_test.columns.values)]
# col_keep_test.shape

## Undersampling

In [56]:
n_lower = y_train.value_counts().min()
n_upper = y_train.value_counts().max()

np.random.seed(111)

# undersampling    
index_0 = np.random.choice(y_train[y_train==0].index, n_lower, replace=False)
index_1 = np.random.choice(y_train[y_train==1].index, n_lower, replace=False)

y_train_u = y_train.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()

# X_pos_u = X_pos.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()

# # select columns from the undersamplet daatset
# col_keep_u = []
# for c in tqdm(X_pos_u.columns, position=0, leave=True):
#     min_v = X_pos_u[c].values.min()
#     max_v = X_pos_u[c].values.max()

#     if min_v < max_v:
#         r = scipy.stats.pearsonr(X_pos_u[c].values, y_train_u)[0]
#         if ~np.isnan(r) and np.abs(r) > 0.1:
#             col_keep_u.append(c)
            
# print('Train', len(col_keep_u))

# col_keep_test_u = np.array(col_keep_u)[np.isin(col_keep_u, X_test.columns.values)]
# print('Test', col_keep_test_u.shape)

In [57]:
X_emb_u = X_emb.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()

In [69]:
pca = PCA(n_components=100)
pca.fit(X_emb_u)

X_emb_pca = pca.transform(X_emb_u)
X_emb_test_pca = pca.transform(X_emb_test)

np.cumsum( pca.explained_variance_ )[-1]

0.564135

### Can create weights of dataset

In [70]:
y_train.value_counts()

1    25033
0    13432
Name: assestment, dtype: int64

In [71]:
y_train.shape

(38465,)

In [72]:
y_train_u.shape

(26864,)

In [73]:
class_weight = {
    1 : 1,
    0 : 1 # n_upper/n_lower
}

## Cross validation of models

In [74]:
random.seed(111)

In [75]:
k = int( np.sqrt(y_train_u.shape[0]) )
k

163

In [76]:
import warnings
warnings.filterwarnings("ignore")

scoring = ['accuracy', 
           'precision', 'recall', 
           'f1']

clf_lr = LogisticRegression(max_iter=5000, C=1, penalty='l2', solver='liblinear', 
                            class_weight=class_weight)
clf_gnb = GaussianNB()
clf_knn = KNeighborsClassifier(n_neighbors=k)
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=111)
clf_svm = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', verbosity=0, silent=True, n_jobs=-1)

# for c, n in zip(
#     [clf_lr, 
#      clf_gnb, 
#      clf_knn,
#      clf_rf, 
#      clf_svm,
#      clf_xgb, 
#     ], 
#     ['log_regC1', 
#      'gauss_nb ', 
#      f'knn_{k}  ', 
#      'rand_frst',
#      'svm_gamma', 
#      'xgboost  ', 
#     ]
# ):
#     results = cross_validate(estimator=c,
#                            X=X_pos_u[col_keep_test_u],
#                            y=y_train_u,
#                            cv=5,
#                            scoring=scoring,
#                            return_train_score=True)
#     print(
#         n,
#         f'Accuracy {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f}',
# #         f'Precision {results["test_precision"].mean():.3f}',
# #         f'Recall {results["test_recall"].mean():.3f}',
#         f'F1 Score {results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}',
#         f'          {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f} |',
#         f'{results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}'
#     )

In [77]:
for c, n in zip(
    [clf_lr, 
     clf_gnb, 
     clf_knn,
     clf_rf, 
     clf_svm,
     clf_xgb, 
    ], 
    ['log_regC1', 
     'gauss_nb ', 
     f'knn_{k}  ', 
     'rand_frst',
     'svm_gamma', 
     'xgboost  ', 
    ]
):
    results = cross_validate(estimator=c,
                           X=X_emb_pca,
                           y=y_train_u,
                           cv=5,
                           scoring=scoring,
                           return_train_score=True)
    print(
        n,
        f'Accuracy {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f}',
#         f'Precision {results["test_precision"].mean():.3f}',
#         f'Recall {results["test_recall"].mean():.3f}',
        f'F1 Score {results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}',
        f'          {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f} |',
        f'{results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}'
    )

log_regC1 Accuracy 0.624+-0.123 F1 Score 0.519+-0.317           0.624+-0.123 | 0.519+-0.317
gauss_nb  Accuracy 0.669+-0.178 F1 Score 0.552+-0.350           0.669+-0.178 | 0.552+-0.350
knn_163   Accuracy 0.638+-0.137 F1 Score 0.525+-0.325           0.638+-0.137 | 0.525+-0.325
rand_frst Accuracy 0.674+-0.184 F1 Score 0.555+-0.356           0.674+-0.184 | 0.555+-0.356
svm_gamma Accuracy 0.616+-0.115 F1 Score 0.514+-0.313           0.616+-0.115 | 0.514+-0.313
xgboost   Accuracy 0.624+-0.109 F1 Score 0.547+-0.276           0.624+-0.109 | 0.547+-0.276


In [81]:
for c, n in zip(
    [clf_lr, 
     clf_gnb, 
     clf_knn,
     clf_rf, 
     clf_svm,
     clf_xgb, 
    ], 
    ['log_regC1', 
     'gauss_nb ', 
     f'knn_{k}  ', 
     'rand_frst',
     'svm_gamma', 
     'xgboost  ', 
    ]
):
    results = cross_validate(estimator=c,
                           X=X_emb_u,
                           y=y_train_u,
                           cv=5,
                           scoring=scoring,
                           return_train_score=True)
    print(
        n,
        f'Accuracy {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f}',
#         f'Precision {results["test_precision"].mean():.3f}',
#         f'Recall {results["test_recall"].mean():.3f}',
        f'F1 Score {results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}',
        f'          {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f} |',
        f'{results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}'
    )

log_regC1 Accuracy 0.612+-0.110 F1 Score 0.514+-0.306           0.612+-0.110 | 0.514+-0.306
gauss_nb  Accuracy 0.693+-0.207 F1 Score 0.572+-0.373           0.693+-0.207 | 0.572+-0.373
knn_163   Accuracy 0.631+-0.128 F1 Score 0.521+-0.320           0.631+-0.128 | 0.521+-0.320
rand_frst Accuracy 0.693+-0.207 F1 Score 0.571+-0.373           0.693+-0.207 | 0.571+-0.373
svm_gamma Accuracy 0.612+-0.111 F1 Score 0.511+-0.311           0.612+-0.111 | 0.511+-0.311
xgboost   Accuracy 0.640+-0.114 F1 Score 0.554+-0.287           0.640+-0.114 | 0.554+-0.287


In [82]:
clf_v = VotingClassifier(estimators=[
    ('lr', clf_lr), 
    ('gnb', clf_gnb), 
    ('knn', clf_knn), 
    ('rf', clf_rf),
    ('svm', clf_svm),
    ('xgb', clf_xgb)
], voting='hard')

# results = cross_validate(estimator=clf_v,
#                        X=X_pos_u[col_keep_test_u],
#                        y=y_train_u,
#                        cv=5,
#                        scoring=scoring,
#                        return_train_score=True)
# print(
#     'voting',
#     f'Accuracy {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f}',
#     f'F1 Score {results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}',
#     f'          {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f} |',
#     f'{results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}'
# )

In [83]:
results = cross_validate(estimator=clf_v,
                       X=X_emb_pca,
                       y=y_train_u,
                       cv=5,
                       scoring=scoring,
                       return_train_score=True)

print(
    'voting',
    f'Accuracy {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f}',
    f'F1 Score {results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}',
    f'          {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f} |',
    f'{results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}'
)

voting Accuracy 0.661+-0.167 F1 Score 0.544+-0.345           0.661+-0.167 | 0.544+-0.345


In [84]:
results = cross_validate(estimator=clf_v,
                       X=X_emb_u,
                       y=y_train_u,
                       cv=5,
                       scoring=scoring,
                       return_train_score=True)

print(
    'voting',
    f'Accuracy {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f}',
    f'F1 Score {results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}',
    f'          {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f} |',
    f'{results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}'
)

voting Accuracy 0.664+-0.170 F1 Score 0.547+-0.347           0.664+-0.170 | 0.547+-0.347


## Test on PL data

## Voting

In [88]:
clf_out = clone(clf_v)
clf_emb = clone(clf_v)
clf_pca = clone(clf_v)

### train -> test

In [89]:
# clf_out.fit(X_pos_u[col_keep_test_u], y_train_u)

In [90]:
clf_pca.fit(X_emb_pca, y_train_u)
clf_emb.fit(X_emb_u, y_train_u)

In [91]:
# y_pred = clf_out.predict(X_test[col_keep_test_u])
y_pred_e = clf_emb.predict(X_emb_test)
y_pred_p = clf_pca.predict(X_emb_test_pca)

In [97]:
confusion_matrix(y_test, y_pred_p)

array([[ 122, 3317],
       [ 114, 2989]])

In [98]:
confusion_matrix(y_test, y_pred_e)

array([[   7, 3432],
       [  15, 3088]])

In [94]:
# print(
#     n, '\n\t',
#     f'Accuracy  {accuracy_score(y_test, y_pred):.3f}\n\t',
#     f'Precision {precision_score(y_test, y_pred):.3f}\n\t',
#     f'Recall    {recall_score(y_test, y_pred):.3f}\n\t',
#     f'F1 Score  {f1_score(y_test, y_pred):.3f}\n\t',
# )

In [95]:
print(
    n, '\n\t',
    f'Accuracy  {accuracy_score(y_test, y_pred_p):.3f}\n\t',
    f'Precision {precision_score(y_test, y_pred_p):.3f}\n\t',
    f'Recall    {recall_score(y_test, y_pred_p):.3f}\n\t',
    f'F1 Score  {f1_score(y_test, y_pred_p):.3f}\n\t',
)

xgboost   
	 Accuracy  0.476
	 Precision 0.474
	 Recall    0.963
	 F1 Score  0.635
	


In [96]:
print(
    n, '\n\t',
    f'Accuracy  {accuracy_score(y_test, y_pred_e):.3f}\n\t',
    f'Precision {precision_score(y_test, y_pred_e):.3f}\n\t',
    f'Recall    {recall_score(y_test, y_pred_e):.3f}\n\t',
    f'F1 Score  {f1_score(y_test, y_pred_e):.3f}\n\t',
)

xgboost   
	 Accuracy  0.473
	 Precision 0.474
	 Recall    0.995
	 F1 Score  0.642
	
