In [78]:
from polyglot.text import Text

import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=7,progress_bar=True)

from parallelbar import progress_map

from utils import *

from nltk import ngrams
import nltk

import scipy
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
import xgboost as xgb

from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    recall_score,
    precision_score,
    roc_auc_score, confusion_matrix, roc_curve, classification_report
)

from sentence_transformers import SentenceTransformer
# https://huggingface.co/sentence-transformers/LaBSE

from sklearn.decomposition import PCA

from sklearn import clone

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import random

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## POS CZ data

In [46]:
def extract_features_cz(txt):
    
    doc = Text(txt, hint_language_code='cs')

    txt_pos = []
    
    for token in doc.pos_tags:
        if token[1] not in ['SPACE', 'PUNCT']:
            txt_pos.append(token[1])            
    
    return txt_pos

In [47]:
if False:
    df_cz_cz = pd.read_csv('../datasets/demagog_nlp_cz/converted-exp-CZ.tsv', sep='\t')

    df_cz_cz['text_clean'] = df_cz_cz['statementText'].apply(lambda x: clean_przyp(x))

    tasks = df_cz_cz['text_clean'].values.tolist()
    result = progress_map(extract_features_cz, tasks, n_cpu=7, chunk_size=1, core_progress=True)
    
    df_cz_cz['TEXT_POS'] = result

    df_cz_cz['TEXT_POS'] = df_cz_cz['TEXT_POS'].str.join(" ")
    
    df_cz_cz.to_csv('../datasets/ready2use/fake_news_features_cz_CZ.csv', sep=';', index=False, encoding='utf8')
else:
    df_cz_cz = pd.read_csv('../datasets/ready2use/fake_news_features_cz_CZ.csv', sep=';')

### The same code for SK data

In [48]:
if False:
    df_cz_sk = pd.read_csv('../datasets/demagog_nlp_cz/converted-exp-SK.tsv', sep='\t')

    df_cz_sk['text_clean'] = df_cz_sk['statementText'].apply(lambda x: clean_przyp(x))

    tasks = df_cz_sk['text_clean'].values.tolist()
    result = progress_map(extract_features_cz, tasks, n_cpu=7, chunk_size=1, core_progress=True)
    
    df_cz_sk['TEXT_POS'] = result

    df_cz_sk['TEXT_POS'] = df_cz_sk['TEXT_POS'].str.join(" ")
    
    df_cz_sk.to_csv('../datasets/ready2use/fake_news_features_cz_SK.csv', sep=';', index=False, encoding='utf8')
else:
    df_cz_sk = pd.read_csv('../datasets/ready2use/fake_news_features_cz_SK.csv', sep=';')

In [49]:
df_cz_sk.shape

(12554, 10)

In [50]:
df_cz_cz.shape

(9082, 10)

In [51]:
df_cz = pd.concat([df_cz_sk, df_cz_sk])
df_cz.shape

(25108, 10)

In [52]:
df_cz['statementState'] = df_cz['statementState'].str.strip()

df_cz = df_cz[ df_cz['statementState'] != 'MISLEADING' ]
df_cz = df_cz[ df_cz['statementState'] != 'UNVERIFIABLE' ]
df_cz = df_cz[ df_cz['statementState'] != 'null' ]

df_cz = df_cz.reset_index(drop=True)

df_cz['assestment'] = df_cz['statementState'].replace({
    'FALSE' : 0,
#     'Manipulacja' : 1,
    'TRUE' : 1
}).astype(int)

In [53]:
df_cz['assestment'].value_counts()

1    15974
0     3340
Name: assestment, dtype: int64

## POS ENG data

In [54]:
nlp_core_en = spacy.load("en_core_web_lg")
def extract_features_en(txt, nlp_core=nlp_core_en):
    
    doc = nlp_core(txt)
    
    txt_pos = []
    for token in doc:
        if token.pos_ not in ['SPACE', 'PUNCT']:
            txt_pos.append(token.pos_)            
    
    return txt_pos

In [55]:
df_en = pd.read_csv('../datasets/politifact/politifact.csv', sep=',', index_col=0)

df_en.loc[:, 'fact'] = df_en['fact'].replace({
    'half-true' : 'true',
    'mostly-true' : 'true',
    'barely-true' : 'false',
    'pants-fire' : 'false',
})

df_en = df_en[df_en['fact'].isin(['true', 'false'])]

df_en = df_en[['sources_quote', 'fact']]
df_en.columns  = [
    'statement',
    'label'
]

df_en['text_clean'] = df_en['statement'].apply(lambda x: clean_przyp(x))

tasks = df_en['text_clean'].values.tolist()
result = progress_map(extract_features_en, tasks, n_cpu=7, chunk_size=1, core_progress=True)

df_en['TEXT_POS'] = result

df_en['TEXT_POS'] = df_en['TEXT_POS'].str.join(" ")

df_en['assestment'] = df_en['label'].replace({
    'false' : 0,
#     'Manipulacja' : 1,
    'true' : 1
}).astype(int)

Core 1:   0%|          | 0/2736 [00:01<?, ?it/s]

Core 2:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 3:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 4:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 5:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 6:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 7:   0%|          | 0/2736 [00:00<?, ?it/s]

In [56]:
# df_en = pd.concat(
# [
#     pd.read_csv('../datasets/liar/train.tsv', sep='\t', header=None),
#     pd.read_csv('../datasets/liar/valid.tsv', sep='\t', header=None),
#     pd.read_csv('../datasets/liar/test.tsv', sep='\t', header=None)
# ]
# ).iloc[:,:3]

# df_en.columns = [
#     'id',
#     'label',
#     'statement'
# ]

# df_en.loc[:, 'label'] = df_en['label'].replace({
#     'half-true' : 'true',
#     'mostly-true' : 'true',
#     'barely-true' : 'false',
#     'pants-fire' : 'false',
# })

## Use EN/CZ data as training

In [57]:
df_en.columns

Index(['statement', 'label', 'text_clean', 'TEXT_POS', 'assestment'], dtype='object')

In [58]:
df_cz.columns

Index(['politicianID', 'name', 'party', 'statementID', 'statementText',
       'statementState', 'statementExplanClean', 'statementExplan    ',
       'text_clean', 'TEXT_POS', 'assestment'],
      dtype='object')

In [61]:
df_all = pd.concat([df_en[['assestment', 'TEXT_POS', 'text_clean']], 
                    df_cz[['assestment', 'TEXT_POS', 'text_clean']]])

In [62]:
df_all['assestment'].value_counts()

1    25033
0    13432
Name: assestment, dtype: int64

In [63]:
y_train = df_all.copy()['assestment']
X_train = df_all.copy()[['TEXT_POS']]

In [64]:
# y_train = df_cz.copy()['assestment']
# X_train = df_cz.copy().loc[:, df_cz.columns != 'assestment']

In [24]:
n_grams = 5
min_pos = 5 

In [25]:
X_pos = X_train[['TEXT_POS']].copy()

words =  sum(X_pos['TEXT_POS'].str.split(' ').values.tolist(), [])

n_list = []
for n in range(n_grams):
    n_i = pd.Series(nltk.ngrams(words, n+1)).value_counts()
    n_i = n_i[n_i>min_pos]
    n_list.append(n_i)

n_iterator = []
for n_i in n_list:
    n_iterator += n_i.index.tolist()

In [27]:
def get_pos_features_para(n):
    x = X_pos['TEXT_POS'].str.count(' '.join(n)) / X_pos['TEXT_POS'].str.split(' ').str.len()
    min_v = x.min()
    max_v = x.max()

    if min_v < max_v:
        r = scipy.stats.pearsonr(x, y_train)[0]
        if ~np.isnan(r) and np.abs(r) > 0.01: 
            return x
    
    return None

result = progress_map(get_pos_features_para, n_iterator, n_cpu=7, chunk_size=1, core_progress=True)

Core 1:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 2:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 3:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 4:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 5:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 6:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 7:   0%|          | 0/3911 [00:00<?, ?it/s]

In [28]:
col = {}
    
for x, n in tqdm(zip(result, n_iterator)):
    if x is not None:
        col[' '.join(n)] = x
        col[' '.join(n)].name = ' '.join(n)








27373it [00:00, 880686.41it/s][A[A[A


In [29]:
X_pos = pd.concat( [X_pos] + list( col.values() ), axis=1 ).drop('TEXT_POS', axis=1)

In [30]:
y_train.reset_index(drop=True, inplace=True)
X_pos.reset_index(drop=True, inplace=True)

In [31]:
out_pos = X_pos.join(y_train)

In [32]:
X_pos.shape

(38465, 9219)

In [33]:
out_pos.to_hdf(r'../datasets/ready2use/pos_en_cz_train.h5', key='stage', mode='w')

## Create sentences embeddings

In [14]:
model = SentenceTransformer('sentence-transformers/LaBSE')



In [15]:
def get_emebeddings(x):
    embedding = model.encode(x)
    return embedding

In [None]:
result = progress_map(get_emebeddings, df_all['text_clean'].values, 
                      n_cpu=7, chunk_size=1, core_progress=True)

In [71]:
df_emb = df_all[['text_clean']]
df_emb['emb'] = result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emb['emb'] = result


In [75]:
df_emb_full = pd.DataFrame(df_emb['emb'].tolist(), index= df_emb.index)

In [76]:
df_emb_full.to_hdf(r'../datasets/ready2use/pos_en_cz_embeddings.h5', key='stage', mode='w')

In [34]:
# np.savetxt('../datasets/ready2use/pos_en_cz_train_header.csv', out_pos.columns.values, delimiter=";", fmt="%s")
# np.savetxt('../datasets/ready2use/pos_en_cz_train_data.csv', out_pos.values, delimiter=";")

### Read training data

In [2]:
X_emb = pd.read_hdf(r'../datasets/ready2use/pos_en_cz_embeddings.h5', key='stage', mode='r')

In [3]:
X_pos = pd.read_hdf('../datasets/ready2use/pos_en_cz_train.h5', key='stage', mode='r')
# header = pd.read_csv('../datasets/ready2use/pos_en_cz_train_header.csv', header=None, sep=';').values
# X_pos = pd.read_csv('../datasets/ready2use/pos_en_cz_train_data.csv', header=None, sep=';')
# X_pos.columns = header
y_train = X_pos['assestment']
X_pos = X_pos.drop('assestment', axis=1)

X_emb.index = X_pos.index

In [4]:
col_keep = []
for c in tqdm(X_pos.columns, position=0, leave=True):
    min_v = X_pos[c].values.min()
    max_v = X_pos[c].values.max()

    if min_v < max_v:
        r = scipy.stats.pearsonr(X_pos[c].values, y_train)[0]
        if ~np.isnan(r) and np.abs(r) > 0.1:
            col_keep.append(c)
            
len(col_keep)

100%|█████████████████████████████████████| 9219/9219 [00:03<00:00, 2442.69it/s]


87

### Load test data

In [17]:
if False:
    df_test_emb = pd.read_csv('../datasets/ready2use/text_celan_pl_dataset.csv', sep=';', header=None)
    df_test_emb.columns = ['text_clean']

    result = progress_map(get_emebeddings, df_test_emb['text_clean'].values, 
                          n_cpu=7, chunk_size=1, core_progress=True)

    X_emb_test = pd.DataFrame(result, index = df_test_emb.index)
    
    X_emb_test.to_csv('../datasets/ready2use/embeddings_pl_dataset.csv', sep=';', header=None, index=False)

In [18]:
X_emb_test = pd.read_csv('../datasets/ready2use/embeddings_pl_dataset.csv', sep=';', 
                         low_memory=False, header=None)

X_test = pd.read_csv('../datasets/ready2use/pos_pl_dataset.csv', sep=';')
y_test = X_test['assestment']
X_test = X_test.drop('assestment', axis=1)

X_emb_test.index = X_test.index

### Keep cols

In [19]:
col_keep_test = np.array(col_keep)[np.isin(col_keep, X_test.columns.values)]
col_keep_test.shape

(69,)

## Undersampling

In [82]:
n_lower = y_train.value_counts().min()
n_upper = y_train.value_counts().max()

np.random.seed(111)

# undersampling    
index_0 = np.random.choice(y_train[y_train==0].index, n_lower, replace=False)
index_1 = np.random.choice(y_train[y_train==1].index, n_lower, replace=False)

y_train_u = y_train.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()

X_pos_u = X_pos.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()

# select columns from the undersamplet daatset
col_keep_u = []
for c in tqdm(X_pos_u.columns, position=0, leave=True):
    min_v = X_pos_u[c].values.min()
    max_v = X_pos_u[c].values.max()

    if min_v < max_v:
        r = scipy.stats.pearsonr(X_pos_u[c].values, y_train_u)[0]
        if ~np.isnan(r) and np.abs(r) > 0.1:
            col_keep_u.append(c)
            
print('Train', len(col_keep_u))

col_keep_test_u = np.array(col_keep_u)[np.isin(col_keep_u, X_test.columns.values)]
print('Test', col_keep_test_u.shape)

100%|█████████████████████████████████████| 9219/9219 [00:03<00:00, 2398.83it/s]


Train 100
Test (73,)


In [83]:
X_emb_u = X_emb.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()

In [84]:
pca = PCA(n_components=100)
pca.fit(X_emb_u)

X_emb_pca = pca.transform(X_emb_u)
X_emb_test_pca = pca.transform(X_emb_test)

np.cumsum( pca.explained_variance_ )[-1]

0.522934

### Can create weights of dataset

In [85]:
y_train.value_counts()

1    25033
0    13432
Name: assestment, dtype: int64

In [86]:
y_train.shape

(38465,)

In [87]:
y_train_u.shape

(26864,)

In [88]:
class_weight = {
    1 : 1,
    0 : 1 # n_upper/n_lower
}

## Cross validation of models

In [101]:
random.seed(111)

In [106]:
k = int( np.sqrt(y_train_u.shape[0]) )
k

163

In [113]:
import warnings
warnings.filterwarnings("ignore")

scoring = ['accuracy', 
           'precision', 'recall', 
           'f1']

clf_lr = LogisticRegression(max_iter=5000, C=1, penalty='l2', solver='liblinear', 
                            class_weight=class_weight)
clf_gnb = GaussianNB()
clf_knn = KNeighborsClassifier(n_neighbors=k)
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=111)
clf_svm = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', verbosity=0, silent=True, n_jobs=-1)

for c, n in zip(
    [clf_lr, 
     clf_gnb, 
     clf_knn,
     clf_rf, 
     clf_svm,
     clf_xgb, 
    ], 
    ['log_regC1', 
     'gauss_nb ', 
     f'knn_{k}  ', 
     'rand_frst',
     'svm_gamma', 
     'xgboost  ', 
    ]
):
    results = cross_validate(estimator=c,
                           X=X_pos_u[col_keep_test_u],
                           y=y_train_u,
                           cv=5,
                           scoring=scoring,
                           return_train_score=True)
    print(
        n,
        f'Accuracy {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f}',
#         f'Precision {results["test_precision"].mean():.3f}',
#         f'Recall {results["test_recall"].mean():.3f}',
        f'F1 Score {results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}',
        f'          {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f} |',
        f'{results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}'
    )

log_regC1 Accuracy 0.677+-0.188 F1 Score 0.560+-0.355           0.677+-0.188 | 0.560+-0.355
gauss_nb  Accuracy 0.687+-0.202 F1 Score 0.570+-0.363           0.687+-0.202 | 0.570+-0.363
knn_163   Accuracy 0.662+-0.174 F1 Score 0.555+-0.335           0.662+-0.174 | 0.555+-0.335
rand_frst Accuracy 0.686+-0.199 F1 Score 0.565+-0.363           0.686+-0.199 | 0.565+-0.363
svm_gamma Accuracy 0.679+-0.188 F1 Score 0.560+-0.356           0.679+-0.188 | 0.560+-0.356
xgboost   Accuracy 0.656+-0.147 F1 Score 0.546+-0.320           0.656+-0.147 | 0.546+-0.320


In [109]:
for c, n in zip(
    [clf_lr, 
     clf_gnb, 
     clf_knn,
     clf_rf, 
     clf_svm,
     clf_xgb, 
    ], 
    ['log_regC1', 
     'gauss_nb ', 
     f'knn_{k}  ', 
     'rand_frst',
     'svm_gamma', 
     'xgboost  ', 
    ]
):
    results = cross_validate(estimator=c,
                           X=X_emb_pca,
                           y=y_train_u,
                           cv=5,
                           scoring=scoring,
                           return_train_score=True)
    print(
        n,
        f'Accuracy {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f}',
#         f'Precision {results["test_precision"].mean():.3f}',
#         f'Recall {results["test_recall"].mean():.3f}',
        f'F1 Score {results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}',
        f'          {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f} |',
        f'{results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}'
    )

log_regC1 Accuracy 0.623+-0.124 F1 Score 0.536+-0.289           0.623+-0.124 | 0.536+-0.289
gauss_nb  Accuracy 0.630+-0.129 F1 Score 0.556+-0.274           0.630+-0.129 | 0.556+-0.274
knn_163   Accuracy 0.603+-0.098 F1 Score 0.586+-0.196           0.603+-0.098 | 0.586+-0.196
rand_frst Accuracy 0.660+-0.161 F1 Score 0.585+-0.281           0.660+-0.161 | 0.585+-0.281
svm_gamma Accuracy 0.618+-0.113 F1 Score 0.530+-0.290           0.618+-0.113 | 0.530+-0.290
xgboost   Accuracy 0.633+-0.110 F1 Score 0.573+-0.249           0.633+-0.110 | 0.573+-0.249


In [114]:
clf_v = VotingClassifier(estimators=[
    ('lr', clf_lr), 
    ('gnb', clf_gnb), 
    ('knn', clf_knn), 
    ('rf', clf_rf),
    ('svm', clf_svm),
    ('xgb', clf_xgb)
], voting='hard')

results = cross_validate(estimator=clf_v,
                       X=X_pos_u[col_keep_test_u],
                       y=y_train_u,
                       cv=5,
                       scoring=scoring,
                       return_train_score=True)
print(
    'voting',
    f'Accuracy {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f}',
    f'F1 Score {results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}',
    f'          {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f} |',
    f'{results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}'
)

voting Accuracy 0.687+-0.201 F1 Score 0.568+-0.365           0.687+-0.201 | 0.568+-0.365


In [115]:
results = cross_validate(estimator=clf_v,
                       X=X_emb_pca,
                       y=y_train_u,
                       cv=5,
                       scoring=scoring,
                       return_train_score=True)

print(
    'voting',
    f'Accuracy {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f}',
    f'F1 Score {results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}',
    f'          {results["test_accuracy"].mean():.3f}+-{results["test_accuracy"].std():.3f} |',
    f'{results["test_f1"].mean():.3f}+-{results["test_f1"].std():.3f}'
)

voting Accuracy 0.649+-0.150 F1 Score 0.555+-0.308           0.649+-0.150 | 0.555+-0.308


## Test on PL data

## Voting

In [116]:
clf_out = clone(clf_v)
clf_emb = clone(clf_v)

### train -> test

In [118]:
clf_out.fit(X_pos_u[col_keep_test_u], y_train_u)

In [119]:
clf_emb.fit(X_emb_pca, y_train_u)

In [120]:
y_pred = clf_out.predict(X_test[col_keep_test_u])
y_pred_e = clf_emb.predict(X_emb_test_pca)

In [121]:
confusion_matrix(y_test, y_pred)

array([[3363,   75],
       [3021,   82]])

In [122]:
confusion_matrix(y_test, y_pred_e)

array([[1291, 2147],
       [1062, 2041]])

In [123]:
print(
    n, '\n\t',
    f'Accuracy  {accuracy_score(y_test, y_pred):.3f}\n\t',
    f'Precision {precision_score(y_test, y_pred):.3f}\n\t',
    f'Recall    {recall_score(y_test, y_pred):.3f}\n\t',
    f'F1 Score  {f1_score(y_test, y_pred):.3f}\n\t',
)

xgboost   
	 Accuracy  0.527
	 Precision 0.522
	 Recall    0.026
	 F1 Score  0.050
	


In [124]:
print(
    n, '\n\t',
    f'Accuracy  {accuracy_score(y_test, y_pred_e):.3f}\n\t',
    f'Precision {precision_score(y_test, y_pred_e):.3f}\n\t',
    f'Recall    {recall_score(y_test, y_pred_e):.3f}\n\t',
    f'F1 Score  {f1_score(y_test, y_pred_e):.3f}\n\t',
)

xgboost   
	 Accuracy  0.509
	 Precision 0.487
	 Recall    0.658
	 F1 Score  0.560
	


## RF

In [125]:
clf_out = clone(clf_rf)
clf_emb = clone(clf_rf)

### train -> test

In [126]:
clf_out.fit(X_pos_u[col_keep_test_u], y_train_u)

In [127]:
clf_emb.fit(X_emb_pca, y_train_u)

In [128]:
y_pred = clf_out.predict(X_test[col_keep_test_u])
y_pred_e = clf_emb.predict(X_emb_test_pca)

In [129]:
confusion_matrix(y_test, y_pred)

array([[3399,   39],
       [3074,   29]])

In [130]:
confusion_matrix(y_test, y_pred_e)

array([[ 934, 2504],
       [ 822, 2281]])

In [131]:
print(
    n, '\n\t',
    f'Accuracy  {accuracy_score(y_test, y_pred):.3f}\n\t',
    f'Precision {precision_score(y_test, y_pred):.3f}\n\t',
    f'Recall    {recall_score(y_test, y_pred):.3f}\n\t',
    f'F1 Score  {f1_score(y_test, y_pred):.3f}\n\t',
)

xgboost   
	 Accuracy  0.524
	 Precision 0.426
	 Recall    0.009
	 F1 Score  0.018
	


In [132]:
print(
    n, '\n\t',
    f'Accuracy  {accuracy_score(y_test, y_pred_e):.3f}\n\t',
    f'Precision {precision_score(y_test, y_pred_e):.3f}\n\t',
    f'Recall    {recall_score(y_test, y_pred_e):.3f}\n\t',
    f'F1 Score  {f1_score(y_test, y_pred_e):.3f}\n\t',
)

xgboost   
	 Accuracy  0.492
	 Precision 0.477
	 Recall    0.735
	 F1 Score  0.578
	
