In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold # for cross validation
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics # for f1 macro in cross validation

import xgboost as xgb

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
from helper import util_visualizations, util_ml

# 0. Load Data

## 0.1 Goldstandard

In [2]:
# read data
gold = pd.read_csv('/path/to/9_FINAL/data/goldstandard/goldstandard_count.csv', sep=";")
del gold['Unnamed: 0']

In [3]:
gold.shape

(139, 65)

In [4]:
# get label of goldstandard
labels = pd.read_csv('/path/to/9_FINAL/data/goldstandard/goldstandard_raw.csv', sep=";")
del labels['Unnamed: 0']
labels.columns = ['_id', 'label']

In [5]:
# merge labels on gold
gold = pd.merge(gold, labels, how='inner', left_on=['_id'], right_on=['_id'])
gold.shape

(139, 66)

In [6]:
# get labels of goldstandard
y_gold = gold['label']
y_gold 

0      0
1      0
2      0
3      1
4      0
      ..
134    0
135    0
136    0
137    0
138    0
Name: label, Length: 139, dtype: int64

In [7]:
# sanity check
gold.label.value_counts()

0    134
1      5
Name: label, dtype: int64

In [8]:
##### get variables
X_gold = gold.copy()
del X_gold['label']
X_gold.shape

(139, 65)

In [9]:
X_gold

Unnamed: 0,_id,instance,class,frequency,pidspread,pldspread,modifications,p1,p10,p11,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,141663461,b1,waveform,3,2,3,['p34p8a'],0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,147713771,adjective,also,3,2,3,['p34p2'],0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,270852864,advance,cattle,1,1,1,['p23b'],0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,516170918,administration,"failure,period",1,1,1,['p8c'],0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,387574181,agnium,planet,1,1,1,['p8a'],0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,420639686,unlock,motivator,2,1,2,"['p8a', 'p8a']",0,0,0,...,0,0,0,0,0,0,2,0,0,0
135,199660983,venue,odds,3,3,3,['p7p5p1'],1,0,0,...,0,0,0,1,0,1,0,0,0,0
136,218064825,venezuelum,however,5,2,5,['p2p3a'],0,0,0,...,0,0,0,0,0,0,0,0,0,0
137,89573263,your,alena,2,1,2,"['p8a', 'p8a']",0,0,0,...,0,0,0,0,0,0,2,0,0,0


In [10]:
# only take top 10 patterns
X_gold_top_10 = X_gold[['p8a', 'p5', 'p1', 'p3a', 'p8b', 'p4', 'p2', 'p8c', 'p20a', 'p10']]
X_gold_top_10.head()

Unnamed: 0,p8a,p5,p1,p3a,p8b,p4,p2,p8c,p20a,p10
0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,0,0


In [11]:
# only take one hot encoded columns
X_gold_one_hot = X_gold.drop(['instance', 'class', 'frequency', 'pidspread', 'pldspread', '_id', 'modifications'], axis=1)
X_gold_one_hot.head()

Unnamed: 0,p1,p10,p11,p12a,p12b,p12c,p13,p14,p15a,p15b,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [12]:
all_columns = X_gold_one_hot.columns

In [13]:
# only take one hot encoded columns
X_gold_one_hot_more = X_gold.drop(['instance', 'class', '_id', 'modifications'], axis=1) 
X_gold_one_hot_more.head()

Unnamed: 0,frequency,pidspread,pldspread,p1,p10,p11,p12a,p12b,p12c,p13,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,3,2,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,3,2,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [14]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_gold_one_hot_more_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_gold_one_hot_more))
X_gold_one_hot_more_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.046512,0.111111,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0
1,0.046512,0.111111,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0


In [15]:
# only take one hot encoded columns
X_gold_one_hot_frequency = X_gold[['frequency', 'pidspread', 'pldspread']]
X_gold_one_hot_frequency.head()

Unnamed: 0,frequency,pidspread,pldspread
0,3,2,3
1,3,2,3
2,1,1,1
3,1,1,1
4,1,1,1


In [16]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_gold_one_hot_frequency_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_gold_one_hot_frequency))
X_gold_one_hot_frequency_scaled.head()

Unnamed: 0,0,1,2
0,0.046512,0.111111,0.05
1,0.046512,0.111111,0.05
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0


## 0.2 Testing

In [17]:
# read test data
test = pd.read_csv('/path/to/9_FINAL/data/machine_learning/two_class/one-hot-ecoding/test/one_hot_test.csv', sep=";")
del test['Unnamed: 0']

In [18]:
# get label
y_test = test['label']

In [19]:
# get variables
X_test = test.copy()
del X_test['label']
X_test.shape

(348121, 65)

In [20]:
# only take top 10 patterns
X_test_top_10 = X_test[['p8a', 'p5', 'p1', 'p3a', 'p8b', 'p4', 'p2', 'p8c', 'p20a', 'p10']]
X_test_top_10.head()

Unnamed: 0,p8a,p5,p1,p3a,p8b,p4,p2,p8c,p20a,p10
0,0,1,0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0


In [21]:
# only take one hot encoded columns
X_test_one_hot = X_test.drop(['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], axis=1)
X_test_one_hot.head()

Unnamed: 0,p1,p10,p11,p12a,p12b,p12c,p13,p14,p15a,p15b,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# only take one hot encoded columns
X_test_one_hot_more = X_test.drop(['instance', 'class',  'id', 'pids'], axis=1)
X_test_one_hot_more.head()

Unnamed: 0,frequency,pidspread,pldspread,p1,p10,p11,p12a,p12b,p12c,p13,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,2,2,2,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,11,3,10,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,36,3,33,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4,13,2,13,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_test_one_hot_more_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_test_one_hot_more))
X_test_one_hot_more_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,1.5e-05,0.020833,0.000132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.000155,0.041667,0.00119,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.000541,0.041667,0.004232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,0.000185,0.020833,0.001587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# only take one hot encoded columns
X_test_one_hot_frequency = X_test[['frequency', 'pidspread', 'pldspread']]
X_test_one_hot_frequency.head()

Unnamed: 0,frequency,pidspread,pldspread
0,2,2,2
1,11,3,10
2,1,1,1
3,36,3,33
4,13,2,13


In [25]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_test_one_hot_frequency_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_test_one_hot_frequency))
X_test_one_hot_frequency_scaled.head()

Unnamed: 0,0,1,2
0,1.5e-05,0.020833,0.000132
1,0.000155,0.041667,0.00119
2,0.0,0.0,0.0
3,0.000541,0.041667,0.004232
4,0.000185,0.020833,0.001587


# 1. Naive Bayes

## 1.1 Only count

In [26]:
# fit NB model
nb = MultinomialNB()
util_ml.get_count_cross_validation_two_class(nb, ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'])

[[984304 315706]
 [ 12427  10419]]
Positive
Precision: 0.03194787274817938 || Recall: 0.4560535761183577 || F1: 0.05971269818982093
Negative
Precision: 0.9875322429020468 || Recall: 0.7571510988377013 || F1: 0.8571310391550462
[[986754 313256]
 [ 12426  10420]]
Positive
Precision: 0.03219268651367417 || Recall: 0.45609734745688524 || F1: 0.06014048170101754
Negative
Precision: 0.9875638023179006 || Recall: 0.7590356997253868 || F1: 0.858349244734015
[[982598 317412]
 [ 12471  10375]]
Positive
Precision: 0.03165165183488058 || Recall: 0.4541276372231463 || F1: 0.059178685406108385
Negative
Precision: 0.987467200766982 || Recall: 0.7558388012399905 || F1: 0.8562650784569944
[[993111 306899]
 [ 12542  10304]]
Positive
Precision: 0.032483929849339385 || Recall: 0.4510198721876915 || F1: 0.06060303073968752
Negative
Precision: 0.9875285013816893 || Recall: 0.7639256621102914 || F1: 0.8614537336982898
[[986802 313210]
 [ 12787  10062]]
Positive
Precision: 0.03112549184587592 || Recall: 0.440

In [27]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(nb, X_gold_one_hot, y_gold, 
                                ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=False)

[[112  22]
 [  1   4]]
Positive
Precision: 0.15384615384615385 || Recall: 0.8 || F1: 0.25806451612903225
Negative
Precision: 0.9911504424778761 || Recall: 0.835820895522388 || F1: 0.9068825910931174


## 1.2 Count with frequencies

In [28]:
# fit NB model
nb = MultinomialNB()
util_ml.get_count_cross_validation_two_class(nb, ['instance', 'class', 'id', 'pids'])

[[1250595   49415]
 [  20037    2809]]
Positive
Precision: 0.0537875306372549 || Recall: 0.12295368992383787 || F1: 0.07483681896896231
Negative
Precision: 0.9842306820542848 || Recall: 0.961988753932662 || F1: 0.9729826245739391
[[1251365   48645]
 [  20034    2812]]
Positive
Precision: 0.05464756981557417 || Recall: 0.12308500393942047 || F1: 0.0756900798083523
Negative
Precision: 0.9842425548549275 || Recall: 0.9625810570687918 || F1: 0.9732912967170917
[[1251932   48078]
 [  20166    2680]]
Positive
Precision: 0.052799558690255724 || Recall: 0.11730718725378622 || F1: 0.07282212923210694
Negative
Precision: 0.984147447759528 || Recall: 0.9630172075599418 || F1: 0.9734676770959851
[[1254315   45695]
 [  20345    2501]]
Positive
Precision: 0.051892273217694414 || Recall: 0.10947211765735797 || F1: 0.07040905379916106
Negative
Precision: 0.9840388809564904 || Recall: 0.9648502703825356 || F1: 0.9743501108879973
[[1252557   47455]
 [  20245    2604]]
Positive
Precision: 0.0520186180307

In [29]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(nb, X_gold_one_hot_more, y_gold, 
                                ['instance', 'class', 'id', 'pids'], scale=False)

[[131   3]
 [  4   1]]
Positive
Precision: 0.25 || Recall: 0.2 || F1: 0.22222222222222224
Negative
Precision: 0.9703703703703703 || Recall: 0.9776119402985075 || F1: 0.9739776951672863


## 1.3 Scaled count pids with frequencies

In [30]:
# fit NB model
nb = MultinomialNB()
util_ml.get_count_cross_validation_two_class(nb, ['instance', 'class', 'id', 'pids'], scale=True)

[[608717 691293]
 [  9917  12929]]
Positive
Precision: 0.018359267390112777 || Recall: 0.5659196358224634 || F1: 0.03556476148035673
Negative
Precision: 0.9839695199423245 || Recall: 0.468240244305813 || F1: 0.6345283439762666
[[606493 693517]
 [  9839  13007]]
Positive
Precision: 0.018409848780791594 || Recall: 0.5693338002276109 || F1: 0.035666397027571746
Negative
Precision: 0.9840362012681477 || Recall: 0.4665294882347059 || F1: 0.6329694803954619
[[601576 698434]
 [  9319  13527]]
Positive
Precision: 0.01899963621602869 || Recall: 0.5920948962619277 || F1: 0.036817831076731714
Negative
Precision: 0.9847453326676434 || Recall: 0.46274720963684896 || F1: 0.6296241833058158
[[603608 696402]
 [  9471  13375]]
Positive
Precision: 0.018843946760743163 || Recall: 0.5854416528057428 || F1: 0.03651264019830117
Negative
Precision: 0.9845517461860543 || Recall: 0.4643102745363497 || F1: 0.6310297116339072
[[593434 706578]
 [  9418  13431]]
Positive
Precision: 0.018653933492498012 || Recall: 

In [31]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(nb, X_gold_one_hot_more, y_gold, 
                               ['instance', 'class', 'id', 'pids'], scale=True)

[[ 27 107]
 [  1   4]]
Positive
Precision: 0.036036036036036036 || Recall: 0.8 || F1: 0.06896551724137931
Negative
Precision: 0.9642857142857143 || Recall: 0.20149253731343283 || F1: 0.33333333333333337


## 1.4 Top 10 patterns

In [28]:
# fit NB model
nb = MultinomialNB()
util_ml.get_count_cross_validation_two_class_frequency(nb, ['p8a', 'p5', 'p1', 'p3a', 'p8b', 'p4', 'p2', 'p8c', 'p20a', 'p10'])

[[991111 308899]
 [ 13646   9200]]
Positive
Precision: 0.028921813649209836 || Recall: 0.40269631445329596 || F1: 0.05396764874099928
Negative
Precision: 0.9864186066879852 || Recall: 0.7623872124060584 || F1: 0.8600530986429431
[[987799 312211]
 [ 13594   9252]]
Positive
Precision: 0.0287809172439752 || Recall: 0.4049724240567277 || F1: 0.053742423230296044
Negative
Precision: 0.9864249101002304 || Recall: 0.7598395396958485 || F1: 0.8584320086486374
[[988604 311406]
 [ 13537   9309]]
Positive
Precision: 0.02902577054394088 || Recall: 0.407467390352797 || F1: 0.054191249879933984
Negative
Precision: 0.9864919207975724 || Recall: 0.7604587657018023 || F1: 0.8588524384369227
[[1004585  295425]
 [  13990    8856]]
Positive
Precision: 0.029104676269632347 || Recall: 0.3876389739998249 || F1: 0.05414410916861035
Negative
Precision: 0.9862651252975971 || Recall: 0.7727517480634765 || F1: 0.8665500725658106
[[992207 307805]
 [ 13809   9040]]
Positive
Precision: 0.028531300793763512 || Recall

In [29]:
# not top 10 columns
train = pd.read_csv('/path/to/9_FINAL/data/machine_learning/two_class/one-hot-ecoding/train/cross_validation/training/training_0.csv',sep=";")
del train['Unnamed: 0']
del train['Unnamed: 0.1']
del train['label']
train = train.drop(['p8a', 'p5', 'p1', 'p3a', 'p8b', 'p4', 'p2', 'p8c', 'p20a', 'p10'], axis=1)
not_top10_columns = train.columns.to_list()

In [30]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(nb, X_gold_top_10, y_gold, 
                               not_top10_columns)

[[54 80]
 [ 1  4]]
Positive
Precision: 0.047619047619047616 || Recall: 0.8 || F1: 0.0898876404494382
Negative
Precision: 0.9818181818181818 || Recall: 0.40298507462686567 || F1: 0.5714285714285714


## 1.5 Count scaled

In [31]:
# fit NB model
nb = MultinomialNB()
util_ml.get_count_cross_validation_two_class(nb, ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=True)

[[360339 939671]
 [  2475  20371]]
Positive
Precision: 0.021218863341395482 || Recall: 0.8916659371443578 || F1: 0.04145131490057871
Negative
Precision: 0.99317832277696 || Recall: 0.27718171398681546 || F1: 0.43340606101427454
[[366757 933253]
 [  2540  20306]]
Positive
Precision: 0.021294959200217293 || Recall: 0.8888208001400683 || F1: 0.041593396182936386
Negative
Precision: 0.993122067062554 || Recall: 0.2821185990876993 || F1: 0.43941228306117447
[[437538 862472]
 [  3164  19682]]
Positive
Precision: 0.022311297120457426 || Recall: 0.8615074848988882 || F1: 0.04349613259668508
Negative
Precision: 0.9928205454025623 || Recall: 0.3365651033453589 || F1: 0.5027115341308613
[[357675 942335]
 [  2291  20555]]
Positive
Precision: 0.021347194383574446 || Recall: 0.8997198634334238 || F1: 0.04170487838528775
Negative
Precision: 0.9936355100203909 || Recall: 0.2751324989807771 || F1: 0.43093996539709006
[[446046 853966]
 [  3340  19509]]
Positive
Precision: 0.022334926586336186 || Recall:

In [32]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(nb, X_gold_one_hot, y_gold, 
                                ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=True)

[[39 95]
 [ 1  4]]
Positive
Precision: 0.04040404040404041 || Recall: 0.8 || F1: 0.07692307692307693
Negative
Precision: 0.975 || Recall: 0.291044776119403 || F1: 0.4482758620689656


# 2. Decision Trees

## 2.1 Only count

In [62]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(dt, ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'])

[[638110 661900]
 [  5870  16976]]
Positive
Precision: 0.0250060393945286 || Recall: 0.7430622428433862 || F1: 0.04838383291388897
Negative
Precision: 0.9908848100872698 || Recall: 0.49085007038407397 || F1: 0.6564951465799721
[[640439 659571]
 [  5846  17000]]
Positive
Precision: 0.025126705105598675 || Recall: 0.744112754968047 || F1: 0.048611915352357746
Negative
Precision: 0.9909544550778682 || Recall: 0.4926415950646533 || F1: 0.6581109235753059
[[641302 658708]
 [  5818  17028]]
Positive
Precision: 0.02519919021629749 || Recall: 0.7453383524468178 || F1: 0.048750182512575485
Negative
Precision: 0.9910093954753368 || Recall: 0.4933054361120299 || F1: 0.6587151345826935
[[639341 660669]
 [  5764  17082]]
Positive
Precision: 0.025203946582151852 || Recall: 0.7477020047273045 || F1: 0.048764125453006506
Negative
Precision: 0.9910650204230319 || Recall: 0.4917969861770294 || F1: 0.6573811831177077
[[640008 660004]
 [  5874  16975]]
Positive
Precision: 0.025074633038838723 || Recall: 0

In [63]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(dt, X_gold_one_hot, y_gold, 
                                ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=False)

[[89 45]
 [ 3  2]]
Positive
Precision: 0.0425531914893617 || Recall: 0.4 || F1: 0.07692307692307691
Negative
Precision: 0.967391304347826 || Recall: 0.664179104477612 || F1: 0.7876106194690267


## 2.2 Count with frequencies

In [31]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(dt, ['instance', 'class', 'id', 'pids'])

[[658679 641331]
 [  6496  16350]]
Positive
Precision: 0.024860076541666856 || Recall: 0.715661384925151 || F1: 0.04805099577239404
Negative
Precision: 0.9902341489081821 || Recall: 0.5066722563672588 || F1: 0.6703480842770528
[[669096 630914]
 [  6753  16093]]
Positive
Precision: 0.024872992100549145 || Recall: 0.7044121509235752 || F1: 0.048049348140562186
Negative
Precision: 0.9900081231162582 || Recall: 0.5146852716517565 || F1: 0.6772709996006798
[[667664 632346]
 [  6685  16161]]
Positive
Precision: 0.024920316974219245 || Recall: 0.7073886019434474 || F1: 0.04814456776092458
Negative
Precision: 0.9900867355034263 || Recall: 0.5135837416635257 || F1: 0.6763349522553903
[[657371 642639]
 [  6481  16365]]
Positive
Precision: 0.024832929693901707 || Recall: 0.7163179550030639 || F1: 0.0480017599178705
Negative
Precision: 0.9902372818037756 || Recall: 0.5056661102606903 || F1: 0.6694676102495999
[[664032 635980]
 [  6653  16196]]
Positive
Precision: 0.024833787198547633 || Recall: 0.

In [32]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(dt, X_gold_one_hot_more, y_gold, 
                                ['instance', 'class', 'id', 'pids'], scale=False)

[[85 49]
 [ 2  3]]
Positive
Precision: 0.057692307692307696 || Recall: 0.6 || F1: 0.10526315789473685
Negative
Precision: 0.9770114942528736 || Recall: 0.6343283582089553 || F1: 0.7692307692307694


## 2.3 Scaled count pids with frequencies

In [43]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(dt, ['instance', 'class', 'id', 'pids'], scale=True)

[[923859 376151]
 [ 12544  10302]]
Positive
Precision: 0.026657834199760384 || Recall: 0.45093232951063644 || F1: 0.05033972719210161
Negative
Precision: 0.9866040582954134 || Recall: 0.7106553026515181 || F1: 0.8261971290633706
[[661369 638641]
 [  6843  16003]]
Positive
Precision: 0.02444534739491999 || Recall: 0.7004727304560974 || F1: 0.047242025712556646
Negative
Precision: 0.9897592380861164 || Recall: 0.5087414712194521 || F1: 0.6720471572820546
[[626000 674010]
 [  6848  15998]]
Positive
Precision: 0.02318523843201818 || Recall: 0.7002538737634597 || F1: 0.044884366223658705
Negative
Precision: 0.9891790761762698 || Recall: 0.4815347574249429 || F1: 0.6477454629362323
[[657443 642567]
 [  7347  15499]]
Positive
Precision: 0.023552348852546704 || Recall: 0.6784119758382211 || F1: 0.04552423808069177
Negative
Precision: 0.9889483897170535 || Recall: 0.5057214944500427 || F1: 0.6692212947882736
[[667565 632447]
 [  7614  15235]]
Positive
Precision: 0.02352234584255854 || Recall: 0

In [34]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(dt, X_gold_one_hot_more, y_gold, 
                               ['instance', 'class', 'id', 'pids'], scale=True)

[[103  31]
 [  3   2]]
Positive
Precision: 0.06060606060606061 || Recall: 0.4 || F1: 0.10526315789473685
Negative
Precision: 0.9716981132075472 || Recall: 0.7686567164179104 || F1: 0.8583333333333334


## 2.4 Top 10 patterns

In [35]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class_frequency(dt, ['p8a', 'p5', 'p1', 'p3a', 'p8b', 'p4', 'p2', 'p8c', 'p20a', 'p10'])

[[626425 673585]
 [  6412  16434]]
Positive
Precision: 0.023816735481196893 || Recall: 0.7193381773614637 || F1: 0.04610690663730159
Negative
Precision: 0.9898678490669793 || Recall: 0.4818616779870924 || F1: 0.648188915108128
[[620298 679712]
 [  6265  16581]]
Positive
Precision: 0.02381325103081605 || Recall: 0.7257725641250109 || F1: 0.04611347736668432
Negative
Precision: 0.9900010054854819 || Recall: 0.4771486373181745 || F1: 0.6439392641752999
[[632708 667302]
 [  6517  16329]]
Positive
Precision: 0.023885692720195546 || Recall: 0.7147421868160728 || F1: 0.04622655797711744
Negative
Precision: 0.9898048418006179 || Recall: 0.48669471773294054 || F1: 0.6525336021678652
[[629971 670039]
 [  6314  16532]]
Positive
Precision: 0.02407908286251531 || Recall: 0.7236277685371618 || F1: 0.04660728457310721
Negative
Precision: 0.990076773772759 || Recall: 0.4845893493126976 || F1: 0.650697336924384
[[622542 677470]
 [  6226  16623]]
Positive
Precision: 0.023949240231496357 || Recall: 0.727

In [36]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(dt, X_gold_top_10, y_gold, 
                               not_top10_columns)

[[60 74]
 [ 2  3]]
Positive
Precision: 0.03896103896103896 || Recall: 0.6 || F1: 0.07317073170731707
Negative
Precision: 0.967741935483871 || Recall: 0.44776119402985076 || F1: 0.6122448979591837


## 2.5 Count normalized

In [37]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(dt, ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=True)

[[865394 434616]
 [ 10524  12322]]
Positive
Precision: 0.027569819527540732 || Recall: 0.5393504333362514 || F1: 0.05245815098002486
Negative
Precision: 0.987985176694622 || Recall: 0.6656825716725256 || F1: 0.7954252162755384
[[590452 709558]
 [  5477  17369]]
Positive
Precision: 0.023893733483554745 || Recall: 0.7602643788847063 || F1: 0.046331356290503924
Negative
Precision: 0.9908093078202269 || Recall: 0.45419035238190475 || F1: 0.6228597017098124
[[606771 693239]
 [  6125  16721]]
Positive
Precision: 0.023552031100343682 || Recall: 0.7319005515188655 || F1: 0.04563554337710118
Negative
Precision: 0.9900064611288049 || Recall: 0.46674333274359425 || F1: 0.6343970900817917
[[615860 684150]
 [  6329  16517]]
Positive
Precision: 0.023573252343838086 || Recall: 0.7229711984592488 || F1: 0.04565778361964471
Negative
Precision: 0.989827849736977 || Recall: 0.47373481742448137 || F1: 0.6407869320502196
[[641079 658933]
 [  6649  16200]]
Positive
Precision: 0.02399527204269381 || Recall: 

In [38]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(dt, X_gold_one_hot, y_gold, 
                                ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=True)

[[90 44]
 [ 3  2]]
Positive
Precision: 0.043478260869565216 || Recall: 0.4 || F1: 0.0784313725490196
Negative
Precision: 0.967741935483871 || Recall: 0.6716417910447762 || F1: 0.7929515418502204


# 3. Random Forest

## 3.1 Only count

In [39]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(rf, ['instance', 'class', 'id', 'pids'])

[[668658 631352]
 [  5672  17174]]
Positive
Precision: 0.026481590560748527 || Recall: 0.7517289678718375 || F1: 0.05116090632317105
Negative
Precision: 0.9915886880310827 || Recall: 0.5143483511665294 || F1: 0.677348379711701
[[675684 624326]
 [  5800  17046]]
Positive
Precision: 0.026577399699394422 || Recall: 0.7461262365403134 || F1: 0.051326522316468386
Negative
Precision: 0.9914891618878799 || Recall: 0.5197529249775001 || F1: 0.6819944950628162
[[674349 625661]
 [  5743  17103]]
Positive
Precision: 0.026608521945846376 || Recall: 0.7486212028363828 || F1: 0.05139045386938297
Negative
Precision: 0.9915555542485428 || Recall: 0.5187260097999247 || F1: 0.681125517776357
[[667985 632025]
 [  5557  17289]]
Positive
Precision: 0.026626562803204613 || Recall: 0.7567626718025037 || F1: 0.051443108783622946
Negative
Precision: 0.9917495865142782 || Recall: 0.513830662841055 || F1: 0.6769368124072738
[[671873 628139]
 [  5709  17140]]
Positive
Precision: 0.02656215373505104 || Recall: 0.7

In [40]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(rf, X_gold_one_hot_more, y_gold, 
                                ['instance', 'class', 'id', 'pids'], scale=False)

[[88 46]
 [ 2  3]]
Positive
Precision: 0.061224489795918366 || Recall: 0.6 || F1: 0.1111111111111111
Negative
Precision: 0.9777777777777777 || Recall: 0.6567164179104478 || F1: 0.7857142857142858


## 3.2 Count with frequencies

In [41]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(rf, ['instance', 'class', 'id', 'pids'])

[[668658 631352]
 [  5672  17174]]
Positive
Precision: 0.026481590560748527 || Recall: 0.7517289678718375 || F1: 0.05116090632317105
Negative
Precision: 0.9915886880310827 || Recall: 0.5143483511665294 || F1: 0.677348379711701
[[675684 624326]
 [  5800  17046]]
Positive
Precision: 0.026577399699394422 || Recall: 0.7461262365403134 || F1: 0.051326522316468386
Negative
Precision: 0.9914891618878799 || Recall: 0.5197529249775001 || F1: 0.6819944950628162
[[674349 625661]
 [  5743  17103]]
Positive
Precision: 0.026608521945846376 || Recall: 0.7486212028363828 || F1: 0.05139045386938297
Negative
Precision: 0.9915555542485428 || Recall: 0.5187260097999247 || F1: 0.681125517776357
[[667985 632025]
 [  5557  17289]]
Positive
Precision: 0.026626562803204613 || Recall: 0.7567626718025037 || F1: 0.051443108783622946
Negative
Precision: 0.9917495865142782 || Recall: 0.513830662841055 || F1: 0.6769368124072738
[[671873 628139]
 [  5709  17140]]
Positive
Precision: 0.02656215373505104 || Recall: 0.7

In [42]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(rf, X_gold_one_hot_more, y_gold, 
                               ['instance', 'class', 'id', 'pids'])

[[88 46]
 [ 2  3]]
Positive
Precision: 0.061224489795918366 || Recall: 0.6 || F1: 0.1111111111111111
Negative
Precision: 0.9777777777777777 || Recall: 0.6567164179104478 || F1: 0.7857142857142858


## 3.3 Scaled count pids with frequencies

In [44]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(rf, ['instance', 'class', 'id', 'pids'], scale=True)

[[944426 355584]
 [ 11367  11479]]
Positive
Precision: 0.03127256084105453 || Recall: 0.5024511949575418 || F1: 0.05888040542793317
Negative
Precision: 0.9881072575337966 || Recall: 0.7264759501849986 || F1: 0.8373302101291645
[[657886 642124]
 [  5800  17046]]
Positive
Precision: 0.025859793376518957 || Recall: 0.7461262365403134 || F1: 0.04998709707690142
Negative
Precision: 0.9912609276073324 || Recall: 0.5060622610595303 || F1: 0.6700487244461464
[[629181 670829]
 [  5780  17066]]
Positive
Precision: 0.024809018818278954 || Recall: 0.747001663310864 || F1: 0.04802311953299444
Negative
Precision: 0.9908970787182205 || Recall: 0.48398166167952555 || F1: 0.6503260255580058
[[621092 678918]
 [  5744  17102]]
Positive
Precision: 0.024571133013419153 || Recall: 0.7485774314978552 || F1: 0.04758049483492055
Negative
Precision: 0.9908365186428348 || Recall: 0.477759401850755 || F1: 0.6446721741125134
[[630525 669487]
 [  5817  17032]]
Positive
Precision: 0.024809218681493155 || Recall: 0.7

In [45]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(rf, X_gold_one_hot_more, y_gold, 
                               ['instance', 'class', 'id', 'pids'], scale=True)

[[120  14]
 [  4   1]]
Positive
Precision: 0.06666666666666667 || Recall: 0.2 || F1: 0.1
Negative
Precision: 0.967741935483871 || Recall: 0.8955223880597015 || F1: 0.930232558139535


## 3.4 Top 10 patterns

In [46]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class_frequency(rf, ['p8a', 'p5', 'p1', 'p3a', 'p8b', 'p4', 'p2', 'p8c', 'p20a', 'p10'])

[[623745 676265]
 [  5741  17105]]
Positive
Precision: 0.024669368446860983 || Recall: 0.7487087455134378 || F1: 0.04776492007997587
Negative
Precision: 0.9908798607117553 || Recall: 0.47980015538342013 || F1: 0.6465367121776878
[[611618 688392]
 [  5544  17302]]
Positive
Precision: 0.02451770880863377 || Recall: 0.7573316992033616 || F1: 0.047497735196420234
Negative
Precision: 0.991016945307715 || Recall: 0.47047176560180304 || F1: 0.6380418658315476
[[625486 674524]
 [  5841  17005]]
Positive
Precision: 0.024590436554359975 || Recall: 0.7443316116606846 || F1: 0.047608048993875766
Negative
Precision: 0.9907480592466344 || Recall: 0.481139375850955 || F1: 0.6477233129174246
[[623476 676534]
 [  5692  17154]]
Positive
Precision: 0.02472869647449574 || Recall: 0.7508535411012869 || F1: 0.047880491365378336
Negative
Precision: 0.9909531317549526 || Recall: 0.4795932338982008 || F1: 0.6463644101270074
[[617021 682991]
 [  5576  17273]]
Positive
Precision: 0.024666411524796362 || Recall: 

In [47]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(rf, X_gold_top_10, y_gold, 
                               not_top10_columns)

[[58 76]
 [ 2  3]]
Positive
Precision: 0.0379746835443038 || Recall: 0.6 || F1: 0.07142857142857144
Negative
Precision: 0.9666666666666667 || Recall: 0.43283582089552236 || F1: 0.5979381443298969


## 3.5 Count normalized

In [48]:
## fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(rf, ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=True)

[[845104 454906]
 [  9621  13225]]
Positive
Precision: 0.0282506392441432 || Recall: 0.578875952026613 || F1: 0.053872177311768174
Negative
Precision: 0.9887437479891193 || Recall: 0.6500749994230813 || F1: 0.7844157170139251
[[580402 719608]
 [  4631  18215]]
Positive
Precision: 0.02468749279976363 || Recall: 0.7972949312789985 || F1: 0.04789205291657738
Negative
Precision: 0.9920842072156614 || Recall: 0.4464596426181337 || F1: 0.6157970932228072
[[598841 701169]
 [  5217  17629]]
Positive
Precision: 0.0245256664598399 || Recall: 0.7716449269018647 || F1: 0.04754032932242424
Negative
Precision: 0.991363412122677 || Recall: 0.46064337966631025 || F1: 0.6290121991441482
[[600617 699393]
 [  5461  17385]]
Positive
Precision: 0.024254371646451202 || Recall: 0.7609647203011468 || F1: 0.0470103728380907
Negative
Precision: 0.9909896085982333 || Recall: 0.4620095230036692 || F1: 0.6302090984256761
[[632405 667607]
 [  5711  17138]]
Positive
Precision: 0.02502829520478426 || Recall: 0.750054

In [49]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(rf, X_gold_one_hot, y_gold, 
                                ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=True)

[[110  24]
 [  2   3]]
Positive
Precision: 0.1111111111111111 || Recall: 0.6 || F1: 0.18750000000000003
Negative
Precision: 0.9821428571428571 || Recall: 0.8208955223880597 || F1: 0.8943089430894309


# 4. Neural Network

## 4.1 Count columns

In [50]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(nnet, ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'])

[[666365 633645]
 [  5454  17392]]
Positive
Precision: 0.0267143034881274 || Recall: 0.7612711196708395 || F1: 0.0516172688730833
Negative
Precision: 0.9918817419572832 || Recall: 0.5125845185806263 || F1: 0.675885180712932
[[669479 630531]
 [  5467  17379]]
Positive
Precision: 0.02682316988470621 || Recall: 0.7607020922699816 || F1: 0.05181914138673377
Negative
Precision: 0.9919000927481606 || Recall: 0.5149798847701171 || F1: 0.6779685218303597
[[925550 374460]
 [ 10294  12552]]
Positive
Precision: 0.03243310284952405 || Recall: 0.5494178411975839 || F1: 0.06125048187421009
Negative
Precision: 0.9890003034693816 || Recall: 0.7119560618764471 || F1: 0.8279163129614008
[[944172 355838]
 [ 10624  12222]]
Positive
Precision: 0.03320654241156333 || Recall: 0.5349732994834983 || F1: 0.06253165722705714
Negative
Precision: 0.9888730158065179 || Recall: 0.726280567072561 || F1: 0.837475153073036
[[680349 619663]
 [  5503  17346]]
Positive
Precision: 0.02723038450006201 || Recall: 0.759157950

In [51]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(nnet, X_gold_one_hot, y_gold, 
                                ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=False)

[[89 45]
 [ 2  3]]
Positive
Precision: 0.0625 || Recall: 0.6 || F1: 0.11320754716981132
Negative
Precision: 0.978021978021978 || Recall: 0.664179104477612 || F1: 0.7911111111111112


## 4.2 Count with frequencies

In [52]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(nnet, ['instance', 'class', 'id', 'pids'])



[[676647 623363]
 [  5450  17396]]
Positive
Precision: 0.027149052920052624 || Recall: 0.7614462050249496 || F1: 0.05242877916833056
Negative
Precision: 0.9920099340709606 || Recall: 0.5204936885100884 || F1: 0.6827552700232631
[[689877 610133]
 [  5635  17211]]
Positive
Precision: 0.027434708867861972 || Recall: 0.7533485073973563 || F1: 0.052941447884464546
Negative
Precision: 0.991898054958074 || Recall: 0.5306705333035899 || F1: 0.6914251008006926
[[676357 623653]
 [  5419  17427]]
Positive
Precision: 0.027183814812503898 || Recall: 0.7628031165193032 || F1: 0.0524968144040149
Negative
Precision: 0.9920516415948933 || Recall: 0.5202706133029746 || F1: 0.6825731940784728
[[684580 615430]
 [  5498  17348]]
Positive
Precision: 0.02741561811567406 || Recall: 0.7593451807756281 || F1: 0.05292057642795261
Negative
Precision: 0.9920327846997006 || Recall: 0.5265959492619288 || F1: 0.6879896768384113
[[676925 623087]
 [  5425  17424]]
Positive
Precision: 0.027203279881219837 || Recall: 0.7

In [53]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(nnet, X_gold_one_hot_more, y_gold, 
                                ['instance', 'class', 'id', 'pids'], scale=False)

[[91 43]
 [ 2  3]]
Positive
Precision: 0.06521739130434782 || Recall: 0.6 || F1: 0.11764705882352941
Negative
Precision: 0.978494623655914 || Recall: 0.6791044776119403 || F1: 0.8017621145374448




## 4.3 Scaled count pids with frequencies

In [54]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(nnet, ['instance', 'class', 'id', 'pids'], scale=True)

[[918949 381061]
 [ 10683  12163]]
Positive
Precision: 0.030931479258641384 || Recall: 0.5323907905103739 || F1: 0.058466123488836015
Negative
Precision: 0.9885083559946302 || Recall: 0.706878408627626 || F1: 0.824301838591128
[[450633 849377]
 [  3222  19624]]
Positive
Precision: 0.022582252494531076 || Recall: 0.8589687472642913 || F1: 0.044007548379935126
Negative
Precision: 0.9929008163400205 || Recall: 0.34663810278382473 || F1: 0.5138742149481288
[[559354 740656]
 [  4329  18517]]
Positive
Precision: 0.02439101495969957 || Recall: 0.8105138755143132 || F1: 0.04735690565062998
Negative
Precision: 0.9923201515745552 || Recall: 0.43026899793078516 || F1: 0.6002640992910313
[[510949 789061]
 [  3787  19059]]
Positive
Precision: 0.02358436865812008 || Recall: 0.8342379409962357 || F1: 0.045871912930252265
Negative
Precision: 0.9926428304995182 || Recall: 0.3930346689640849 || F1: 0.5631080051974215
[[829803 470209]
 [  9111  13738]]
Positive
Precision: 0.02838740605892794 || Recall: 0

In [55]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(nnet, X_gold_one_hot_more, y_gold, 
                               ['instance', 'class', 'id', 'pids'], scale=True)

[[68 66]
 [ 3  2]]
Positive
Precision: 0.029411764705882353 || Recall: 0.4 || F1: 0.0547945205479452
Negative
Precision: 0.9577464788732394 || Recall: 0.5074626865671642 || F1: 0.6634146341463415


## 4.4 Top 10 patterns

In [56]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class_frequency(nnet, ['p8a', 'p5', 'p1', 'p3a', 'p8b', 'p4', 'p2', 'p8c', 'p20a', 'p10'])

[[604777 695233]
 [  5139  17707]]
Positive
Precision: 0.02483659213959099 || Recall: 0.7750590913070121 || F1: 0.048130842391673666
Negative
Precision: 0.991574249568793 || Recall: 0.4652094983884739 || F1: 0.6332988817367793
[[607057 692953]
 [  5162  17684]]
Positive
Precision: 0.02488471610681684 || Recall: 0.774052350520879 || F1: 0.04821924979856384
Negative
Precision: 0.9915683766756667 || Recall: 0.4669633310512996 || F1: 0.634920817538067
[[929645 370365]
 [ 11047  11799]]
Positive
Precision: 0.03087417967155462 || Recall: 0.516458023286352 || F1: 0.05826522801906126
Negative
Precision: 0.988256517542405 || Recall: 0.7151060376458642 || F1: 0.8297801314052471
[[894984 405026]
 [ 10374  12472]]
Positive
Precision: 0.029873196997350887 || Recall: 0.5459161341153812 || F1: 0.05664662173210036
Negative
Precision: 0.9885415493097758 || Recall: 0.6884439350466535 || F1: 0.8116414131337717
[[603106 696906]
 [  5063  17786]]
Positive
Precision: 0.024886244704012357 || Recall: 0.778414

In [57]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(nnet, X_gold_top_10, y_gold, 
                               not_top10_columns)

[[70 64]
 [ 3  2]]
Positive
Precision: 0.030303030303030304 || Recall: 0.4 || F1: 0.05633802816901409
Negative
Precision: 0.958904109589041 || Recall: 0.5223880597014925 || F1: 0.6763285024154589


## 4.5 Counts scaled

In [58]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(nnet, ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=True)

[[975635 324375]
 [ 11695  11151]]
Positive
Precision: 0.033234384220596914 || Recall: 0.48809419592051123 || F1: 0.06223142433002578
Negative
Precision: 0.9881549228727984 || Recall: 0.7504826885947031 || F1: 0.8530738762055489
[[613398 686612]
 [  4734  18112]]
Positive
Precision: 0.025700841747975094 || Recall: 0.7927864834106627 || F1: 0.04978764929834931
Negative
Precision: 0.9923414416338258 || Recall: 0.4718409858385705 || F1: 0.6395751722239542
[[588002 712008]
 [  4496  18350]]
Positive
Precision: 0.025124664890368832 || Recall: 0.8032040619802153 || F1: 0.04872517936707718
Negative
Precision: 0.9924117887317763 || Recall: 0.45230575149421925 || F1: 0.6213997510182255
[[617983 682027]
 [  5189  17657]]
Positive
Precision: 0.025235677820273153 || Recall: 0.7728705243806355 || F1: 0.04887547921885597
Negative
Precision: 0.9916732459096365 || Recall: 0.47536788178552475 || F1: 0.6426672046639372
[[611242 688770]
 [  4830  18019]]
Positive
Precision: 0.025494171527853434 || Recall

In [59]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(nnet, X_gold_one_hot, y_gold, 
                                ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=True)

[[108  26]
 [  3   2]]
Positive
Precision: 0.07142857142857142 || Recall: 0.4 || F1: 0.12121212121212122
Negative
Precision: 0.972972972972973 || Recall: 0.8059701492537313 || F1: 0.8816326530612245


# 5. Logistic Regression

## 5.1 Only count

In [60]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_count_cross_validation_two_class(log_reg, ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[1101494  198516]
 [  14878    7968]]
Positive
Precision: 0.03858894635903993 || Recall: 0.34877002538737634 || F1: 0.06948938211311212
Negative
Precision: 0.9866729011476462 || Recall: 0.8472965592572365 || F1: 0.9116886320126536


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[1115266  184744]
 [  15438    7408]]
Positive
Precision: 0.03855281235688413 || Recall: 0.3242580758119583 || F1: 0.06891226895133908
Negative
Precision: 0.9863465593117209 || Recall: 0.8578903239205852 || F1: 0.9176447743337967


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[1126355  173655]
 [  15617    7229]]
Positive
Precision: 0.03996483934455231 || Recall: 0.31642300621553004 || F1: 0.07096647523683307
Negative
Precision: 0.9863245333510804 || Recall: 0.8664202583057053 || F1: 0.9224924671844427


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[1118055  181955]
 [  15667    7179]]
Positive
Precision: 0.03795721551915573 || Recall: 0.31423443928915346 || F1: 0.06773280498160204
Negative
Precision: 0.9861809156036488 || Recall: 0.8600356920331382 || F1: 0.9187987831034805


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[1136525  163487]
 [  15850    6999]]
Positive
Precision: 0.041053224311673686 || Recall: 0.30631537485229116 || F1: 0.07240282411358523
Negative
Precision: 0.9862457967241567 || Recall: 0.8742419300744916 || F1: 0.9268724715960408
---------------------------
Positive
Overall Precision: 0.03922340757826116 (+/- 0.0011272472366001308) || Overall Recall: 0.3220001843112618 (+/- 0.014554363086672479) || Overall F1: 0.0699007510792943 (+/- 0.0016274181438715313)
Negative
Overall Precision: 0.9863541412276506 (+/- 0.00016987504033723181) || Overall Recall: 0.8611769527182312 (+/- 0.00897725586401674) || Overall F1: 0.9194994256460829 (+/- 0.005064765204374765)


In [61]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(log_reg, X_gold_one_hot, y_gold, 
                                ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=False)

[[118  16]
 [  3   2]]
Positive
Precision: 0.1111111111111111 || Recall: 0.4 || F1: 0.1739130434782609
Negative
Precision: 0.9752066115702479 || Recall: 0.8805970149253731 || F1: 0.9254901960784313


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5.2 Count with frequencies

In [62]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_count_cross_validation_two_class(log_reg, ['instance', 'class', 'id', 'pids'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[1120769  179241]
 [  15277    7569]]
Positive
Precision: 0.04051710293881484 || Recall: 0.331305261314891 || F1: 0.07220399130003434
Negative
Precision: 0.9865524811495309 || Recall: 0.8621233682817825 || F1: 0.9201504398913654


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[1143903  156107]
 [  15871    6975]]
Positive
Precision: 0.04276989489949841 || Recall: 0.3053050862295369 || F1: 0.07502904350070995
Negative
Precision: 0.9863154373179602 || Recall: 0.879918616010646 || F1: 0.9300841049458001


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[1108088  191922]
 [  15315    7531]]
Positive
Precision: 0.03775826886534672 || Recall: 0.32964195045084477 || F1: 0.06775559044350177
Negative
Precision: 0.9863673143119611 || Recall: 0.8523688279320928 || F1: 0.9144854797758368


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[1124171  175839]
 [  15974    6872]]
Positive
Precision: 0.03761130966389544 || Recall: 0.3007966383612011 || F1: 0.06686223285998531
Negative
Precision: 0.9859895013353565 || Recall: 0.8647402712286829 || F1: 0.9213931082246826


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[1114815  185197]
 [  15675    7174]]
Positive
Precision: 0.037292523301329206 || Recall: 0.3139743533633857 || F1: 0.06666666666666667
Negative
Precision: 0.9861343311307487 || Recall: 0.8575420842269148 || F1: 0.9173536989477894
---------------------------
Positive
Overall Precision: 0.039189819933776925 (+/- 0.0021316371445811587) || Overall Recall: 0.31620465794397185 (+/- 0.01240782850965003) || Overall F1: 0.06970350495417961 (+/- 0.0033377650491023935)
Negative
Overall Precision: 0.9862718130491114 (+/- 0.0001941130249269795) || Overall Recall: 0.8633386335360239 (+/- 0.009298106211475511) || Overall F1: 0.9206933663570949 (+/- 0.005266148749900299)


In [63]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(log_reg, X_gold_one_hot_more, y_gold, 
                                ['instance', 'class', 'id', 'pids'], scale=False)

[[125   9]
 [  3   2]]
Positive
Precision: 0.18181818181818182 || Recall: 0.4 || F1: 0.25000000000000006
Negative
Precision: 0.9765625 || Recall: 0.9328358208955224 || F1: 0.9541984732824427


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5.3 Scaled count with pids with frequencies

In [64]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_count_cross_validation_two_class(log_reg, ['instance', 'class', 'id', 'pids'], scale=True)

[[1007293  292717]
 [  15503    7343]]
Positive
Precision: 0.024471772312204227 || Recall: 0.32141293880766875 || F1: 0.04548072813760042
Negative
Precision: 0.9848425296931157 || Recall: 0.7748348089630079 || F1: 0.8673070415695499
[[999807 300203]
 [ 15505   7341]]
Positive
Precision: 0.023869755222017012 || Recall: 0.32132539613061367 || F1: 0.04443839099246345
Negative
Precision: 0.9847288321225397 || Recall: 0.7690763917200637 || F1: 0.863644020140611
[[999161 300849]
 [ 15698   7148]]
Positive
Precision: 0.023208018259918115 || Recall: 0.3128775277948 || F1: 0.04321082809671052
Negative
Precision: 0.9845318413690965 || Recall: 0.7685794724655964 || F1: 0.8632548969293726
[[994539 305471]
 [ 15747   7099]]
Positive
Precision: 0.02271171257638289 || Recall: 0.3107327322069509 || F1: 0.04232952512700647
Negative
Precision: 0.9844133245437431 || Recall: 0.7650241151991138 || F1: 0.8609624048173914
[[997487 302525]
 [ 15632   7217]]
Positive
Precision: 0.02330003680482466 || Recall: 0

In [65]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(log_reg, X_gold_one_hot_more, y_gold, 
                               ['instance', 'class', 'id', 'pids'], scale=True)

[[46 88]
 [ 1  4]]
Positive
Precision: 0.043478260869565216 || Recall: 0.8 || F1: 0.08247422680412371
Negative
Precision: 0.9787234042553191 || Recall: 0.34328358208955223 || F1: 0.5082872928176795


## 5.4 Top 10 patterns

In [66]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_count_cross_validation_two_class_frequency(log_reg, ['p8a', 'p5', 'p1', 'p3a', 'p8b', 'p4', 'p2', 'p8c', 'p20a', 'p10'])

[[1170567  129443]
 [  16848    5998]]
Positive
Precision: 0.04428496540929261 || Recall: 0.262540488488138 || F1: 0.07578638801670384
Negative
Precision: 0.9858111949065828 || Recall: 0.900429227467481 || F1: 0.9411877745057641
[[1160583  139427]
 [  16617    6229]]
Positive
Precision: 0.042765145273795796 || Recall: 0.2726516676879979 || F1: 0.07393384054788668
Negative
Precision: 0.9858843017329256 || Recall: 0.8927492865439497 || F1: 0.9370081664453155
[[1165467  134543]
 [  16757    6089]]
Positive
Precision: 0.04329740030718471 || Recall: 0.2665236802941434 || F1: 0.07449320397851698
Negative
Precision: 0.9858258671791471 || Recall: 0.8965061807216868 || F1: 0.9390468424814099
[[1177400  122610]
 [  17130    5716]]
Positive
Precision: 0.044542805043405076 || Recall: 0.2501969710233739 || F1: 0.07562246976953405
Negative
Precision: 0.9856596318217207 || Recall: 0.9056853408819933 || F1: 0.9439816559365655
[[1169230  130782]
 [  16900    5949]]
Positive
Precision: 0.043508787326941

In [67]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(log_reg, X_gold_top_10, y_gold, 
                               not_top10_columns)

[[100  34]
 [  2   3]]
Positive
Precision: 0.08108108108108109 || Recall: 0.6 || F1: 0.14285714285714288
Negative
Precision: 0.9803921568627451 || Recall: 0.746268656716418 || F1: 0.847457627118644


## 5.5 Count scaled

In [68]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_count_cross_validation_two_class(log_reg, ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=True)

[[1213452   86558]
 [  19326    3520]]
Positive
Precision: 0.0390772441661671 || Recall: 0.15407511161691326 || F1: 0.062342814636392616
Negative
Precision: 0.9843232114784657 || Recall: 0.933417435250498 || F1: 0.9581946850664169
[[1211399   88611]
 [  19225    3621]]
Positive
Precision: 0.03925969294821754 || Recall: 0.158496016808194 || F1: 0.06293122925320217
Negative
Precision: 0.9843778440856021 || Recall: 0.9318382166291028 || F1: 0.9573877534246359
[[1192148  107862]
 [  18573    4273]]
Positive
Precision: 0.03810585455031881 || Recall: 0.18703492952814496 || F1: 0.06331261436794809
Negative
Precision: 0.984659554100408 || Recall: 0.9170298690010077 || F1: 0.9496421560095446
[[1211714   88296]
 [  19323    3523]]
Positive
Precision: 0.038368965029024496 || Recall: 0.15420642563249584 || F1: 0.06144856756638904
Negative
Precision: 0.9843034774746819 || Recall: 0.9320805224575196 || F1: 0.9574804418882779
[[1205465   94547]
 [  19107    3742]]
Positive
Precision: 0.03807140168279

In [69]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(log_reg, X_gold_one_hot, y_gold, 
                                ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=True)

[[37 97]
 [ 1  4]]
Positive
Precision: 0.039603960396039604 || Recall: 0.8 || F1: 0.07547169811320754
Negative
Precision: 0.9736842105263158 || Recall: 0.27611940298507465 || F1: 0.4302325581395348


# 6. XGBoost

## 6.1 Only count

In [70]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(xg, ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'])

[[659395 640615]
 [  5138  17708]]
Positive
Precision: 0.026898650054760352 || Recall: 0.7751028626455397 || F1: 0.0519929709073666
Negative
Precision: 0.9922682545486831 || Recall: 0.5072230213613741 || F1: 0.6712960724198961
[[670980 629030]
 [  5373  17473]]
Positive
Precision: 0.02702694341712258 || Recall: 0.7648165980915697 || F1: 0.05220893734061006
Negative
Precision: 0.9920559234600866 || Recall: 0.516134491273144 || F1: 0.6790048184468136
[[668859 631151]
 [  5331  17515]]
Positive
Precision: 0.027001569374685893 || Recall: 0.766654994309726 || F1: 0.05216585854013033
Negative
Precision: 0.9920927335024251 || Recall: 0.514502965361805 || F1: 0.6776000405227435
[[657611 642399]
 [  5027  17819]]
Positive
Precision: 0.02698957011169038 || Recall: 0.7799614812220957 || F1: 0.05217373481840648
Negative
Precision: 0.9924136557215252 || Recall: 0.5058507242251983 || F1: 0.6701262783749302
[[670674 629338]
 [  5295  17554]]
Positive
Precision: 0.027135905220655072 || Recall: 0.76826

In [71]:
# get performance on goldstandard
util_ml.get_count_cross_validation_two_class(xg, ['instance', 'class', 'id', 'pids'])

[[668385 631625]
 [  5275  17571]]
Positive
Precision: 0.027065785987590806 || Recall: 0.7691061892672678 || F1: 0.05229137464622747
Negative
Precision: 0.9921696404714544 || Recall: 0.5141383527819017 || F1: 0.6773016765720713
[[682999 617011]
 [  5572  17274]]
Positive
Precision: 0.027233814452493753 || Recall: 0.7561061017245907 || F1: 0.05257399209594434
Negative
Precision: 0.9919078787808374 || Recall: 0.5253798047707325 || F1: 0.686920975308524
[[681189 618821]
 [  5471  17375]]
Positive
Precision: 0.02731076586460776 || Recall: 0.7605270069158715 || F1: 0.05272805071603934
Negative
Precision: 0.9920324469169604 || Recall: 0.5239875077884016 || F1: 0.6857595876516986
[[678843 621167]
 [  5359  17487]]
Positive
Precision: 0.027381023214447887 || Recall: 0.7654293968309551 || F1: 0.05287074829931973
Negative
Precision: 0.9921675177798369 || Recall: 0.5221829062853363 || F1: 0.6842444254948564
[[678507 621505]
 [  5402  17447]]
Positive
Precision: 0.02730565050269817 || Recall: 0.76

## 6.2 Count with frequencies

In [72]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(xg, ['instance', 'class', 'id', 'pids'])

[[668385 631625]
 [  5275  17571]]
Positive
Precision: 0.027065785987590806 || Recall: 0.7691061892672678 || F1: 0.05229137464622747
Negative
Precision: 0.9921696404714544 || Recall: 0.5141383527819017 || F1: 0.6773016765720713
[[682999 617011]
 [  5572  17274]]
Positive
Precision: 0.027233814452493753 || Recall: 0.7561061017245907 || F1: 0.05257399209594434
Negative
Precision: 0.9919078787808374 || Recall: 0.5253798047707325 || F1: 0.686920975308524
[[681189 618821]
 [  5471  17375]]
Positive
Precision: 0.02731076586460776 || Recall: 0.7605270069158715 || F1: 0.05272805071603934
Negative
Precision: 0.9920324469169604 || Recall: 0.5239875077884016 || F1: 0.6857595876516986
[[678843 621167]
 [  5359  17487]]
Positive
Precision: 0.027381023214447887 || Recall: 0.7654293968309551 || F1: 0.05287074829931973
Negative
Precision: 0.9921675177798369 || Recall: 0.5221829062853363 || F1: 0.6842444254948564
[[678507 621505]
 [  5402  17447]]
Positive
Precision: 0.02730565050269817 || Recall: 0.76

In [73]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(xg, X_gold_one_hot_more, y_gold, 
                                ['instance', 'class', 'id', 'pids'], scale=False)

[[91 43]
 [ 2  3]]
Positive
Precision: 0.06521739130434782 || Recall: 0.6 || F1: 0.11764705882352941
Negative
Precision: 0.978494623655914 || Recall: 0.6791044776119403 || F1: 0.8017621145374448


## 6.3 Scaled count pids with frequencies

In [74]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(xg, ['instance', 'class', 'id', 'pids'], scale=True)

[[916192 383818]
 [ 10574  12272]]
Positive
Precision: 0.030982857431391855 || Recall: 0.5371618664098748 || F1: 0.05858651440792866
Negative
Precision: 0.9885904316731516 || Recall: 0.7047576557103407 || F1: 0.8228865409003869
[[612581 687429]
 [  4807  18039]]
Positive
Precision: 0.02557025974246883 || Recall: 0.7895911756981528 || F1: 0.0495363263647273
Negative
Precision: 0.9922139724128101 || Recall: 0.47121252913439127 || F1: 0.6389711473569911
[[612809 687201]
 [  5159  17687]]
Positive
Precision: 0.025091929498019545 || Recall: 0.7741836645364615 || F1: 0.048608420109545525
Negative
Precision: 0.9916516712839499 || Recall: 0.47138791240067385 || F1: 0.6390156717126056
[[602206 697804]
 [  5173  17673]]
Positive
Precision: 0.024701003666085703 || Recall: 0.7735708657970761 || F1: 0.04787335624110315
Negative
Precision: 0.9914830772878219 || Recall: 0.46323182129368234 || F1: 0.6314453947254597
[[595085 704927]
 [  4878  17971]]
Positive
Precision: 0.024859662082340802 || Recall:

In [75]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(xg, X_gold_one_hot_more, y_gold, 
                               ['instance', 'class', 'id', 'pids'], scale=True)

[[100  34]
 [  2   3]]
Positive
Precision: 0.08108108108108109 || Recall: 0.6 || F1: 0.14285714285714288
Negative
Precision: 0.9803921568627451 || Recall: 0.746268656716418 || F1: 0.847457627118644


## 6.4 Top 10 patterns

In [76]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class_frequency(xg, ['p8a', 'p5', 'p1', 'p3a', 'p8b', 'p4', 'p2', 'p8c', 'p20a', 'p10'])

[[606813 693197]
 [  5170  17676]]
Positive
Precision: 0.02486520095713299 || Recall: 0.7737021798126587 || F1: 0.04818193341047459
Negative
Precision: 0.9915520529165026 || Recall: 0.46677564018738316 || F1: 0.6347439556525574
[[607497 692513]
 [  5215  17631]]
Positive
Precision: 0.024827358958183127 || Recall: 0.7717324695789197 || F1: 0.048107068309253875
Negative
Precision: 0.9914886602514722 || Recall: 0.4673017899862309 || F1: 0.6352172453707334
[[608554 691456]
 [  5253  17593]]
Positive
Precision: 0.024812107484814168 || Recall: 0.7700691587148735 || F1: 0.048075202044009044
Negative
Precision: 0.9914419353314641 || Recall: 0.46811486065491803 || F1: 0.6359584014563565
[[616709 683301]
 [  5311  17535]]
Positive
Precision: 0.025020118829512182 || Recall: 0.7675304210802766 || F1: 0.04846051166119926
Negative
Precision: 0.9914616893347481 || Recall: 0.4743878893239283 || F1: 0.6417267160242035
[[618959 681053]
 [  5349  17500]]
Positive
Precision: 0.025051785619702442 || Recall

## 6.5 Count scaled

In [78]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_count_cross_validation_two_class(xg, ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=True)

[[858153 441857]
 [  9471  13375]]
Positive
Precision: 0.02938062350625615 || Recall: 0.5854416528057428 || F1: 0.05595321265567543
Negative
Precision: 0.9890839810793616 || Recall: 0.6601126145183499 || F1: 0.791787728002052
[[566627 733383]
 [  4188  18658]]
Positive
Precision: 0.02480981754984103 || Recall: 0.8166856342466953 || F1: 0.04815669897675403
Negative
Precision: 0.9926631220272768 || Recall: 0.4358635702802286 || F1: 0.6057509387569655
[[589380 710630]
 [  4722  18124]]
Positive
Precision: 0.024869846340466056 || Recall: 0.793311739472993 || F1: 0.048227780734433214
Negative
Precision: 0.9920518698809295 || Recall: 0.45336574334043583 || F1: 0.6223285634640401
[[593526 706484]
 [  5005  17841]]
Positive
Precision: 0.024631208366410105 || Recall: 0.7809244506697015 || F1: 0.04775613614554098
Negative
Precision: 0.9916378600273001 || Recall: 0.45655494957731096 || F1: 0.6252443323583742
[[589741 710271]
 [  4723  18126]]
Positive
Precision: 0.024884781238802466 || Recall: 0.

In [79]:
# get performance on goldstandard
util_ml.get_count_performance_on_goldstandard(xg, X_gold_one_hot, y_gold, 
                                ['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], scale=True)

[[105  29]
 [  2   3]]
Positive
Precision: 0.09375 || Recall: 0.6 || F1: 0.16216216216216214
Negative
Precision: 0.9813084112149533 || Recall: 0.7835820895522388 || F1: 0.8713692946058091


## 6.6 Grid Search

In [27]:
def get_cross_validated_two_class_confusion_matrix(model, X_train, y_train, folds=5):
    """Get cross validated confusion matrix for two class problem"""
    # k-fold
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=88)
    kf.get_n_splits(X_train.values)
    
    overall_precision = []
    overall_recall = []
    overall_f1 = []
    
    for train_index, test_index in kf.split(X_train.values, y_train):
        # print("TRAIN:", train_index, "TEST:", test_index) # for debugging
        X_tr, X_te = X_train.values[train_index], X_train.values[test_index]
        y_tr, y_te = y_train[train_index], y_train[test_index]
        
        # fit model
        model.fit(X_tr, y_tr)
        
        # predicted score
        y_predict = model.predict(X_te)
        f1 = f1_score(y_te, y_predict)
        precision = precision_score(y_te, y_predict)
        recall = recall_score(y_te, y_predict)
        
        print(confusion_matrix(y_te, y_predict))
        print('Precision: {} || Recall: {} || F1: {}'.format(precision, recall, f1))
        
        overall_precision.append(precision)
        overall_recall.append(recall)
        overall_f1.append(f1)
    
    print('---------------------------')
    print('Overall Precision: {} (+/- {}) || Overall Recall: {} (+/- {}) || Overall F1: {} (+/- {})'.format(
                                                                            np.mean(overall_precision), np.std(overall_precision, axis=0),
                                                                           np.mean(overall_recall), np.std(overall_recall, axis=0),
                                                                            np.mean(overall_f1), np.std(overall_f1, axis=0)))
    
    return np.mean(overall_f1)

In [38]:
def get_cross_validation_two_class(model, X_columns_drop, scale=False):
    """Get cross validated confusion matrix for two class problem"""
    
    # positive scores
    overall_precision = []
    overall_recall = []
    overall_f1 = []
    
    # negative scores
    overall_negative_precision = []
    overall_negative_recall = []
    overall_negative_f1 = []
    
    # loop through all folds
    
    for x in range(5):
        
        # load data
        train = pd.read_csv('/path/to/9_FINAL/data/machine_learning/two_class/one-hot-ecoding/train/cross_validation/training/training_'+str(x)+'.csv',sep=";")
        # debug
        train_id = train['id']
        del train['Unnamed: 0']
        del train['Unnamed: 0.1']
        y_train = train['label']
        del train['label']
        X_train = train.drop(X_columns_drop, axis=1)
        
        validation = pd.read_csv('/path/to/9_FINAL/data/machine_learning/two_class/one-hot-ecoding/train/cross_validation/validation/val_fold_'+str(x)+'.csv',sep=";")
        # print(validation[validation.id.isin(train_id)])
        del validation['Unnamed: 0']
        y_validation = validation['label']
        del validation['label']
        X_validation = validation.drop(X_columns_drop, axis=1)
        
        # scale if necesary
        if scale:
            min_max_scaler = preprocessing.MinMaxScaler()
            X_train = pd.DataFrame(min_max_scaler.fit_transform(X_train))
            X_validation = pd.DataFrame(min_max_scaler.fit_transform(X_validation))
        
        # fit model
        model.fit(X_train, y_train)
        
        # predicted score
        y_predict = model.predict(X_validation)
        f1 = f1_score(y_validation, y_predict)
        precision = precision_score(y_validation, y_predict)
        recall = recall_score(y_validation, y_predict)
        conf_matrix = confusion_matrix(y_validation, y_predict)
        print(conf_matrix)
        print('Positive')
        print('Precision: {} || Recall: {} || F1: {}'.format(precision, recall, f1))
        
        overall_precision.append(precision)
        overall_recall.append(recall)
        overall_f1.append(f1)
        
        # for negative class (noise), flip labels
        
        precision = conf_matrix[0,0] / (conf_matrix[0,0] + conf_matrix[1,0])
        recall = conf_matrix[0,0] / (conf_matrix[0,0] + conf_matrix[0,1])
        f1 = (2*precision*recall)/(precision+recall)
        
        print('Negative')
        print('Precision: {} || Recall: {} || F1: {}'.format(precision, recall, f1))
        
        overall_negative_precision.append(precision)
        overall_negative_recall.append(recall)
        overall_negative_f1.append(f1)
    
    print('---------------------------')
    print('Positive')
    print('Overall Precision: {} (+/- {}) || Overall Recall: {} (+/- {}) || Overall F1: {} (+/- {})'.format(
                                                                            np.mean(overall_precision), np.std(overall_precision, axis=0),
                                                                           np.mean(overall_recall), np.std(overall_recall, axis=0),
                                                                            np.mean(overall_f1), np.std(overall_f1, axis=0)))
    
    print('Negative')
    print('Overall Precision: {} (+/- {}) || Overall Recall: {} (+/- {}) || Overall F1: {} (+/- {})'.format(
                                                                            np.mean(overall_negative_precision), np.std(overall_negative_precision, axis=0),
                                                                           np.mean(overall_negative_recall), np.std(overall_negative_recall, axis=0),
                                                                            np.mean(overall_negative_f1), np.std(overall_negative_f1, axis=0)))
    
    return np.mean(overall_f1)

In [40]:
# initialize parameters
learning_rate = [0.05, 0.15, 0.25]
max_depth = [3,8,12]
min_child_weight = [1,3,5]
gamma = [0, 0.2, 0.4]
colsample_bytree = [0.3, 0.5, 0.7]

parameters = []

# loop through all parameters
for rate in learning_rate:
    for depth in max_depth:
        for weight in min_child_weight:
            for gam in gamma:
                for bytree in colsample_bytree:
                    parameter_combination = [rate, depth, weight, gam, bytree]
                    
                    parameters.append(parameter_combination)

In [41]:
# loop through parameter combination and train model
max_f1 = 0
# best parameter combination
best_parameters = None

for parameter in parameters:
    print(parameter)
    
    xg = xgb.XGBClassifier(random_state=88, learning_rate=parameter[0], 
                          max_depth=parameter[1], min_child_weight=parameter[2], gamma=parameter[3],
                          colsample_bytree=parameter[4])
    f1 = get_cross_validation_two_class(xg, ['instance', 'class', 'id', 'pids'])
    
    if f1 > max_f1:
        best_parameters=parameter
        max_f1=f1

[0.05, 3, 1, 0, 0.3]
[[656053 643957]
 [  5560  17286]]
Positive
Precision: 0.026141675601858923 || Recall: 0.7566313577869211 || F1: 0.050537283891423486
Negative
Precision: 0.9915962957197032 || Recall: 0.5046522719056007 || F1: 0.668887956554343
[[658852 641158]
 [  5627  17219]]
Positive
Precision: 0.026153708285678267 || Recall: 0.7536986781055764 || F1: 0.05055319623676828
Negative
Precision: 0.991531711310666 || Recall: 0.5068053322666749 || F1: 0.6707617095336244
[[664974 635036]
 [  5778  17068]]
Positive
Precision: 0.02617373915817109 || Recall: 0.7470892059879191 || F1: 0.050575598192458694
Negative
Precision: 0.9913857878917991 || Recall: 0.5115145268113322 || F1: 0.6748394783337613
[[656101 643909]
 [  5548  17298]]
Positive
Precision: 0.02616124753670182 || Recall: 0.7571566138492515 || F1: 0.05057502854310997
Negative
Precision: 0.9916148894655625 || Recall: 0.5046891946985024 || F1: 0.6689246194165245
[[664817 635195]
 [  5721  17128]]
Positive
Precision: 0.026256930998

[[669013 630997]
 [  5835  17011]]
Positive
Precision: 0.026251219120751595 || Recall: 0.7445942396918498 || F1: 0.050714462461280696
Negative
Precision: 0.9913536085162881 || Recall: 0.5146214259890308 || F1: 0.6775302325534291
[[669007 631003]
 [  5846  17000]]
Positive
Precision: 0.026234446445463987 || Recall: 0.744112754968047 || F1: 0.05068204618326926
Negative
Precision: 0.991337372731543 || Recall: 0.5146168106399182 || F1: 0.6775224407971592
[[625258 674752]
 [  4894  17952]]
Positive
Precision: 0.025915831293019817 || Recall: 0.7857830692462575 || F1: 0.050176787086856266
Negative
Precision: 0.9922336198250581 || Recall: 0.48096399258467243 || F1: 0.6478813695430746
[[666525 633487]
 [  5791  17058]]
Positive
Precision: 0.026221091546318857 || Recall: 0.7465534596700074 || F1: 0.050662762067972086
Negative
Precision: 0.9913864908763141 || Recall: 0.5127068057833313 || F1: 0.675876426233365
---------------------------
Positive
Overall Precision: 0.02616586187038213 (+/- 0.0001

[[667620 632390]
 [  5842  17004]]
Positive
Precision: 0.026184411928659643 || Recall: 0.744287840322157 || F1: 0.05058907533023921
Negative
Precision: 0.9913254199940011 || Recall: 0.5135498957700325 || F1: 0.676594347424235
[[626403 673607]
 [  4939  17907]]
Positive
Precision: 0.025895354251685432 || Recall: 0.7838133590125186 || F1: 0.05013438602385352
Negative
Precision: 0.9921769817309795 || Recall: 0.48184475504034585 || F1: 0.6486678761820734
[[669436 630576]
 [  5828  17021]]
Positive
Precision: 0.02628332126306947 || Recall: 0.7449341327848046 || F1: 0.05077515564266175
Negative
Precision: 0.9913693014880106 || Recall: 0.5149460158829303 || F1: 0.6778151508953685
---------------------------
Positive
Overall Precision: 0.026109002153378462 (+/- 0.00016819443574344128) || Overall Recall: 0.7596845416930897 (+/- 0.018391558139023916) || Overall F1: 0.0504814951306529 (+/- 0.0002753741797627005)
Negative
Overall Precision: 0.9916674505258094 (+/- 0.00038656010507291373) || Overal

[[656093 643917]
 [  5548  17298]]
Positive
Precision: 0.026160931013361765 || Recall: 0.7571566138492515 || F1: 0.05057443707505617
Negative
Precision: 0.9916147880799406 || Recall: 0.5046830408996854 || F1: 0.6689191910283736
[[664988 635024]
 [  5724  17125]]
Positive
Precision: 0.02625933643998534 || Recall: 0.7494857542999693 || F1: 0.050740891084121734
Negative
Precision: 0.9914657856129009 || Recall: 0.5115245090045323 || F1: 0.6748666987360991
---------------------------
Positive
Overall Precision: 0.026177954381714486 (+/- 4.188465639888813e-05) || Overall Recall: 0.7529173732183937 (+/- 0.003980338821860138) || Overall F1: 0.05059666014532599 (+/- 7.332319559983809e-05)
Negative
Overall Precision: 0.9915213424288897 (+/- 8.532471688311079e-05) || Overall Recall: 0.5077687059254902 (+/- 0.0031275925383970407) || Overall F1: 0.671596692888855 (+/- 0.00271545488646886)
[0.05, 3, 3, 0.4, 0.5]
[[629855 670155]
 [  5014  17832]]
Positive
Precision: 0.02591909440149305 || Recall: 0.

[[667294 632718]
 [  5799  17050]]
Positive
Precision: 0.02624013494047106 || Recall: 0.7462033349380717 || F1: 0.05069749946849396
Negative
Precision: 0.9913845486433525 || Recall: 0.5132983387845651 || F1: 0.6763897511789793
---------------------------
Positive
Overall Precision: 0.026155885411405178 (+/- 0.00012331138964465885) || Overall Recall: 0.7544757190755073 (+/- 0.01572436760552759) || Overall F1: 0.05055797557321725 (+/- 0.00019719709635416564)
Negative
Overall Precision: 0.991557682040356 (+/- 0.0003392036024248635) || Overall Recall: 0.5062657938636353 (+/- 0.01272606705140263) || Overall F1: 0.6701963670849305 (+/- 0.011221264742940138)
[0.05, 3, 5, 0.2, 0.3]
[[656106 643904]
 [  5560  17286]]
Positive
Precision: 0.026143771079417413 || Recall: 0.7566313577869211 || F1: 0.0505411995859867
Negative
Precision: 0.9915969688634447 || Recall: 0.5046930408227629 || F1: 0.668923920158069
[[658347 641663]
 [  5615  17231]]
Positive
Precision: 0.026151399162839546 || Recall: 0.75

[[665419 634591]
 [  5756  17090]]
Positive
Precision: 0.026224487134042577 || Recall: 0.7480521754355248 || F1: 0.050672545354003624
Negative
Precision: 0.991423995232242 || Recall: 0.5118568318705241 || F1: 0.6751461684215334
[[663720 636290]
 [  5752  17094]]
Positive
Precision: 0.02616225680457434 || Recall: 0.748227260789635 || F1: 0.05055676323144492
Negative
Precision: 0.9914081544859232 || Recall: 0.5105499188467781 || F1: 0.6740046367522019
[[669064 630946]
 [  5846  17000]]
Positive
Precision: 0.02623675429742602 || Recall: 0.744112754968047 || F1: 0.05068635284857303
Negative
Precision: 0.9913381043398379 || Recall: 0.5146606564564888 || F1: 0.6775606100500273
[[625257 674753]
 [  4894  17952]]
Positive
Precision: 0.025915793880511906 || Recall: 0.7857830692462575 || F1: 0.050176716963570726
Negative
Precision: 0.9922336075004246 || Recall: 0.48096322335982034 || F1: 0.6478806690219107
[[667294 632718]
 [  5799  17050]]
Positive
Precision: 0.02624013494047106 || Recall: 0.74

[[654748 645262]
 [  5215  17631]]
Positive
Precision: 0.026597052616334764 || Recall: 0.7717324695789197 || F1: 0.05142189666914089
Negative
Precision: 0.9920980418599225 || Recall: 0.5036484334735887 || F1: 0.6681194077673519
[[646828 653182]
 [  5082  17764]]
Positive
Precision: 0.026476050233550836 || Recall: 0.7775540576030815 || F1: 0.05120843134541765
Negative
Precision: 0.9922044453989047 || Recall: 0.4975561726448258 || F1: 0.6627607688839706
[[652415 647595]
 [  5132  17714]]
Positive
Precision: 0.026625222265142964 || Recall: 0.7753654906767049 || F1: 0.05148258749845602
Negative
Precision: 0.99219523471326 || Recall: 0.5018538318936008 || F1: 0.6665604117785586
[[649785 650227]
 [  5123  17726]]
Positive
Precision: 0.026537795323922492 || Recall: 0.7757888747866427 || F1: 0.05132005987243812
Negative
Precision: 0.992177527225198 || Recall: 0.4998300015692163 || F1: 0.6647688907986005
---------------------------
Positive
Overall Precision: 0.026547329911050466 (+/- 5.6394292

[[653950 646060]
 [  5232  17614]]
Positive
Precision: 0.026540138682545948 || Recall: 0.7709883568239517 || F1: 0.05131387286604906
Negative
Precision: 0.9920628900667797 || Recall: 0.5030345920415997 || F1: 0.6675711211560685
[[661081 638929]
 [  5269  17577]]
Positive
Precision: 0.026773555763389825 || Recall: 0.769368817298433 || F1: 0.05174637007030229
Negative
Precision: 0.9920927440534254 || Recall: 0.5085199344620426 || F1: 0.6723906100612298
[[654601 645411]
 [  5228  17621]]
Positive
Precision: 0.026576394502829427 || Recall: 0.771193487679986 || F1: 0.05138209106244377
Negative
Precision: 0.9920767350328645 || Recall: 0.5035345827576976 || F1: 0.6680143950453123
---------------------------
Positive
Overall Precision: 0.026590146073915216 (+/- 0.0001057707716884693) || Overall Recall: 0.771554639057489 (+/- 0.00261890879106036) || Overall F1: 0.05140851319120787 (+/- 0.0001930868018170048)
Negative
Overall Precision: 0.9920912484607387 (+/- 4.3302375269150945e-05) || Overall 

[[648854 651156]
 [  5041  17805]]
Positive
Precision: 0.026615901375416504 || Recall: 0.7793486824827103 || F1: 0.05147389373047685
Negative
Precision: 0.992290811215868 || Recall: 0.4991146221952139 || F1: 0.66416125656058
[[661109 638903]
 [  5316  17533]]
Positive
Precision: 0.02670938217891767 || Recall: 0.7673421156286927 || F1: 0.051621925995716085
Negative
Precision: 0.9920231083767865 || Recall: 0.5085406903936271 || F1: 0.672392759086612
---------------------------
Positive
Overall Precision: 0.026626917626295675 (+/- 7.416718222565969e-05) || Overall Recall: 0.7723163614956938 (+/- 0.0052283493383891225) || Overall F1: 0.05147887173376198 (+/- 0.00012981796183895582)
Negative
Overall Precision: 0.9921215355929733 (+/- 0.00011179533296761595) || Overall Recall: 0.5038210449025191 (+/- 0.004542530497825398) || Overall F1: 0.6682641617349417 (+/- 0.003975951148050112)
[0.05, 8, 3, 0.4, 0.3]
[[647116 652894]
 [  5101  17745]]
Positive
Precision: 0.026459839049026376 || Recall: 0

[[649808 650204]
 [  5127  17722]]
Positive
Precision: 0.02653287939083072 || Recall: 0.7756138124206748 || F1: 0.05131048460062972
Negative
Precision: 0.9921717422339621 || Recall: 0.4998476937135965 || F1: 0.6647832396479292
---------------------------
Positive
Overall Precision: 0.026543593447603337 (+/- 6.16327813151001e-05) || Overall Recall: 0.7754676806317319 (+/- 0.0020243884148890058) || Overall F1: 0.05133016488278119 (+/- 0.00011232466214546596)
Negative
Overall Precision: 0.992173041054173 (+/- 3.9786360757854316e-05) || Overall Recall: 0.5001963062291234 (+/- 0.00228215940117649) || Overall F1: 0.6650886522314513 (+/- 0.0020105920912897772)
[0.05, 8, 5, 0, 0.7]
[[647123 652887]
 [  5086  17760]]
Positive
Precision: 0.0264818898764924 || Recall: 0.7773789722489713 || F1: 0.051218974092024
Negative
Precision: 0.9922018862051888 || Recall: 0.4977830939762002 || F1: 0.662961481268239
[[656489 643521]
 [  5245  17601]]
Positive
Precision: 0.026622922849337944 || Recall: 0.77041

[[647415 652595]
 [  5077  17769]]
Positive
Precision: 0.026506494978847314 || Recall: 0.7777729142957192 || F1: 0.05126585017527157
Negative
Precision: 0.9922190616896452 || Recall: 0.4980077076330182 || F1: 0.6631644935574971
[[654411 645599]
 [  5215  17631]]
Positive
Precision: 0.026583538139107098 || Recall: 0.7717324695789197 || F1: 0.05139663827331083
Negative
Precision: 0.9920940047845294 || Recall: 0.5033892046984254 || F1: 0.6678903633123703
[[654597 645413]
 [  5240  17606]]
Positive
Precision: 0.026554291807625422 || Recall: 0.7706381861157314 || F1: 0.05133954932822057
Negative
Precision: 0.9920586447865155 || Recall: 0.503532280520919 || F1: 0.6680082679923482
[[653156 646854]
 [  5147  17699]]
Positive
Precision: 0.026632939735431186 || Recall: 0.7747089205987919 || F1: 0.05149556516666448
Negative
Precision: 0.9921814119030294 || Recall: 0.5024238275090191 || F1: 0.6670598622385696
[[649291 650721]
 [  5133  17716]]
Positive
Precision: 0.026503619638051156 || Recall: 0.

[[660268 639742]
 [  5328  17518]]
Positive
Precision: 0.02665307488665064 || Recall: 0.7667863083253086 || F1: 0.05151549905455914
Negative
Precision: 0.9919951442015877 || Recall: 0.5078945546572718 || F1: 0.6718213110867588
[[655466 644544]
 [  5224  17622]]
Positive
Precision: 0.02661266208171365 || Recall: 0.7713385275321719 || F1: 0.051450193573251264
Negative
Precision: 0.9920931147739485 || Recall: 0.5042007369174083 || F1: 0.668604069975009
[[663046 636964]
 [  5298  17548]]
Positive
Precision: 0.026810814774977387 || Recall: 0.7680994484811345 || F1: 0.05181307373648794
Negative
Precision: 0.9920729444717091 || Recall: 0.5100314612964516 || F1: 0.6737060508424806
[[655622 644390]
 [  5232  17617]]
Positive
Precision: 0.026611501086846515 || Recall: 0.7710184253140181 || F1: 0.051447311551625456
Negative
Precision: 0.9920829714278797 || Recall: 0.504319960123445 || F1: 0.668706581683807
---------------------------
Positive
Overall Precision: 0.02665038662819268 (+/- 8.50446784

[[662756 637254]
 [  5372  17474]]
Positive
Precision: 0.026688945638494152 || Recall: 0.7648603694300972 || F1: 0.051578130211607884
Negative
Precision: 0.9919596245030893 || Recall: 0.5098083860893378 || F1: 0.6734852942222547
[[652980 647030]
 [  5116  17730]]
Positive
Precision: 0.026671279860400746 || Recall: 0.7760658320931454 || F1: 0.0515702306262598
Negative
Precision: 0.9922260582042741 || Recall: 0.5022884439350467 || F1: 0.6669506145224007
[[665983 634029]
 [  5382  17467]]
Positive
Precision: 0.026810602060488475 || Recall: 0.7644535865902228 || F1: 0.05180434347403777
Negative
Precision: 0.991983496309757 || Recall: 0.5122898865548934 || F1: 0.6756526022166232
---------------------------
Positive
Overall Precision: 0.02669881587835301 (+/- 6.521058702947667e-05) || Overall Recall: 0.7681406516610367 (+/- 0.005049526322675372) || Overall F1: 0.051603879424497276 (+/- 0.00011512670547960535)
Negative
Overall Precision: 0.9920415617878711 (+/- 0.00011154124007377001) || Over

[[655575 644435]
 [  5138  17708]]
Positive
Precision: 0.02674346780076207 || Recall: 0.7751028626455397 || F1: 0.051703020048497125
Negative
Precision: 0.9922235524350209 || Recall: 0.504284582426289 || F1: 0.6687074104807257
[[668421 631591]
 [  5441  17408]]
Positive
Precision: 0.02682284564382996 || Recall: 0.7618714166921966 || F1: 0.051821245281670855
Negative
Precision: 0.991925646497354 || Recall: 0.5141652538591951 || F1: 0.6772681538943214
---------------------------
Positive
Overall Precision: 0.026723696281474384 (+/- 6.053854031010201e-05) || Overall Recall: 0.7686747298060923 (+/- 0.004874308333908129) || Overall F1: 0.05165157048491015 (+/- 0.00010723008515516935)
Negative
Overall Precision: 0.9920616957008981 (+/- 0.0001091347487707344) || Overall Recall: 0.5080027033129733 (+/- 0.0038549450203833506) || Overall F1: 0.6719222914880469 (+/- 0.0033451996086302175)
[0.05, 12, 3, 0.2, 0.7]
[[653965 646045]
 [  5180  17666]]
Positive
Precision: 0.02661700649831026 || Recall:

[[654509 645503]
 [  5221  17628]]
Positive
Precision: 0.02658298284954255 || Recall: 0.7714998468204298 || F1: 0.051395084404793144
Negative
Precision: 0.992086156457945 || Recall: 0.5034638141801768 || F1: 0.6679542511208107
---------------------------
Positive
Overall Precision: 0.026656928730425595 (+/- 0.00010049386576144107) || Overall Recall: 0.7693222927467349 (+/- 0.0018324074210427756) || Overall F1: 0.051528347425142174 (+/- 0.000184490359969977)
Negative
Overall Precision: 0.9920568849680673 (+/- 2.7488072761410875e-05) || Overall Recall: 0.5063167195748296 (+/- 0.002930518296349023) || Overall F1: 0.6704485611608116 (+/- 0.0025626285522791733)
[0.05, 12, 5, 0, 0.5]
[[653735 646275]
 [  5192  17654]]
Positive
Precision: 0.026590192626018746 || Recall: 0.772739210365053 || F1: 0.05141130646863966
Negative
Precision: 0.9921205232142559 || Recall: 0.5028692086983946 || F1: 0.6674385138470507
[[662741 637269]
 [  5342  17504]]
Positive
Precision: 0.02673292881655169 || Recall: 

[[656094 643916]
 [  5264  17582]]
Positive
Precision: 0.026579067510408196 || Recall: 0.7695876739910706 || F1: 0.05138351472358931
Negative
Precision: 0.9920406194527018 || Recall: 0.5046838101245376 || F1: 0.6690167271006766
[[660472 639538]
 [  5316  17530]]
Positive
Precision: 0.02667912605696823 || Recall: 0.7673115643876389 || F1: 0.051565345028930135
Negative
Precision: 0.9920154763978924 || Recall: 0.5080514765271036 || F1: 0.6719632434258249
[[656268 643742]
 [  5238  17608]]
Positive
Precision: 0.02662432902396613 || Recall: 0.7707257287927864 || F1: 0.05147063122263211
Negative
Precision: 0.9920817044743359 || Recall: 0.5048176552488057 || F1: 0.669143662350957
[[665250 634760]
 [  5330  17516]]
Positive
Precision: 0.026853663173257946 || Recall: 0.7666987656482536 || F1: 0.051889880643794746
Negative
Precision: 0.9920516567747323 || Recall: 0.5117268328705163 || F1: 0.6751784998401494
[[654664 645348]
 [  5222  17627]]
Positive
Precision: 0.026587729552396393 || Recall: 0.

[[648713 651297]
 [  5166  17680]]
Positive
Precision: 0.026428412337045966 || Recall: 0.7738772651667688 || F1: 0.05111133917201365
Negative
Precision: 0.9920994557096955 || Recall: 0.49900616149106547 || F1: 0.6640223676984721
[[653871 646139]
 [  5269  17577]]
Positive
Precision: 0.02648271248546065 || Recall: 0.769368817298433 || F1: 0.05120295035262657
Negative
Precision: 0.9920062505689231 || Recall: 0.5029738232782824 || F1: 0.6675047852384962
[[654034 645976]
 [  5207  17639]]
Positive
Precision: 0.02658017073152355 || Recall: 0.77208264028714 || F1: 0.051391120544357224
Negative
Precision: 0.9921015228118397 || Recall: 0.5030992069291774 || F1: 0.6676367652740767
[[650607 649405]
 [  5199  17650]]
Positive
Precision: 0.02645958729040334 || Recall: 0.7724626898332531 || F1: 0.05116653911268814
Negative
Precision: 0.9920723506646782 || Recall: 0.5004623034248915 || F1: 0.6653042358747083
---------------------------
Positive
Overall Precision: 0.026454944836603815 (+/- 8.29050302

[[649917 650093]
 [  5221  17625]]
Positive
Precision: 0.026395873707163803 || Recall: 0.7714698415477546 || F1: 0.05104523259248962
Negative
Precision: 0.992030686664489 || Recall: 0.49993230821301377 || F1: 0.664826396774055
[[649851 650159]
 [  5131  17715]]
Positive
Precision: 0.026524464195342237 || Recall: 0.7754092620152324 || F1: 0.051294301598332176
Negative
Precision: 0.9921661969336562 || Recall: 0.4998815393727741 || F1: 0.6648119276191412
[[642809 657203]
 [  5059  17790]]
Positive
Precision: 0.02635582887526241 || Recall: 0.7785898726421288 || F1: 0.050985753222076054
Negative
Precision: 0.992191310575611 || Recall: 0.494463897256333 || F1: 0.660008830112738
---------------------------
Positive
Overall Precision: 0.02641396817914628 (+/- 5.8952163675413935e-05) || Overall Recall: 0.7767107084862301 (+/- 0.0030589796330708218) || Overall F1: 0.05109043910274166 (+/- 0.00010910188169340433)
Negative
Overall Precision: 0.9921645284285215 (+/- 7.35729980729681e-05) || Overall

[[650869 649141]
 [  5165  17681]]
Positive
Precision: 0.026515321929990312 || Recall: 0.7739210365052963 || F1: 0.05127394630459873
Negative
Precision: 0.9921269324455745 || Recall: 0.5006646102722286 || F1: 0.6654952547079718
[[644227 655785]
 [  5093  17756]]
Positive
Precision: 0.026362166519929744 || Recall: 0.7771018425314018 || F1: 0.050994414049598646
Negative
Precision: 0.9921564097825417 || Recall: 0.4955546564185561 || F1: 0.6609720663283627
---------------------------
Positive
Overall Precision: 0.02637203752042257 (+/- 7.566605423751433e-05) || Overall Recall: 0.7749073684187376 (+/- 0.004451951499400417) || Overall F1: 0.05100806958925239 (+/- 0.00014117607899176822)
Negative
Overall Precision: 0.9921075226469028 (+/- 0.00010980017573225874) || Overall Recall: 0.49721986890726794 (+/- 0.0033433308548007222) || Overall F1: 0.6624339100251794 (+/- 0.0029473592490613485)
[0.15, 3, 3, 0.2, 0.5]
[[640984 659026]
 [  5009  17837]]
Positive
Precision: 0.0263524524165156 || Recal

[[656596 643416]
 [  5292  17557]]
Positive
Precision: 0.0265623557997074 || Recall: 0.7683924898244999 || F1: 0.051349620222806515
Negative
Precision: 0.9920046896151615 || Recall: 0.5050691839767633 || F1: 0.669347061521994
---------------------------
Positive
Overall Precision: 0.026465278150675342 (+/- 0.00011034244989125697) || Overall Recall: 0.7722638083036901 (+/- 0.0032374643031537445) || Overall F1: 0.05117664685461133 (+/- 0.00020160651593943677)
Negative
Overall Precision: 0.9920709540595638 (+/- 6.175479826387916e-05) || Overall Recall: 0.5007381466083541 (+/- 0.0038711959044824464) || Overall F1: 0.6655386359184012 (+/- 0.0034168592828480665)
[0.15, 3, 5, 0, 0.3]
[[640507 659503]
 [  5000  17846]]
Positive
Precision: 0.026346831544742814 || Recall: 0.7811433073623392 || F1: 0.050974371425103006
Negative
Precision: 0.9922541506133938 || Recall: 0.49269390235459726 || F1: 0.6584440022883378
[[642854 657156]
 [  5090  17756]]
Positive
Precision: 0.026308615049073066 || Recal

[[643797 656213]
 [  5108  17738]]
Positive
Precision: 0.026319420848103202 || Recall: 0.7764160028013657 || F1: 0.05091296317291837
Negative
Precision: 0.9921282776369422 || Recall: 0.4952246521180606 || F1: 0.6606722201840511
[[650188 649822]
 [  5215  17631]]
Positive
Precision: 0.026415343102810237 || Recall: 0.7717324695789197 || F1: 0.05108221220079994
Negative
Precision: 0.9920430635807282 || Recall: 0.5001407681479373 || F1: 0.6650134779711497
[[653772 646238]
 [  5303  17543]]
Positive
Precision: 0.0264288974827541 || Recall: 0.7678805917884969 || F1: 0.0510990683442393
Negative
Precision: 0.9919538747487008 || Recall: 0.502897670017923 || F1: 0.6674258646255777
[[654232 645778]
 [  5215  17631]]
Positive
Precision: 0.026576365409573883 || Recall: 0.7717324695789197 || F1: 0.0513832321804577
Negative
Precision: 0.992091858784709 || Recall: 0.5032515134498965 || F1: 0.6677686726475752
[[649204 650808]
 [  5196  17653]]
Positive
Precision: 0.026408421732905883 || Recall: 0.77259

[[662878 637132]
 [  5308  17538]]
Positive
Precision: 0.02678906930209113 || Recall: 0.7676617350958592 || F1: 0.05177147107964977
Negative
Precision: 0.992056104138668 || Recall: 0.509902231521296 || F1: 0.6735894189399837
[[665559 634451]
 [  5286  17560]]
Positive
Precision: 0.02693206096216168 || Recall: 0.768624704543465 || F1: 0.05204065453866522
Negative
Precision: 0.9921203854839792 || Recall: 0.5119645233498203 || F1: 0.6754012852289996
[[668344 631668]
 [  5423  17426]]
Positive
Precision: 0.02684665087028997 || Recall: 0.7626591973390521 || F1: 0.05186749471309323
Negative
Precision: 0.9919512234941753 || Recall: 0.5141060236367049 || F1: 0.6772227285830885
---------------------------
Positive
Overall Precision: 0.026811221186929475 (+/- 8.718390655526087e-05) || Overall Recall: 0.7667750330246693 (+/- 0.0036890120009956855) || Overall F1: 0.0518107312294663 (+/- 0.00015885825490853203)
Negative
Overall Precision: 0.9920411005394852 (+/- 7.939890652003223e-05) || Overall Re

[[665335 634675]
 [  5300  17546]]
Positive
Precision: 0.02690192434772876 || Recall: 0.7680119058040795 || F1: 0.05198298835522992
Negative
Precision: 0.9920970423553797 || Recall: 0.5117922169829463 || F1: 0.6752459220204553
[[667304 632708]
 [  5434  17415]]
Positive
Precision: 0.02678723872251866 || Recall: 0.7621777758326403 || F1: 0.051755496513970864
Negative
Precision: 0.9919225612348344 || Recall: 0.5133060310212522 || F1: 0.6765216068939298
---------------------------
Positive
Overall Precision: 0.026781163421724662 (+/- 9.788536806367767e-05) || Overall Recall: 0.7657858134174254 (+/- 0.004302701281985276) || Overall F1: 0.05175231961759249 (+/- 0.00017756721401918896)
Negative
Overall Precision: 0.9920087789708892 (+/- 9.067816189193053e-05) || Overall Recall: 0.5109263733952721 (+/- 0.004085702628766467) || Overall F1: 0.6744615877287757 (+/- 0.0035567140455216896)
[0.15, 8, 1, 0.4, 0.5]
[[656015 643995]
 [  5223  17623]]
Positive
Precision: 0.0266362160642546 || Recall: 0

[[661582 638428]
 [  5321  17525]]
Positive
Precision: 0.026716853189176663 || Recall: 0.7670927076950013 || F1: 0.051635314724977496
Negative
Precision: 0.9920213284390683 || Recall: 0.508905316112953 || F1: 0.6727109943347774
[[665693 634317]
 [  5413  17433]]
Positive
Precision: 0.026747986191024167 || Recall: 0.7630657445504684 || F1: 0.05168426732444308
Negative
Precision: 0.9919342100949775 || Recall: 0.512067599480004 || F1: 0.6754478173785816
[[664919 635091]
 [  5379  17467]]
Positive
Precision: 0.026766969372837356 || Recall: 0.7645539700604045 || F1: 0.051723116830815326
Negative
Precision: 0.9919752110255439 || Recall: 0.5114722194444659 || F1: 0.6749391465699778
[[665077 634933]
 [  5272  17574]]
Positive
Precision: 0.02693304439645858 || Recall: 0.7692375032828503 || F1: 0.052043894082057826
Negative
Precision: 0.9921354398977249 || Recall: 0.5115937569711002 || F1: 0.6750820535750084
[[666792 633220]
 [  5427  17422]]
Positive
Precision: 0.02677662985174643 || Recall: 0.

[[668732 631278]
 [  5451  17395]]
Positive
Precision: 0.02681628493863626 || Recall: 0.7614024336864221 || F1: 0.051807916082791394
Negative
Precision: 0.9919146581862788 || Recall: 0.5144052738055861 || F1: 0.6774737829584038
[[664731 635279]
 [  5336  17510]]
Positive
Precision: 0.02682336865357719 || Recall: 0.7664361376170883 || F1: 0.05183272032976385
Negative
Precision: 0.992036617233799 || Recall: 0.5113276051722679 || F1: 0.6748274306029663
[[665930 634080]
 [  5289  17557]]
Positive
Precision: 0.026942914536774307 || Recall: 0.7684933905278823 || F1: 0.05206061531573071
Negative
Precision: 0.9921203064871524 || Recall: 0.5122499057699557 || F1: 0.6756495566978773
[[667223 632789]
 [  5391  17458]]
Positive
Precision: 0.02684825919996555 || Recall: 0.764059696266795 || F1: 0.051873729750288214
Negative
Precision: 0.9919850017989515 || Recall: 0.513243723904087 || F1: 0.676482009260752
---------------------------
Positive
Overall Precision: 0.026826046290144418 (+/- 7.783610083

[[663709 636301]
 [  5241  17605]]
Positive
Precision: 0.026922829886864473 || Recall: 0.7705944147772039 || F1: 0.05202792160200487
Negative
Precision: 0.992165333731968 || Recall: 0.5105414573734048 || F1: 0.674172151795872
[[668969 631043]
 [  5433  17416]]
Positive
Precision: 0.026857519133823417 || Recall: 0.7622215414241323 || F1: 0.05188676434661884
Negative
Precision: 0.9919439740688788 || Recall: 0.5145867884296452 || F1: 0.6776380232311967
---------------------------
Positive
Overall Precision: 0.026811288958070695 (+/- 9.389615442767827e-05) || Overall Recall: 0.7663898567396983 (+/- 0.004095862255050453) || Overall F1: 0.05180996840112684 (+/- 0.00017209998926351894)
Negative
Overall Precision: 0.9920319500597744 (+/- 9.182428787969292e-05) || Overall Recall: 0.511108525446177 (+/- 0.003662518342293596) || Overall F1: 0.6746275882342838 (+/- 0.0031833570075598086)
[0.15, 8, 5, 0.2, 0.7]
[[663225 636785]
 [  5344  17502]]
Positive
Precision: 0.02674972909441881 || Recall: 0.

[[670454 629558]
 [  5512  17337]]
Positive
Precision: 0.026800330811028065 || Recall: 0.7587640596962668 || F1: 0.05177202035404572
Negative
Precision: 0.991845743720838 || Recall: 0.5157290855776716 || F1: 0.678604721307626
---------------------------
Positive
Overall Precision: 0.02673173414680962 (+/- 0.0001086710893502713) || Overall Recall: 0.7636498617510366 (+/- 0.004562595454777037) || Overall F1: 0.0516551397970917 (+/- 0.00019851843535507449)
Negative
Overall Precision: 0.9919433417176361 (+/- 9.910739806268738e-05) || Overall Recall: 0.511362676985843 (+/- 0.004247439531621655) || Overall F1: 0.674825724013195 (+/- 0.003691283587898536)
[0.15, 12, 1, 0, 0.5]
[[658989 641021]
 [  5276  17570]]
Positive
Precision: 0.026678165963397615 || Recall: 0.7690624179287403 || F1: 0.051567496334950995
Negative
Precision: 0.9920573867357154 || Recall: 0.5069107160714148 || F1: 0.6709742780415165
[[668994 631016]
 [  5523  17323]]
Positive
Precision: 0.026719046671571507 || Recall: 0.758

[[655986 644024]
 [  5248  17598]]
Positive
Precision: 0.02659826910229708 || Recall: 0.7702880154075111 || F1: 0.051420957590420584
Negative
Precision: 0.9920633240275001 || Recall: 0.504600733840509 || F1: 0.6689488916218481
[[666388 633622]
 [  5484  17362]]
Positive
Precision: 0.026670394356850553 || Recall: 0.7599579795150135 || F1: 0.05153228559132125
Negative
Precision: 0.9918377309963803 || Recall: 0.512602210752225 || F1: 0.675890342322715
[[668567 631443]
 [  5446  17400]]
Positive
Precision: 0.0268169649668718 || Recall: 0.7616212903790598 || F1: 0.051809691687670925
Negative
Precision: 0.9919200371506187 || Recall: 0.5142783517049869 || F1: 0.6773649547143068
[[665706 634304]
 [  5331  17515]]
Positive
Precision: 0.026870956507864915 || Recall: 0.766654994309726 || F1: 0.0519220650248642
Negative
Precision: 0.99205557964762 || Recall: 0.5120775994030815 || F1: 0.6754846535876619
[[671515 628497]
 [  5552  17297]]
Positive
Precision: 0.026784082849949056 || Recall: 0.7570134

[[667909 632101]
 [  5453  17393]]
Positive
Precision: 0.026779308199921786 || Recall: 0.7613148910093671 || F1: 0.051738703632090904
Negative
Precision: 0.9919018299220924 || Recall: 0.5137722017522942 || F1: 0.6769215332942801
[[655804 644206]
 [  5155  17691]]
Positive
Precision: 0.02672772349776476 || Recall: 0.7743587498905716 || F1: 0.05167194115164376
Negative
Precision: 0.9922007265201018 || Recall: 0.5044607349174237 || F1: 0.6688570803516017
[[673844 626168]
 [  5533  17316]]
Positive
Precision: 0.026909759994032487 || Recall: 0.7578449822749355 || F1: 0.05197401299350325
Negative
Precision: 0.9918557737456523 || Recall: 0.5183367538145802 || F1: 0.6808606090061126
---------------------------
Positive
Overall Precision: 0.026764637414966708 (+/- 8.043313718489508e-05) || Overall Recall: 0.7640000565968064 (+/- 0.006053500743522003) || Overall F1: 0.05171732512747281 (+/- 0.00014212527862613554)
Negative
Overall Precision: 0.991961928370453 (+/- 0.00013264410593190868) || Over

[[665065 634945]
 [  5316  17530]]
Positive
Precision: 0.02686692976742404 || Recall: 0.7673115643876389 || F1: 0.05191605177389716
Negative
Precision: 0.992070180986633 || Recall: 0.5115845262728749 || F1: 0.6750589096275815
[[670792 629220]
 [  5489  17360]]
Positive
Precision: 0.02684895913885366 || Recall: 0.759770668300582 || F1: 0.05186509697070189
Negative
Precision: 0.991883551364004 || Recall: 0.5159890831776938 || F1: 0.6788386135051836
---------------------------
Positive
Overall Precision: 0.026757428307512144 (+/- 9.45177396152429e-05) || Overall Recall: 0.7647528730455668 (+/- 0.0035417868506310095) || Overall F1: 0.051705671538660035 (+/- 0.00017231990289848426)
Negative
Overall Precision: 0.9919769775137443 (+/- 7.398083444378908e-05) || Overall Recall: 0.5111428324431094 (+/- 0.003537087282109249) || Overall F1: 0.6746453295182067 (+/- 0.0030727454031156867)
[0.15, 12, 3, 0.4, 0.7]
[[660063 639947]
 [  5322  17524]]
Positive
Precision: 0.02665364708101194 || Recall: 0.

[[671116 628896]
 [  5523  17326]]
Positive
Precision: 0.026811219673734413 || Recall: 0.7582826381898551 || F1: 0.05179121498316322
Negative
Precision: 0.9918375973007764 || Recall: 0.516238311646354 || F1: 0.6790434932620882
---------------------------
Positive
Overall Precision: 0.02678174704923878 (+/- 8.645496545421651e-05) || Overall Recall: 0.7624768025219769 (+/- 0.002535551750272017) || Overall F1: 0.05174590477228176 (+/- 0.00016247087417653033)
Negative
Overall Precision: 0.9919297031572724 (+/- 6.948897814406223e-05) || Overall Recall: 0.5130588176280761 (+/- 0.0020680334752259653) || Overall F1: 0.6763060248218039 (+/- 0.0017908059443832154)
[0.15, 12, 5, 0.2, 0.5]
[[656941 643069]
 [  5269  17577]]
Positive
Precision: 0.02660577677000996 || Recall: 0.769368817298433 || F1: 0.05143293557203303
Negative
Precision: 0.9920433095241691 || Recall: 0.5053353435742802 || F1: 0.6695895465340278
[[666018 633992]
 [  5433  17413]]
Positive
Precision: 0.026731449712544424 || Recall: 

[[656605 643405]
 [  5299  17547]]
Positive
Precision: 0.026548070056524528 || Recall: 0.768055677142607 || F1: 0.05132217409234891
Negative
Precision: 0.9919943073315768 || Recall: 0.505076884023969 || F1: 0.6693514598499222
[[661275 638735]
 [  5384  17462]]
Positive
Precision: 0.026610911052625964 || Recall: 0.7643351133677668 || F1: 0.05143120538758224
Negative
Precision: 0.991923907124932 || Recall: 0.5086691640833532 || F1: 0.6724822529871575
[[656358 643652]
 [  5287  17559]]
Positive
Precision: 0.026555819549281545 || Recall: 0.7685809332049374 || F1: 0.05133782711089865
Negative
Precision: 0.9920093101285432 || Recall: 0.5048868854854963 || F1: 0.6691880070654626
[[657087 642923]
 [  5214  17632]]
Positive
Precision: 0.026692705376539425 || Recall: 0.7717762409174472 || F1: 0.05160074392633315
Negative
Precision: 0.992127446583955 || Recall: 0.5054476504026892 || F1: 0.66970729919977
[[657514 642498]
 [  5280  17569]]
Positive
Precision: 0.02661699494142261 || Recall: 0.768917

[[654800 645210]
 [  5235  17611]]
Positive
Precision: 0.02656976770500633 || Recall: 0.7708570428083691 || F1: 0.05136895898446331
Negative
Precision: 0.9920686024225988 || Recall: 0.5036884331658987 || F1: 0.6681479251751873
[[658222 641788]
 [  5287  17559]]
Positive
Precision: 0.026630893899570334 || Recall: 0.7685809332049374 || F1: 0.05147810077206891
Negative
Precision: 0.9920317584237742 || Recall: 0.5063207206098415 || F1: 0.6704513681813112
[[656869 643141]
 [  5215  17631]]
Positive
Precision: 0.026682426010787384 || Recall: 0.7717324695789197 || F1: 0.051581438756732566
Negative
Precision: 0.992123355948792 || Recall: 0.5052799593849278 || F1: 0.6695591546582376
[[660766 639246]
 [  5316  17533]]
Positive
Precision: 0.026695433319274823 || Recall: 0.7673421156286927 || F1: 0.05159587303642581
Negative
Precision: 0.992019000663582 || Recall: 0.5082768466752614 || F1: 0.6721611479410444
---------------------------
Positive
Overall Precision: 0.026617064764735677 (+/- 7.071019

[[653073 646937]
 [  5218  17628]]
Positive
Precision: 0.026525622023428858 || Recall: 0.7716011555633371 || F1: 0.05128809402235344
Negative
Precision: 0.9920734143410741 || Recall: 0.5023599818462935 || F1: 0.6669791824647998
[[666244 633766]
 [  5391  17455]]
Positive
Precision: 0.026803496815981057 || Recall: 0.764028713998074 || F1: 0.05179010395109092
Negative
Precision: 0.991973318841335 || Recall: 0.5124914423735202 || F1: 0.6758255162567298
[[661338 638674]
 [  5314  17535]]
Positive
Precision: 0.026721669468111532 || Recall: 0.7674296468116767 || F1: 0.051645073027635345
Negative
Precision: 0.992028824634142 || Recall: 0.5087168426137605 || F1: 0.6725480305736007
---------------------------
Positive
Overall Precision: 0.026648993359371105 (+/- 0.00010702197038965022) || Overall Recall: 0.7694537136571791 (+/- 0.0043925965036874185) || Overall F1: 0.0515137237705167 (+/- 0.0001912005351159779)
Negative
Overall Precision: 0.9920582195872122 (+/- 7.97165481174343e-05) || Overall

[[658154 641856]
 [  5235  17611]]
Positive
Precision: 0.02670489956282877 || Recall: 0.7708570428083691 || F1: 0.051621469911902605
Negative
Precision: 0.9921087024355243 || Recall: 0.5062684133198976 || F1: 0.6704230775303441
[[662955 637057]
 [  5351  17498]]
Positive
Precision: 0.026732665704180703 || Recall: 0.7658103199264739 || F1: 0.0516619329085745
Negative
Precision: 0.9919931887488666 || Recall: 0.5099606772860559 || F1: 0.673625908008767
---------------------------
Positive
Overall Precision: 0.026621943481547606 (+/- 7.976701154520727e-05) || Overall Recall: 0.7678167081243126 (+/- 0.0018073266219298478) || Overall F1: 0.05145964241727115 (+/- 0.00014953075410072705)
Negative
Overall Precision: 0.9920102258326192 (+/- 5.2133108994119716e-05) || Overall Recall: 0.5066244075166569 (+/- 0.0017734828206045553) || Overall F1: 0.6707107924539472 (+/- 0.0015507743392015002)
[0.25, 3, 3, 0.4, 0.5]
[[649157 650853]
 [  5110  17736]]
Positive
Precision: 0.026527507930881305 || Recal

[[650652 649360]
 [  5144  17705]]
Positive
Precision: 0.026541641369281853 || Recall: 0.7748697973653114 || F1: 0.051325237638314335
Negative
Precision: 0.992156097322948 || Recall: 0.5004969184899831 || F1: 0.6653536543464388
---------------------------
Positive
Overall Precision: 0.02654352983584654 (+/- 8.526131172553727e-05) || Overall Recall: 0.7740232460002442 (+/- 0.0020237852074241466) || Overall F1: 0.051326856674209755 (+/- 0.0001554647258199939)
Negative
Overall Precision: 0.9921375158448285 (+/- 2.9528064722486484e-05) || Overall Recall: 0.5011218373714299 (+/- 0.002853926185367722) || Overall F1: 0.665896591195241 (+/- 0.0025102745361182516)
[0.25, 3, 5, 0.2, 0.3]
[[652507 647503]
 [  5230  17616]]
Positive
Precision: 0.02648548605587872 || Recall: 0.7710758995010067 || F1: 0.05121190758250782
Negative
Precision: 0.9920484935468128 || Recall: 0.5019246005799956 || F1: 0.6665897074545383
[[659300 640710]
 [  5350  17496]]
Positive
Precision: 0.026581343834604972 || Recall:

[[648327 651683]
 [  5122  17724]]
Positive
Precision: 0.026477165610756984 || Recall: 0.7758032040619802 || F1: 0.05120671199691442
Negative
Precision: 0.9921615917998191 || Recall: 0.4987092406981485 || F1: 0.6637733374491095
[[650763 649247]
 [  5179  17667]]
Positive
Precision: 0.02649067196070258 || Recall: 0.7733082377659108 || F1: 0.051226513569937376
Negative
Precision: 0.9921044848477457 || Recall: 0.5005830724379043 || F1: 0.6654181697710373
[[650622 649388]
 [  5145  17701]]
Positive
Precision: 0.026534690273711605 || Recall: 0.774796463275847 || F1: 0.051312080123489895
Negative
Precision: 0.9921542255099753 || Recall: 0.5004746117337558 || F1: 0.6653335221755855
[[658306 641704]
 [  5242  17604]]
Positive
Precision: 0.026700722575791588 || Recall: 0.7705506434386764 || F1: 0.05161297888746529
Negative
Precision: 0.9921000440058594 || Recall: 0.5063853354974193 || F1: 0.6705236107107608
[[654058 645954]
 [  5195  17654]]
Positive
Precision: 0.026603054815493483 || Recall: 0

[[666305 633705]
 [  5425  17421]]
Positive
Precision: 0.026755190239677113 || Recall: 0.7625404884881379 || F1: 0.051696509647285054
Negative
Precision: 0.991923838446995 || Recall: 0.5125383650894993 || F1: 0.6758548287299542
[[669865 630145]
 [  5483  17363]]
Positive
Precision: 0.026815112709032165 || Recall: 0.760001750853541 || F1: 0.051802480480462564
Negative
Precision: 0.9918812227177692 || Recall: 0.5152768055630341 || F1: 0.6782213654436309
[[665478 634532]
 [  5327  17519]]
Positive
Precision: 0.026867530300543976 || Recall: 0.7668300796638361 || F1: 0.051916070155890455
Negative
Precision: 0.9920587950298522 || Recall: 0.511902216136799 || F1: 0.6753327937934307
[[666911 633101]
 [  5427  17422]]
Positive
Precision: 0.026781528093549344 || Recall: 0.7624841349730842 || F1: 0.05174554332523478
Negative
Precision: 0.9919281670826281 || Recall: 0.5130037261194512 || F1: 0.6762602986285394
---------------------------
Positive
Overall Precision: 0.026762225689145484 (+/- 9.3128

[[668443 631567]
 [  5449  17397]]
Positive
Precision: 0.026807342163818023 || Recall: 0.7614899763634772 || F1: 0.051791429124305975
Negative
Precision: 0.9919141346091065 || Recall: 0.5141829678233244 || F1: 0.6772808376505013
[[665474 634536]
 [  5308  17538]]
Positive
Precision: 0.026895720424369014 || Recall: 0.7676617350958592 || F1: 0.05197060392342796
Negative
Precision: 0.9920868478879874 || Recall: 0.5118991392373905 || F1: 0.6753366159391757
[[669107 630905]
 [  5472  17377]]
Positive
Precision: 0.02680469301939588 || Recall: 0.7605146833559455 || F1: 0.05178422692440075
Negative
Precision: 0.9918882740197961 || Recall: 0.5146929412959265 || F1: 0.6777170563423008
---------------------------
Positive
Overall Precision: 0.026781851244962646 (+/- 7.564838945906292e-05) || Overall Recall: 0.7637811297903347 (+/- 0.002558520456051408) || Overall F1: 0.05174910085132507 (+/- 0.00014226937382851452)
Negative
Overall Precision: 0.9919607907467325 (+/- 6.835298590112251e-05) || Over

[[662362 637648]
 [  5236  17610]]
Positive
Precision: 0.02687491034065971 || Recall: 0.7708132714698416 || F1: 0.0519389356204948
Negative
Precision: 0.9921569567314462 || Recall: 0.5095053114976039 || F1: 0.6732662196941669
[[670766 629246]
 [  5477  17372]]
Positive
Precision: 0.026865939395439038 || Recall: 0.7602958553984858 || F1: 0.05189800244074764
Negative
Precision: 0.9919008403783847 || Recall: 0.5159690833623074 || F1: 0.6788253540155496
---------------------------
Positive
Overall Precision: 0.02677416034121593 (+/- 9.56752912563302e-05) || Overall Recall: 0.7650680128900798 (+/- 0.0045306183279265186) || Overall F1: 0.05173762368404452 (+/- 0.0001769653909125019)
Negative
Overall Precision: 0.9919896167751799 (+/- 0.0001067833033323026) || Overall Recall: 0.511257600797199 (+/- 0.003680159214576808) || Overall F1: 0.6747475621267246 (+/- 0.0031907686715693085)
[0.25, 8, 3, 0.4, 0.3]
[[660951 639059]
 [  5358  17488]]
Positive
Precision: 0.02663632611222045 || Recall: 0.76

[[671678 628334]
 [  5486  17363]]
Positive
Precision: 0.026890321621441636 || Recall: 0.759901965075058 || F1: 0.05194257388422038
Negative
Precision: 0.9918985651924792 || Recall: 0.516670615348166 || F1: 0.679431674266732
---------------------------
Positive
Overall Precision: 0.026801055143180673 (+/- 9.227214569691967e-05) || Overall Recall: 0.7641313165902546 (+/- 0.003395662401886394) || Overall F1: 0.05178569949387235 (+/- 0.00016809681994038543)
Negative
Overall Precision: 0.9919748381728389 (+/- 7.027173524668064e-05) || Overall Recall: 0.5123575921198713 (+/- 0.003424556802316301) || Overall F1: 0.6757025855568105 (+/- 0.0029689863842760804)
[0.25, 8, 5, 0, 0.7]
[[658129 641881]
 [  5280  17566]]
Positive
Precision: 0.026637470486635013 || Recall: 0.7688873325746302 || F1: 0.051491074948739037
Negative
Precision: 0.9920411088785349 || Recall: 0.5062491826985946 || F1: 0.6703907826093157
[[665352 634658]
 [  5431  17415]]
Positive
Precision: 0.02670713248363297 || Recall: 0.7

[[659973 640037]
 [  5324  17522]]
Positive
Precision: 0.02664703851669584 || Recall: 0.7669613936794187 || F1: 0.05150461857276181
Negative
Precision: 0.9919975589849346 || Recall: 0.5076676333258975 || F1: 0.6716233138130583
[[666392 633618]
 [  5430  17416]]
Positive
Precision: 0.02675129102320309 || Recall: 0.7623216317955003 || F1: 0.051688727963435634
Negative
Precision: 0.9919175019573637 || Recall: 0.5126052876516335 || F1: 0.6759115381026375
[[668848 631162]
 [  5428  17418]]
Positive
Precision: 0.02685559221684295 || Recall: 0.7624091744725554 || F1: 0.05188360295847942
Negative
Precision: 0.9919498840237528 || Recall: 0.5144945038884317 || F1: 0.6775593809610159
[[665262 634748]
 [  5311  17535]]
Positive
Precision: 0.02688250345325572 || Recall: 0.7675304210802766 || F1: 0.051945628168838846
Negative
Precision: 0.9920799077803609 || Recall: 0.5117360635687418 || F1: 0.6751930773786236
[[662134 637878]
 [  5399  17450]]
Positive
Precision: 0.026627887103862494 || Recall: 0.7

[[663782 636228]
 [  5474  17372]]
Positive
Precision: 0.026578947368421053 || Recall: 0.7603956929002889 || F1: 0.05136256256966558
Negative
Precision: 0.9918207681365576 || Recall: 0.5105976107876093 || F1: 0.6741415329366374
[[667199 632811]
 [  5493  17353]]
Positive
Precision: 0.02669018893694514 || Recall: 0.7595640374682657 || F1: 0.051568327365120875
Negative
Precision: 0.9918343015822991 || Recall: 0.5132260521072914 || F1: 0.6764316151147005
[[664721 635289]
 [  5399  17447]]
Positive
Precision: 0.02672902980684381 || Recall: 0.7636785432898539 || F1: 0.051650280794929414
Negative
Precision: 0.9919432340476333 || Recall: 0.5113199129237468 || F1: 0.6747991249308423
[[669066 630946]
 [  5549  17300]]
Positive
Precision: 0.026687399536595675 || Recall: 0.757144732811064 || F1: 0.051557529112867775
Negative
Precision: 0.9917745677164012 || Recall: 0.5146614031255096 || F1: 0.6776631738551129
---------------------------
Positive
Overall Precision: 0.026632498760186786 (+/- 9.2445

[[666903 633107]
 [  5502  17344]]
Positive
Precision: 0.02666457580970742 || Recall: 0.759170095421518 || F1: 0.051519611701819554
Negative
Precision: 0.991817431458719 || Recall: 0.512998361551065 || F1: 0.6762299009082775
[[653616 646394]
 [  5196  17650]]
Positive
Precision: 0.026579564004794862 || Recall: 0.7725641250109428 || F1: 0.05139105242469682
Negative
Precision: 0.9921130762645489 || Recall: 0.5027776709409928 || F1: 0.667356196734568
[[663420 636592]
 [  5464  17385]]
Positive
Precision: 0.026583503701200502 || Recall: 0.7608648080878813 || F1: 0.05137213995916232
Negative
Precision: 0.9918311695301427 || Recall: 0.5103183662920034 || F1: 0.6739005005850994
---------------------------
Positive
Overall Precision: 0.026584350507610634 (+/- 4.472669857748177e-05) || Overall Recall: 0.7629494651630547 (+/- 0.00527963105976708) || Overall F1: 0.051378365208926594 (+/- 8.09226317787043e-05)
Negative
Overall Precision: 0.9918833477511406 (+/- 0.00012348818076503908) || Overall R

[[656899 643111]
 [  5228  17618]]
Positive
Precision: 0.026664487255743278 || Recall: 0.7711634421780618 || F1: 0.05154664813663461
Negative
Precision: 0.9921042337799244 || Recall: 0.5053030361304913 || F1: 0.6695750602531831
[[662924 637088]
 [  5435  17414]]
Positive
Precision: 0.02660648859743744 || Recall: 0.7621340102411485 || F1: 0.05141795022078657
Negative
Precision: 0.991868142719706 || Recall: 0.5099368313523259 || F1: 0.6735762719527976
---------------------------
Positive
Overall Precision: 0.026624346896328355 (+/- 3.173875768913897e-05) || Overall Recall: 0.7641137494350808 (+/- 0.0041329899069978936) || Overall F1: 0.05145574907418761 (+/- 6.13434154354916e-05)
Negative
Overall Precision: 0.9919227249386328 (+/- 0.0001010345650386144) || Overall Recall: 0.5090531580720667 (+/- 0.002628456855830374) || Overall F1: 0.6728132926729797 (+/- 0.00227516578766379)
[0.25, 12, 3, 0.2, 0.7]
[[656240 643770]
 [  5327  17519]]
Positive
Precision: 0.026492199325862067 || Recall: 0.

[[670284 629728]
 [  5537  17312]]
Positive
Precision: 0.02675568743818002 || Recall: 0.7576699199089676 || F1: 0.051686174873747735
Negative
Precision: 0.9918070021499776 || Recall: 0.5155983175539918 || F1: 0.6784824425950979
---------------------------
Positive
Overall Precision: 0.026698785287997056 (+/- 0.00011295496916082209) || Overall Recall: 0.7631421429592951 (+/- 0.004620852240411967) || Overall F1: 0.05159245888557856 (+/- 0.00020682924683148915)
Negative
Overall Precision: 0.9919215497950681 (+/- 0.00010100251724203683) || Overall Recall: 0.5110677562177776 (+/- 0.004309579186833867) || Overall F1: 0.6745635104246768 (+/- 0.003746456566307478)
[0.25, 12, 5, 0, 0.5]
[[660594 639416]
 [  5348  17498]]
Positive
Precision: 0.026636667813442854 || Recall: 0.765910881554758 || F1: 0.05148287630928562
Negative
Precision: 0.9919692705971391 || Recall: 0.5081453219590618 || F1: 0.6720347190572302
[[665880 634130]
 [  5496  17350]]
Positive
Precision: 0.026631669429606434 || Recall:

[[654913 645097]
 [  5255  17591]]
Positive
Precision: 0.026544920083055677 || Recall: 0.7699816160378185 || F1: 0.051320576368203474
Negative
Precision: 0.9920399049938804 || Recall: 0.5037753555741878 || F1: 0.6682178863348124
[[665325 634685]
 [  5469  17377]]
Positive
Precision: 0.026649306354303733 || Recall: 0.7606145495929265 || F1: 0.05149442590693843
Negative
Precision: 0.9918469753754506 || Recall: 0.5117845247344252 || F1: 0.6751812965672893
[[668758 631252]
 [  5468  17378]]
Positive
Precision: 0.026791853599124307 || Recall: 0.7606583209314541 || F1: 0.051760599038536
Negative
Precision: 0.9918899597464351 || Recall: 0.5144252736517412 || F1: 0.6774853664911389
[[665161 634849]
 [  5352  17494]]
Positive
Precision: 0.026817180532327317 || Recall: 0.7657357962006478 || F1: 0.051819564595987196
Negative
Precision: 0.9920180518498523 || Recall: 0.511658371858678 || F1: 0.6751111253205366
[[670676 629336]
 [  5547  17302]]
Positive
Precision: 0.026756856231771098 || Recall: 0.

In [42]:
# get max f1 
max_f1

0.05187613877550279

In [43]:
best_parameters

[0.15, 8, 3, 0.2, 0.7]