In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score, recall_score, precision_score
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold # for cross validation
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics # for f1 macro in cross validation

import xgboost as xgb

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
from helper import util_visualizations, util_ml

# 0. Load Data

## 0.1 Training

In [2]:
# read data
train = pd.read_csv('path/to/9_FINAL/data/machine_learning/two_class/count-based/train/downsampled_count_train.csv', sep=";")
del train['Unnamed: 0']

In [3]:
# get label
y_train = train['label']

In [4]:
##### get variables
X_train = train.copy()
del X_train['label']
X_train.shape

(228466, 65)

In [5]:
X_train

Unnamed: 0,instance,class,frequency,pidspread,pldspread,id,pids,p1,p10,p11,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,espresso,shop,50,14,40,381646096,"['p5', 'p23bp3a', 'p8a', 'p3a', 'p12a', 'p21d'...",2,0,0,...,0,0,0,6,0,0,16,2,1,0
1,right,vehicle,139,22,101,225853190,"['p21bp5p2p1p25p21dp11p20bp8ap27bp3ap43', 'p15...",9,0,1,...,4,0,3,7,0,0,25,3,0,1
2,permaculture,person,2,1,2,81603917,['p5'],0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,silvermont,microprocessor,1,1,1,41005554,['p8a'],0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,iniparib,agent,8,1,7,360028802,"['p8a', 'p8a', 'p8a']",0,0,0,...,0,0,0,0,0,0,3,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228461,antithesis,book,1,1,1,506104028,['p8a'],0,0,0,...,0,0,0,0,0,0,1,0,0,0
228462,gaberlunzie,beggar,2,2,2,144824586,"['p28a', 'p8b']",0,0,0,...,0,0,0,0,0,0,0,1,0,0
228463,cantal,department,1,1,1,19490373,['p8a'],0,0,0,...,0,0,0,0,0,0,1,0,0,0
228464,whipple,deacon,1,1,1,267942366,['p8b'],0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# only take top 10 patterns
X_train_top_10 = X_train[['p8a', 'p5', 'p1', 'p3a', 'p8b', 'p4', 'p2', 'p8c', 'p20a', 'p10']]
X_train_top_10.head()

Unnamed: 0,p8a,p5,p1,p3a,p8b,p4,p2,p8c,p20a,p10
0,16,6,2,6,2,0,1,1,1,0
1,25,7,9,13,3,4,5,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0


In [7]:
# only take one hot encoded columns
X_train_one_hot = X_train.drop(['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], axis=1)
X_train_one_hot.head()

Unnamed: 0,p1,p10,p11,p12a,p12b,p12c,p13,p14,p15a,p15b,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,2,0,0,2,0,0,0,0,0,0,...,0,0,0,6,0,0,16,2,1,0
1,9,0,1,0,0,0,0,0,1,0,...,4,0,3,7,0,0,25,3,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3,0,0,0


In [8]:
# scale count 
min_max_scaler = preprocessing.MinMaxScaler()
X_train_count_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_train_one_hot))
X_train_count_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.000276,0.0,0.0,0.007663,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001195,0.0,0.0,0.00123,0.000456,0.001272,0.0
1,0.001244,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.001095,0.0,...,0.002274,0.0,0.01049,0.001394,0.0,0.0,0.001923,0.000683,0.0,0.010526
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000199,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7.7e-05,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000231,0.0,0.0,0.0


In [9]:
# only take one hot encoded columns
X_train_one_hot_more = X_train.drop(['instance', 'class', 'id', 'pids'], axis=1) 
X_train_one_hot_more.head()

Unnamed: 0,frequency,pidspread,pldspread,p1,p10,p11,p12a,p12b,p12c,p13,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,50,14,40,2,0,0,2,0,0,0,...,0,0,0,6,0,0,16,2,1,0
1,139,22,101,9,0,1,0,0,0,0,...,4,0,3,7,0,0,25,3,0,1
2,2,1,2,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,8,1,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3,0,0,0


In [10]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_train_one_hot_more_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_train_one_hot_more))
X_train_one_hot_more_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.000756,0.26,0.002004,0.000276,0.0,0.0,0.007663,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001195,0.0,0.0,0.00123,0.000456,0.001272,0.0
1,0.002128,0.42,0.005139,0.001244,0.0,0.030303,0.0,0.0,0.0,0.0,...,0.002274,0.0,0.01049,0.001394,0.0,0.0,0.001923,0.000683,0.0,0.010526
2,1.5e-05,0.0,5.1e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000199,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7.7e-05,0.0,0.0,0.0
4,0.000108,0.0,0.000308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000231,0.0,0.0,0.0


In [11]:
# only take one hot encoded columns
X_train_one_hot_frequency = X_train[['frequency', 'pidspread', 'pldspread']]
X_train_one_hot_frequency.head()

Unnamed: 0,frequency,pidspread,pldspread
0,50,14,40
1,139,22,101
2,2,1,2
3,1,1,1
4,8,1,7


In [12]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_train_one_hot_frequency_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_train_one_hot_frequency))
X_train_one_hot_frequency_scaled.head()

Unnamed: 0,0,1,2
0,0.000756,0.26,0.002004
1,0.002128,0.42,0.005139
2,1.5e-05,0.0,5.1e-05
3,0.0,0.0,0.0
4,0.000108,0.0,0.000308


## 0.2 Testing

In [13]:
# read test data
test = pd.read_csv('path/to/9_FINAL/data/machine_learning/two_class/count-based/test/count_test.csv', sep=";")
del test['Unnamed: 0']

In [14]:
# get label
y_test = test['label']

In [15]:
# get variables
X_test = test.copy()
del X_test['label']
X_test.shape

(348121, 65)

In [16]:
# only take top 10 patterns
X_test_top_10 = X_test[['p8a', 'p5', 'p1', 'p3a', 'p8b', 'p4', 'p2', 'p8c', 'p20a', 'p10']]
X_test_top_10.head()

Unnamed: 0,p8a,p5,p1,p3a,p8b,p4,p2,p8c,p20a,p10
0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0


In [17]:
# only take one hot encoded columns
X_test_one_hot = X_test.drop(['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], axis=1)
X_test_one_hot.head()

Unnamed: 0,p1,p10,p11,p12a,p12b,p12c,p13,p14,p15a,p15b,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [18]:
# only take one hot encoded columns
X_test_one_hot_more = X_test.drop(['instance', 'class',  'id', 'pids'], axis=1)
X_test_one_hot_more.head()

Unnamed: 0,frequency,pidspread,pldspread,p1,p10,p11,p12a,p12b,p12c,p13,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,5,1,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4,0,0,0
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [19]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_test_one_hot_more_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_test_one_hot_more))
X_test_one_hot_more_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000357,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000296,0.0,0.0,0.0
3,6.2e-05,0.0,0.000397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001186,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000344,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# only take one hot encoded columns
X_test_one_hot_frequency = X_test[['frequency', 'pidspread', 'pldspread']]
X_test_one_hot_frequency.head()

Unnamed: 0,frequency,pidspread,pldspread
0,1,1,1
1,1,1,1
2,1,1,1
3,5,1,4
4,1,1,1


In [21]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_test_one_hot_frequency_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_test_one_hot_frequency))
X_test_one_hot_frequency_scaled.head()

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,6.2e-05,0.0,0.000397
4,0.0,0.0,0.0


# 1. Naive Bayes

## 1.1 Only count columns

In [22]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_two_class_confusion_matrix(nb, X_train_one_hot, y_train)

[[17408  5439]
 [12621 10226]]
Positive
Precision: 0.6527928503032238 || Recall: 0.4475861163391255 || F1: 0.5310552555047776
Negative
Precision: 0.5797062839255387 || Recall: 0.7619381100363286 || F1: 0.6584461759588471
[[12005 10842]
 [ 6325 16521]]
Positive
Precision: 0.6037715162811095 || Recall: 0.723146283813359 || F1: 0.6580891871975144
Negative
Precision: 0.65493726132024 || Recall: 0.5254519192891846 || F1: 0.5830925030963887
[[11951 10896]
 [ 6290 16556]]
Positive
Precision: 0.6030890281218126 || Recall: 0.7246782806618226 || F1: 0.6583164340530439
Negative
Precision: 0.6551724137931034 || Recall: 0.5230883704643936 || F1: 0.5817270249221184
[[11795 11051]
 [ 6215 16632]]
Positive
Precision: 0.600801936206336 || Recall: 0.7279730380356283 || F1: 0.6583019988125867
Negative
Precision: 0.6549139367018323 || Recall: 0.516282937932242 || F1: 0.5773937732523987
[[12040 10806]
 [ 6080 16767]]
Positive
Precision: 0.6080948754216081 || Recall: 0.7338819100976058 || F1: 0.665093216977

## 1.2 Only count columns with frequencies

In [23]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_two_class_confusion_matrix(nb, X_train_one_hot_more, y_train)

[[21982   865]
 [20181  2666]]
Positive
Precision: 0.7550269045596149 || Recall: 0.11668928086838534 || F1: 0.20213814542421715
Negative
Precision: 0.521357588406897 || Recall: 0.9621394493806626 || F1: 0.6762651899707737
[[21907   940]
 [20087  2759]]
Positive
Precision: 0.7458772641254393 || Recall: 0.12076512299746127 || F1: 0.20787342249011118
Negative
Precision: 0.5216697623470019 || Recall: 0.9588567426795641 || F1: 0.6757144399376938
[[21912   935]
 [19999  2847]]
Positive
Precision: 0.7527763088313062 || Recall: 0.1246170007878841 || F1: 0.21383506083821543
Negative
Precision: 0.5228221707904847 || Recall: 0.9590755897929706 || F1: 0.6767349207819883
[[21925   921]
 [19830  3017]]
Positive
Precision: 0.766124936515998 || Recall: 0.13205234822952686 || F1: 0.22527534067575136
Negative
Precision: 0.5250868159501856 || Recall: 0.9596865972161429 || F1: 0.6787820621971796
[[21932   914]
 [20033  2814]]
Positive
Precision: 0.7548283261802575 || Recall: 0.12316715542521994 || F1: 0.2

## 1.3 Scaled count pids and frequencies

In [24]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_two_class_confusion_matrix(nb, X_train_one_hot_more_scaled, y_train)

[[10736 12111]
 [ 9699 13148]]
Positive
Precision: 0.5205273367908468 || Recall: 0.5754803694139274 || F1: 0.5466262004739533
Negative
Precision: 0.5253731343283582 || Recall: 0.46990852190659604 || F1: 0.49609537452058594
[[10746 12101]
 [ 9774 13072]]
Positive
Precision: 0.5192865371628332 || Recall: 0.5721789372319005 || F1: 0.5444511547512444
Negative
Precision: 0.5236842105263158 || Recall: 0.4703462161334092 || F1: 0.49558419996771735
[[10614 12233]
 [ 9805 13041]]
Positive
Precision: 0.5159848065205349 || Recall: 0.570822025737547 || F1: 0.5420199501246883
Negative
Precision: 0.5198099809001421 || Recall: 0.46456865233947564 || F1: 0.49063930106781306
[[11833 11013]
 [10452 12395]]
Positive
Precision: 0.5295198222829802 || Recall: 0.5425219941348973 || F1: 0.5359420603178034
Negative
Precision: 0.530984967466906 || Recall: 0.5179462487962881 || F1: 0.5243845693647382
[[11985 10861]
 [10391 12456]]
Positive
Precision: 0.5342025131878029 || Recall: 0.5451919289184576 || F1: 0.5396

## 1.4 Scaled Counts

In [25]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_two_class_confusion_matrix(nb, X_train_count_scaled, y_train)

[[ 7113 15734]
 [ 3174 19673]]
Positive
Precision: 0.5556245940068348 || Recall: 0.8610758524095067 || F1: 0.6754214302880489
Negative
Precision: 0.6914552347623214 || Recall: 0.3113319035321924 || F1: 0.4293474980382689
[[ 7113 15734]
 [ 3235 19611]]
Positive
Precision: 0.5548450983165936 || Recall: 0.8583997198634334 || F1: 0.6740217559416405
Negative
Precision: 0.687379203710862 || Recall: 0.3113319035321924 || F1: 0.4285585178490737
[[ 7300 15547]
 [ 3327 19519]]
Positive
Precision: 0.5566360577197285 || Recall: 0.8543727567189005 || F1: 0.6740917253764332
Negative
Precision: 0.6869295191493366 || Recall: 0.3195167855735983 || F1: 0.4361594073011891
[[ 8904 13942]
 [ 4282 18565]]
Positive
Precision: 0.5711077614052358 || Recall: 0.8125793320786099 || F1: 0.6707735664992592
Negative
Precision: 0.6752616411345367 || Recall: 0.38973999824914646 || F1: 0.4942273534635879
[[ 8903 13943]
 [ 4176 18671]]
Positive
Precision: 0.5724842092352977 || Recall: 0.8172188908828293 || F1: 0.6733019

## 1.5 Top 10

In [26]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_two_class_confusion_matrix(nb, X_train_top_10, y_train)

[[17565  5282]
 [13945  8902]]
Positive
Precision: 0.6276085730400451 || Recall: 0.38963540070906466 || F1: 0.480786368178013
Negative
Precision: 0.5574420818787686 || Recall: 0.7688099093972951 || F1: 0.6462829074452233
[[14638  8209]
 [12716 10130]]
Positive
Precision: 0.5523747205409237 || Recall: 0.4434036592839009 || F1: 0.4919266723321598
Negative
Precision: 0.5351319733859765 || Recall: 0.6406968092090866 || F1: 0.583175633951515
[[14677  8170]
 [12413 10433]]
Positive
Precision: 0.560823523087674 || Recall: 0.45666637485774314 || F1: 0.503413833868127
Negative
Precision: 0.5417866371354744 || Recall: 0.6424038166936578 || F1: 0.5878206540240702
[[17266  5580]
 [13353  9494]]
Positive
Precision: 0.6298261907920923 || Recall: 0.415546898936403 || F1: 0.5007251918462066
Negative
Precision: 0.5638982331232242 || Recall: 0.7557559310163705 || F1: 0.6458804825586832
[[17505  5341]
 [13338  9509]]
Positive
Precision: 0.6403367003367003 || Recall: 0.41620344027662276 || F1: 0.504496379

# 2. Decision Trees

## 2.1 Only count columns

In [27]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(dt, X_train_one_hot, y_train)

[[11504 11343]
 [ 6120 16727]]
Positive
Precision: 0.5959030993943712 || Recall: 0.7321311331903532 || F1: 0.6570300685429228
Negative
Precision: 0.6527462551066727 || Recall: 0.5035234385258458 || F1: 0.5685058436905438
[[11455 11392]
 [ 6262 16584]]
Positive
Precision: 0.5927938232770946 || Recall: 0.7259038781405935 || F1: 0.6526307504623982
Negative
Precision: 0.646554157024327 || Recall: 0.5013787368144614 || F1: 0.5647865102060942
[[11524 11323]
 [ 6432 16414]]
Positive
Precision: 0.5917727223564192 || Recall: 0.718462750590913 || F1: 0.6489927445979875
Negative
Precision: 0.6417910447761194 || Recall: 0.5043988269794721 || F1: 0.5648604269293924
[[11425 11421]
 [ 6438 16409]]
Positive
Precision: 0.5896155228171038 || Recall: 0.7182124567776951 || F1: 0.6475916096059355
Negative
Precision: 0.6395902144096737 || Recall: 0.500087542677055 || F1: 0.5613009408238965
[[11480 11366]
 [ 6202 16645]]
Positive
Precision: 0.5942308378851165 || Recall: 0.7285420405304854 || F1: 0.6545676196

## 2.2 Only count columns with frequencies

In [28]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(dt, X_train_one_hot_more, y_train)

[[11812 11035]
 [ 6554 16293]]
Positive
Precision: 0.5962016978922716 || Recall: 0.7131352037466626 || F1: 0.6494469357249626
Negative
Precision: 0.643144941740172 || Recall: 0.5170044207116908 || F1: 0.5732171887511223
[[11708 11139]
 [ 6688 16158]]
Positive
Precision: 0.5919331794702715 || Recall: 0.7072572879278648 || F1: 0.6444767963624035
Negative
Precision: 0.6364427049358556 || Recall: 0.512452400752834 || F1: 0.5677569526950028
[[11789 11058]
 [ 6899 15947]]
Positive
Precision: 0.590520274023329 || Recall: 0.6980215354985555 || F1: 0.6397865639606025
Negative
Precision: 0.6308326198630136 || Recall: 0.5159977239900205 || F1: 0.5676658240038522
[[11580 11266]
 [ 6693 16154]]
Positive
Precision: 0.5891320204230489 || Recall: 0.7070512539939598 || F1: 0.6427278333698053
Negative
Precision: 0.6337218847479889 || Recall: 0.5068721001488226 || F1: 0.563243269534765
[[11742 11104]
 [ 6540 16307]]
Positive
Precision: 0.5949071540622378 || Recall: 0.713747975664201 || F1: 0.648931513390

## 2.3 Scaled count pids and frequencies

In [29]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(dt, X_train_one_hot_more_scaled, y_train)

[[11808 11039]
 [ 6557 16290]]
Positive
Precision: 0.5960701086757657 || Recall: 0.7130038954786186 || F1: 0.6493144132653061
Negative
Precision: 0.6429621562755241 || Recall: 0.5168293430209655 || F1: 0.5730369795205279
[[11710 11137]
 [ 6695 16151]]
Positive
Precision: 0.5918718850776898 || Recall: 0.7069508885581721 || F1: 0.6443132405154187
Negative
Precision: 0.6362401521325727 || Recall: 0.5125399395981967 || F1: 0.5677300494521478
[[11789 11058]
 [ 6888 15958]]
Positive
Precision: 0.5906870002961209 || Recall: 0.6985030202223584 || F1: 0.6400866391239822
Negative
Precision: 0.6312041548428549 || Recall: 0.5159977239900205 || F1: 0.5678162026779694
[[11586 11260]
 [ 6697 16150]]
Positive
Precision: 0.5892010215249909 || Recall: 0.7068761763032345 || F1: 0.6426965397855026
Negative
Precision: 0.6337034403544276 || Recall: 0.5071347281799877 || F1: 0.5633980889396776
[[11729 11117]
 [ 6532 16315]]
Positive
Precision: 0.5947433654126567 || Recall: 0.7140981310456516 || F1: 0.6489786

## 2.4 Scaled Counts

In [30]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(dt, X_train_count_scaled, y_train)

[[11507 11340]
 [ 6119 16728]]
Positive
Precision: 0.595981188542112 || Recall: 0.7321749026130345 || F1: 0.6570951585976628
Negative
Precision: 0.6528423919210258 || Recall: 0.5036547467938898 || F1: 0.5686259975786326
[[11456 11391]
 [ 6261 16585]]
Positive
Precision: 0.5928295682013154 || Recall: 0.725947649479121 || F1: 0.6526701034984849
Negative
Precision: 0.6466105999887114 || Recall: 0.5014225062371427 || F1: 0.5648358150083818
[[11515 11332]
 [ 6427 16419]]
Positive
Precision: 0.5916543547980253 || Recall: 0.7186816072835507 || F1: 0.6490108109176433
Negative
Precision: 0.64179021290826 || Recall: 0.5040049021753403 || F1: 0.56461300840913
[[11430 11416]
 [ 6441 16406]]
Positive
Precision: 0.5896772338437208 || Recall: 0.7180811485096512 || F1: 0.6475754406047091
Negative
Precision: 0.6395836830619439 || Recall: 0.5003063993696927 || F1: 0.5614362551268511
[[11480 11366]
 [ 6201 16646]]
Positive
Precision: 0.5942453234328146 || Recall: 0.7285858099531667 || F1: 0.6545940738119

## 2.5 Top 10

In [31]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(dt, X_train_top_10, y_train)

[[11074 11773]
 [ 6111 16736]]
Positive
Precision: 0.5870426882738784 || Recall: 0.7325250579944851 || F1: 0.6517641560869227
Negative
Precision: 0.6443991853360489 || Recall: 0.4847025867728805 || F1: 0.5532573940847322
[[10975 11872]
 [ 6254 16592]]
Positive
Precision: 0.582911748173131 || Recall: 0.7262540488488138 || F1: 0.6467355291366206
Negative
Precision: 0.6370073712925881 || Recall: 0.4803694139274303 || F1: 0.5477093522307616
[[10966 11881]
 [ 6411 16435]]
Positive
Precision: 0.5804139002683995 || Recall: 0.7193819486999913 || F1: 0.6424690199757633
Negative
Precision: 0.6310640501812741 || Recall: 0.47997548912329846 || F1: 0.5452466189339698
[[10813 12033]
 [ 6266 16581]]
Positive
Precision: 0.579471587334871 || Recall: 0.7257407974788812 || F1: 0.6444103301529314
Negative
Precision: 0.6331166930148135 || Recall: 0.4732994834982054 || F1: 0.541665623043206
[[11029 11817]
 [ 6275 16572]]
Positive
Precision: 0.5837472260382542 || Recall: 0.7253468726747494 || F1: 0.646888906

# 3. Random Forest

## 3.1 Only count columns

In [32]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(rf, X_train_one_hot, y_train)

[[11682 11165]
 [ 5336 17511]]
Positive
Precision: 0.6106500209234202 || Recall: 0.766446360572504 || F1: 0.6797352638627409
Negative
Precision: 0.6864496415559995 || Recall: 0.5113143957631199 || F1: 0.5860780132948702
[[11589 11258]
 [ 5442 17404]]
Positive
Precision: 0.6072151280441002 || Recall: 0.7617963757331699 || F1: 0.675778519841578
Negative
Precision: 0.6804650343491281 || Recall: 0.5072438394537576 || F1: 0.5812227293244395
[[11619 11228]
 [ 5637 17209]]
Positive
Precision: 0.605162288567711 || Recall: 0.7532609647203011 || F1: 0.6711385839361972
Negative
Precision: 0.6733310152990264 || Recall: 0.508556922134197 || F1: 0.579457895918011
[[11567 11279]
 [ 5605 17242]]
Positive
Precision: 0.6045370078188002 || Recall: 0.7546723858712303 || F1: 0.6713128796137673
Negative
Precision: 0.6735965525273702 || Recall: 0.5063030727479646 || F1: 0.5780898595631965
[[11534 11312]
 [ 5357 17490]]
Positive
Precision: 0.6072494965627387 || Recall: 0.7655272026961965 || F1: 0.677263838602

## 3.2 Only count columns with frequencies

In [33]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(rf, X_train_one_hot_more, y_train)

[[11988 10859]
 [ 5633 17214]]
Positive
Precision: 0.6131870480532896 || Recall: 0.7534468420361535 || F1: 0.6761194029850746
Negative
Precision: 0.6803246126780546 || Recall: 0.5247078391036022 || F1: 0.5924681229613522
[[11893 10954]
 [ 5767 17079]]
Positive
Precision: 0.6092462454963793 || Recall: 0.747570690711722 || F1: 0.6713575345427387
Negative
Precision: 0.6734428086070215 || Recall: 0.5205497439488773 || F1: 0.5872071493815884
[[11871 10976]
 [ 5897 16949]]
Positive
Precision: 0.6069471799462847 || Recall: 0.7418804167031428 || F1: 0.6676646116877745
Negative
Precision: 0.6681112111661414 || Recall: 0.5195868166498884 || F1: 0.5845623538101686
[[11744 11102]
 [ 5785 17062]]
Positive
Precision: 0.6058088339724471 || Recall: 0.7467938897885937 || F1: 0.6689537550724354
Negative
Precision: 0.6699754692224315 || Recall: 0.5140505996673378 || F1: 0.5817461300309598
[[11856 10990]
 [ 5660 17187]]
Positive
Precision: 0.6099655747595557 || Recall: 0.752265067623758 || F1: 0.673682972

## 3.3 Scaled count pids and frequencies

In [34]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(rf, X_train_one_hot_more_scaled, y_train)

[[11995 10852]
 [ 5631 17216]]
Positive
Precision: 0.6133675359840388 || Recall: 0.7535343808815161 || F1: 0.676264362172248
Negative
Precision: 0.6805287643254283 || Recall: 0.5250142250623714 || F1: 0.5927408395720604
[[11895 10952]
 [ 5773 17073]]
Positive
Precision: 0.6092060660124888 || Recall: 0.7473080626805567 || F1: 0.6712272217963083
Negative
Precision: 0.6732510753905365 || Recall: 0.5206372827942399 || F1: 0.5871899296556832
[[11876 10971]
 [ 5892 16954]]
Positive
Precision: 0.6071262309758281 || Recall: 0.7420992733957804 || F1: 0.6678615745208878
Negative
Precision: 0.6683926159387663 || Recall: 0.519805663763295 || F1: 0.584808568262957
[[11759 11087]
 [ 5796 17051]]
Positive
Precision: 0.6059776814272514 || Recall: 0.7463124261390992 || F1: 0.6688633911934883
Negative
Precision: 0.6698376530902876 || Recall: 0.5147071697452508 || F1: 0.5821143041013835
[[11852 10994]
 [ 5665 17182]]
Positive
Precision: 0.6098097671777399 || Recall: 0.7520462205103514 || F1: 0.6735001861

## 3.4 Scaled Counts

In [35]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(rf, X_train_count_scaled, y_train)

[[11670 11177]
 [ 5340 17507]]
Positive
Precision: 0.6103402593780505 || Recall: 0.7662712828817788 || F1: 0.6794744910830374
Negative
Precision: 0.6860670194003528 || Recall: 0.5107891626909441 || F1: 0.5855934967508845
[[11593 11254]
 [ 5445 17401]]
Positive
Precision: 0.6072587681032978 || Recall: 0.7616650617175873 || F1: 0.6757538688569155
Negative
Precision: 0.6804202371170325 || Recall: 0.5074189171444828 || F1: 0.5813212987338598
[[11608 11239]
 [ 5637 17209]]
Positive
Precision: 0.6049282902137233 || Recall: 0.7532609647203011 || F1: 0.670994658244629
Negative
Precision: 0.6731226442447086 || Recall: 0.5080754584847026 || F1: 0.5790681432704778
[[11580 11266]
 [ 5607 17240]]
Positive
Precision: 0.6047849575527959 || Recall: 0.7545848470258677 || F1: 0.6714310751075886
Negative
Precision: 0.6737650549834177 || Recall: 0.5068721001488226 || F1: 0.5785227187570253
[[11535 11311]
 [ 5360 17487]]
Positive
Precision: 0.6072296687269949 || Recall: 0.7653958944281525 || F1: 0.67720011

## 3.5 Top 10

In [36]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(rf, X_train_top_10, y_train)

[[10972 11875]
 [ 5456 17391]]
Positive
Precision: 0.5942390487254835 || Recall: 0.7611940298507462 || F1: 0.667434229462898
Negative
Precision: 0.6678841003165328 || Recall: 0.48023810565938635 || F1: 0.5587269255251432
[[10884 11963]
 [ 5602 17244]]
Positive
Precision: 0.5904064094223987 || Recall: 0.7547929615687647 || F1: 0.6625554723070716
Negative
Precision: 0.6601965303894213 || Recall: 0.4763863964634307 || F1: 0.5534284188849058
[[10878 11969]
 [ 5796 17050]]
Positive
Precision: 0.587546090492436 || Recall: 0.7463013218944236 || F1: 0.6574761399787911
Negative
Precision: 0.6523929471032746 || Recall: 0.4761237799273428 || F1: 0.5504921434174237
[[10701 12145]
 [ 5593 17254]]
Positive
Precision: 0.5868907105683867 || Recall: 0.7551976189434061 || F1: 0.660490755273131
Negative
Precision: 0.6567448140419786 || Recall: 0.46839709358312176 || F1: 0.5468063362289217
[[10960 11886]
 [ 5611 17236]]
Positive
Precision: 0.5918549550168257 || Recall: 0.7544097693351425 || F1: 0.66331851

# 4. Neural Network

## 4.1 Only count columns

In [37]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(nnet, X_train_one_hot, y_train)

[[12073 10774]
 [ 5442 17405]]
Positive
Precision: 0.6176585400475532 || Recall: 0.7618068017682846 || F1: 0.6822012307451103
Negative
Precision: 0.689294890094205 || Recall: 0.528428240031514 || F1: 0.5982359645210843
[[11817 11030]
 [ 5438 17408]]
Positive
Precision: 0.6121386876714255 || Recall: 0.7619714610872801 || F1: 0.6788862023243116
Negative
Precision: 0.6848449724717474 || Recall: 0.5172232678250974 || F1: 0.5893471647299386
[[11618 11229]
 [ 5335 17511]]
Positive
Precision: 0.6092901878914405 || Recall: 0.7664799089556159 || F1: 0.678905129298647
Negative
Precision: 0.6853064354391553 || Recall: 0.5085131527115158 || F1: 0.583819095477387
[[11377 11469]
 [ 5142 17705]]
Positive
Precision: 0.6068759854665112 || Recall: 0.7749376285726791 || F1: 0.6806866457776667
Negative
Precision: 0.6887220776076034 || Recall: 0.4979865184277335 || F1: 0.5780261653753335
[[12047 10799]
 [ 5549 17298]]
Positive
Precision: 0.6156529166814962 || Recall: 0.757123473541384 || F1: 0.679098618090

## 4.2 Only count columns with frequencies

In [38]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(nnet, X_train_one_hot_more, y_train)

[[11821 11026]
 [ 5112 17735]]
Positive
Precision: 0.6166336358262926 || Recall: 0.7762507112531186 || F1: 0.6872965431716014
Negative
Precision: 0.6981042933916022 || Recall: 0.5173983455158226 || F1: 0.5943187531422826
[[12229 10618]
 [ 5755 17091]]
Positive
Precision: 0.6168032047349237 || Recall: 0.7480959467740523 || F1: 0.6761349025813471
Negative
Precision: 0.6799933274021353 || Recall: 0.5352562699697991 || F1: 0.5990056574661409
[[11868 10979]
 [ 5547 17299]]
Positive
Precision: 0.6117476483485395 || Recall: 0.757200385187779 || F1: 0.6767467334324387
Negative
Precision: 0.6814814814814815 || Recall: 0.5194555083818444 || F1: 0.5895385226764691
[[11867 10979]
 [ 5498 17349]]
Positive
Precision: 0.6124329285512568 || Recall: 0.7593557140981311 || F1: 0.6780263800683928
Negative
Precision: 0.6833861215087821 || Recall: 0.5194344743062242 || F1: 0.5902365024495785
[[12058 10788]
 [ 5473 17374]]
Positive
Precision: 0.6169306157233151 || Recall: 0.7604499496651639 || F1: 0.68121311

## 4.3 Scaled count pids and frequencies

In [39]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(nnet, X_train_one_hot_more_scaled, y_train)

[[10954 11893]
 [ 4690 18157]]
Positive
Precision: 0.6042262895174709 || Recall: 0.7947214076246334 || F1: 0.6865039605270621
Negative
Precision: 0.700204551265661 || Recall: 0.4794502560511227 || F1: 0.5691720142370944
[[ 8650 14197]
 [ 3313 19533]]
Positive
Precision: 0.5790987251704713 || Recall: 0.8549855554582859 || F1: 0.6905048076923076
Negative
Precision: 0.7230627768954275 || Recall: 0.3786055061933733 || F1: 0.49698362539500135
[[17621  5226]
 [12109 10737]]
Positive
Precision: 0.6726179289607217 || Recall: 0.46997286177011294 || F1: 0.5533252596047309
Negative
Precision: 0.5927009754456778 || Recall: 0.7712609970674487 || F1: 0.6702930939384142
[[17354  5492]
 [11800 11047]]
Positive
Precision: 0.6679363927686075 || Recall: 0.48352081236048494 || F1: 0.5609607474737215
Negative
Precision: 0.5952527954997598 || Recall: 0.7596078088067933 || F1: 0.6674615384615384
[[17446  5400]
 [11793 11054]]
Positive
Precision: 0.671812325270451 || Recall: 0.4838271983192542 || F1: 0.562530

## 4.4 Scaled Counts

In [40]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(nnet, X_train_count_scaled, y_train)

[[17793  5054]
 [12170 10677]]
Positive
Precision: 0.6787235395079779 || Recall: 0.46732612596839845 || F1: 0.5535279174659132
Negative
Precision: 0.5938323932850516 || Recall: 0.7787893377686348 || F1: 0.6738496496875591
[[17992  4855]
 [12500 10346]]
Positive
Precision: 0.6806131175580554 || Recall: 0.45285826840584786 || F1: 0.5438536546902516
Negative
Precision: 0.5900564082382265 || Recall: 0.7874994528822165 || F1: 0.6746283207409213
[[11154 11693]
 [ 4997 17849]]
Positive
Precision: 0.6041906438291247 || Recall: 0.7812746213779217 || F1: 0.6814155913567993
Negative
Precision: 0.6906073927311003 || Recall: 0.48820414058738565 || F1: 0.572029334837684
[[16107  6739]
 [10177 12670]]
Positive
Precision: 0.6527899428100365 || Recall: 0.554558585372259 || F1: 0.59967815221507
Negative
Precision: 0.6128062699741288 || Recall: 0.7050249496629607 || F1: 0.6556889883981275
[[15986  6860]
 [10004 12843]]
Positive
Precision: 0.6518296706085368 || Recall: 0.5621306954961264 || F1: 0.60366627

## 4.5 Top 10

In [41]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(nnet, X_train_top_10, y_train)

[[10926 11921]
 [ 5062 17785]]
Positive
Precision: 0.5987005992055477 || Recall: 0.7784391823871843 || F1: 0.6768405229006909
Negative
Precision: 0.6833875406554916 || Recall: 0.47822471221604584 || F1: 0.5626882966396292
[[10961 11886]
 [ 5407 17439]]
Positive
Precision: 0.5946803069053709 || Recall: 0.7633283725816336 || F1: 0.6685323263882235
Negative
Precision: 0.6696603128054741 || Recall: 0.4797566420098919 || F1: 0.5590207828637002
[[11654 11193]
 [ 6276 16570]]
Positive
Precision: 0.5968375175593416 || Recall: 0.7252910794012081 || F1: 0.6548242407476931
Negative
Precision: 0.6499721137757948 || Recall: 0.5100888519280431 || F1: 0.5715967334526817
[[10592 12254]
 [ 5171 17676]]
Positive
Precision: 0.5905780153691947 || Recall: 0.773668315314921 || F1: 0.6698372397066904
Negative
Precision: 0.6719533083803845 || Recall: 0.4636260176836208 || F1: 0.5486803595016706
[[17838  5008]
 [12662 10185]]
Positive
Precision: 0.6703745145790825 || Recall: 0.44579157000919156 || F1: 0.535488

# 5. Logistic Regression

## 5.1 Only count columns

In [42]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(log_reg, X_train_one_hot, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[19698  3149]
 [15634  7213]]
Positive
Precision: 0.696101138776298 || Recall: 0.3157088458003239 || F1: 0.434400313168117
Negative
Precision: 0.5575116042114797 || Recall: 0.8621700879765396 || F1: 0.6771515495281804


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[19622  3225]
 [15555  7291]]
Positive
Precision: 0.6933244579688095 || Recall: 0.3191368292042371 || F1: 0.43708410766740613
Negative
Precision: 0.5578076584131677 || Recall: 0.8588436118527597 || F1: 0.6763408244864194


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[19683  3164]
 [15662  7184]]
Positive
Precision: 0.6942404329339003 || Recall: 0.3144532959817911 || F1: 0.432849310116286
Negative
Precision: 0.5568821615504315 || Recall: 0.8615135466363198 || F1: 0.6764847401704701


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[19502  3344]
 [15301  7546]]
Positive
Precision: 0.692929292929293 || Recall: 0.3302840635532017 || F1: 0.4473426801434626
Negative
Precision: 0.5603539924719133 || Recall: 0.8536286439639325 || F1: 0.6765772172977849


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[19748  3098]
 [15589  7258]]
Positive
Precision: 0.7008497489378138 || Recall: 0.3176784698209831 || F1: 0.43718941059542815
Negative
Precision: 0.5588476667515635 || Recall: 0.8643963932417054 || F1: 0.6788237113933623
---------------------------
Positive
Overall Precision: 0.6954890143092228 (+/- 0.0028951202139414797) || Overall Recall: 0.3194523008721074 (+/- 0.005649364345102505) || Overall F1: 0.43777316433813995 (+/- 0.005059874814442095)
Negative
Overall Precision: 0.5582806166797111 (+/- 0.001215827241549845) || Overall Recall: 0.8601104567342513 (+/- 0.0036932042081816775) || Overall F1: 0.6770756085752435 (+/- 0.0009164122964521088)


## 5.2 Only count columns with frequencies

In [43]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(log_reg, X_train_one_hot_more, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[19659  3188]
 [15668  7179]]
Positive
Precision: 0.6924857721616668 || Recall: 0.3142206854291592 || F1: 0.43228758957066304
Negative
Precision: 0.5564865400401959 || Recall: 0.8604630804919683 || F1: 0.6758689448894696


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[19332  3515]
 [15179  7667]]
Positive
Precision: 0.6856555177964586 || Recall: 0.3355948524905892 || F1: 0.45062889385212174
Negative
Precision: 0.5601692214076671 || Recall: 0.8461504792751784 || F1: 0.6740820809651662


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[19910  2937]
 [16027  6819]]
Positive
Precision: 0.6989544895448955 || Recall: 0.2984767574192419 || F1: 0.4183178946076928
Negative
Precision: 0.554025099479645 || Recall: 0.8714492055849783 || F1: 0.6773952095808383


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[19805  3041]
 [15678  7169]]
Positive
Precision: 0.702154750244858 || Recall: 0.31378299120234604 || F1: 0.43373566869346886
Negative
Precision: 0.5581546092494998 || Recall: 0.8668913595377746 || F1: 0.6790790172984279


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[20013  2833]
 [16113  6734]]
Positive
Precision: 0.7038779136615448 || Recall: 0.29474329233597407 || F1: 0.4154994755352625
Negative
Precision: 0.5539777445607043 || Recall: 0.8759957979515014 || F1: 0.6787288882859662
---------------------------
Positive
Overall Precision: 0.6966256886818847 (+/- 0.006724167761621321) || Overall Recall: 0.31136371577546207 (+/- 0.014446282059293228) || Overall F1: 0.4300939044518418 (+/- 0.012581194332115853)
Negative
Overall Precision: 0.5565626429475424 (+/- 0.002394509681327576) || Overall Recall: 0.8641899845682801 (+/- 0.010379638831369862) || Overall F1: 0.6770308282039735 (+/- 0.0018577590538819229)


## 5.3 Scaled count pids and frequencies

In [44]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(log_reg, X_train_one_hot_more_scaled, y_train)

[[17984  4863]
 [15906  6941]]
Positive
Precision: 0.5880210098271772 || Recall: 0.30380356283100624 || F1: 0.4006233586332284
Negative
Precision: 0.5306580112127471 || Recall: 0.787149297500766 || F1: 0.6339425771542381
[[18081  4766]
 [15771  7075]]
Positive
Precision: 0.5975002111308166 || Recall: 0.3096822200822901 || F1: 0.40793380805489093
Negative
Precision: 0.5341191066997518 || Recall: 0.7913949315008535 || F1: 0.637789026261486
[[18102  4745]
 [15806  7040]]
Positive
Precision: 0.5973695375477301 || Recall: 0.3081502232338265 || F1: 0.40657214634287203
Negative
Precision: 0.5338563170933113 || Recall: 0.7923140893771611 || F1: 0.6378997445159017
[[18123  4723]
 [15852  6995]]
Positive
Precision: 0.5969448711384195 || Recall: 0.30616711165579724 || F1: 0.40474468392882973
Negative
Precision: 0.5334216335540839 || Recall: 0.7932679681344655 || F1: 0.6378979602611711
[[18008  4838]
 [15845  7002]]
Positive
Precision: 0.5913851351351351 || Recall: 0.30647349761456644 || F1: 0.403

## 5.4 Scaled Counts

In [45]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(log_reg, X_train_count_scaled, y_train)

[[21413  1434]
 [19513  3334]]
Positive
Precision: 0.699244966442953 || Recall: 0.14592725521950364 || F1: 0.24146297302190836
Negative
Precision: 0.5232126276694522 || Recall: 0.9372346478749946 || F1: 0.6715381117400782
[[21477  1370]
 [19363  3483]]
Positive
Precision: 0.717700391510406 || Recall: 0.15245557209139454 || F1: 0.2514892234376692
Negative
Precision: 0.5258814887365328 || Recall: 0.9400358909265987 || F1: 0.6744547552875784
[[21461  1386]
 [19522  3324]]
Positive
Precision: 0.7057324840764331 || Recall: 0.14549592926551694 || F1: 0.24125417331978516
Negative
Precision: 0.5236561501110216 || Recall: 0.9393355801636977 || F1: 0.672442425191916
[[21492  1354]
 [19324  3523]]
Positive
Precision: 0.7223703096165676 || Recall: 0.15419967610627217 || F1: 0.25414803058721686
Negative
Precision: 0.5265582124656997 || Recall: 0.9407336076337215 || F1: 0.6751908516854638
[[21481  1365]
 [19465  3382]]
Positive
Precision: 0.7124499684010954 || Recall: 0.14802818750820676 || F1: 0.24

## 5.5 Top 10

In [46]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(log_reg, X_train_top_10, y_train)

[[20565  2282]
 [17218  5629]]
Positive
Precision: 0.7115408924282645 || Recall: 0.2463780802731212 || F1: 0.36601859678782755
Negative
Precision: 0.5442924066379059 || Recall: 0.9001181774412396 || F1: 0.6783770410687778
[[20499  2348]
 [17158  5688]]
Positive
Precision: 0.7078148332503733 || Recall: 0.248971373544603 || F1: 0.3683699242277054
Negative
Precision: 0.5443609421887033 || Recall: 0.8972293955442727 || F1: 0.6776080920269735
[[20568  2279]
 [17220  5626]]
Positive
Precision: 0.7117014547754585 || Recall: 0.246257550555896 || F1: 0.365906799778869
Negative
Precision: 0.5442997777072086 || Recall: 0.9002494857092835 || F1: 0.6784200544240125
[[20512  2334]
 [16968  5879]]
Positive
Precision: 0.7158163886521368 || Recall: 0.2573204359434499 || F1: 0.3785576303927881
Negative
Precision: 0.5472785485592316 || Recall: 0.8978376958767399 || F1: 0.6800384577130922
[[20556  2290]
 [17154  5693]]
Positive
Precision: 0.7131404233997244 || Recall: 0.24917932332472534 || F1: 0.36931560

In [69]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
log_reg.fit(X_train_one_hot, y_train)
predict_test = log_reg.predict(X_test_one_hot)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [72]:
confusion_matrix(y_test, predict_test)

array([[296692,  45416],
       [  4119,   1894]])

In [73]:
f1_score(y_test, predict_test)

0.07103876376047859

# 6. XGBoost

## 6.1 Only count columns

In [47]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(xg, X_train_one_hot, y_train)

[[11741 11106]
 [ 5010 17837]]
Positive
Precision: 0.6162802750233217 || Recall: 0.7807151923666127 || F1: 0.6888202355667118
Negative
Precision: 0.7009133783057728 || Recall: 0.5138967917013174 || F1: 0.5930097479670691
[[11593 11254]
 [ 5165 17681]]
Positive
Precision: 0.611059270779333 || Recall: 0.7739210365052963 || F1: 0.6829145825688958
Negative
Precision: 0.6917889963002745 || Recall: 0.5074189171444828 || F1: 0.5854311324327736
[[11673 11174]
 [ 5346 17500]]
Positive
Precision: 0.6103089907233034 || Recall: 0.765998424231813 || F1: 0.6793478260869565
Negative
Precision: 0.6858804865150714 || Recall: 0.5109204709589881 || F1: 0.5856117995284202
[[11412 11434]
 [ 5095 17752]]
Positive
Precision: 0.6082368258754197 || Recall: 0.7769947914387009 || F1: 0.6823362097130666
Negative
Precision: 0.6913430665778155 || Recall: 0.4995185152761972 || F1: 0.5799811958427566
[[11663 11183]
 [ 5154 17693]]
Positive
Precision: 0.6127233688876575 || Recall: 0.7744123955005033 || F1: 0.684144384

## 6.2 Only count columns with frequencies

In [48]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(xg, X_train_one_hot_more, y_train)

[[11952 10895]
 [ 5240 17607]]
Positive
Precision: 0.6177461230790822 || Recall: 0.7706482251499103 || F1: 0.6857777171902082
Negative
Precision: 0.695207073057236 || Recall: 0.5231321398870749 || F1: 0.5970179075401483
[[11956 10891]
 [ 5464 17382]]
Positive
Precision: 0.61479149718813 || Recall: 0.7608334062855642 || F1: 0.6800602515698663
Negative
Precision: 0.686337543053961 || Recall: 0.5233072175778002 || F1: 0.593836143740532
[[11707 11140]
 [ 5386 17460]]
Positive
Precision: 0.6104895104895105 || Recall: 0.7642475706907117 || F1: 0.6787699723982428
Negative
Precision: 0.6849002515649681 || Recall: 0.5124086313301528 || F1: 0.586229344016024
[[11689 11157]
 [ 5357 17490]]
Positive
Precision: 0.6105351345690648 || Recall: 0.7655272026961965 || F1: 0.6793024430030684
Negative
Precision: 0.6857327232195236 || Recall: 0.5116431760483235 || F1: 0.5860322871753735
[[12011 10835]
 [ 5449 17398]]
Positive
Precision: 0.6162292352920341 || Recall: 0.7615004158095154 || F1: 0.6812059514487

## 6.3 Scaled count pids and frequencies

In [49]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(xg, X_train_one_hot_more_scaled, y_train)

[[11952 10895]
 [ 5240 17607]]
Positive
Precision: 0.6177461230790822 || Recall: 0.7706482251499103 || F1: 0.6857777171902082
Negative
Precision: 0.695207073057236 || Recall: 0.5231321398870749 || F1: 0.5970179075401483
[[11956 10891]
 [ 5464 17382]]
Positive
Precision: 0.61479149718813 || Recall: 0.7608334062855642 || F1: 0.6800602515698663
Negative
Precision: 0.686337543053961 || Recall: 0.5233072175778002 || F1: 0.593836143740532
[[11707 11140]
 [ 5386 17460]]
Positive
Precision: 0.6104895104895105 || Recall: 0.7642475706907117 || F1: 0.6787699723982428
Negative
Precision: 0.6849002515649681 || Recall: 0.5124086313301528 || F1: 0.586229344016024
[[11689 11157]
 [ 5356 17491]]
Positive
Precision: 0.6105487294051941 || Recall: 0.7655709721188777 || F1: 0.6793280901058355
Negative
Precision: 0.6857729539454386 || Recall: 0.5116431760483235 || F1: 0.5860469780150912
[[12011 10835]
 [ 5449 17398]]
Positive
Precision: 0.6162292352920341 || Recall: 0.7615004158095154 || F1: 0.6812059514487

## 6.4 Scaled counts

In [50]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(xg, X_train_count_scaled, y_train)

[[11741 11106]
 [ 5010 17837]]
Positive
Precision: 0.6162802750233217 || Recall: 0.7807151923666127 || F1: 0.6888202355667118
Negative
Precision: 0.7009133783057728 || Recall: 0.5138967917013174 || F1: 0.5930097479670691
[[11593 11254]
 [ 5165 17681]]
Positive
Precision: 0.611059270779333 || Recall: 0.7739210365052963 || F1: 0.6829145825688958
Negative
Precision: 0.6917889963002745 || Recall: 0.5074189171444828 || F1: 0.5854311324327736
[[11673 11174]
 [ 5346 17500]]
Positive
Precision: 0.6103089907233034 || Recall: 0.765998424231813 || F1: 0.6793478260869565
Negative
Precision: 0.6858804865150714 || Recall: 0.5109204709589881 || F1: 0.5856117995284202
[[11412 11434]
 [ 5095 17752]]
Positive
Precision: 0.6082368258754197 || Recall: 0.7769947914387009 || F1: 0.6823362097130666
Negative
Precision: 0.6913430665778155 || Recall: 0.4995185152761972 || F1: 0.5799811958427566
[[11663 11183]
 [ 5154 17693]]
Positive
Precision: 0.6127233688876575 || Recall: 0.7744123955005033 || F1: 0.684144384

## 6.5 Top 10

In [51]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(xg, X_train_top_10, y_train)

[[10885 11962]
 [ 5051 17796]]
Positive
Precision: 0.5980240607567713 || Recall: 0.7789206460366788 || F1: 0.6765896777872826
Negative
Precision: 0.6830446787148594 || Recall: 0.476430165886112 || F1: 0.5613284170899622
[[10701 12146]
 [ 5237 17609]]
Positive
Precision: 0.591799697529827 || Recall: 0.7707695001313141 || F1: 0.6695309975095531
Negative
Precision: 0.6714142301417995 || Recall: 0.46837659211275 || F1: 0.5518112672424906
[[10593 12254]
 [ 5209 17637]]
Positive
Precision: 0.5900438259007728 || Recall: 0.7719950976100849 || F1: 0.6688662608794584
Negative
Precision: 0.6703581825085432 || Recall: 0.46364949446316805 || F1: 0.5481642474578903
[[10576 12270]
 [ 5208 17639]]
Positive
Precision: 0.5897555919622856 || Recall: 0.7720488466757124 || F1: 0.6687011903859276
Negative
Precision: 0.670045615813482 || Recall: 0.46292567626718023 || F1: 0.5475537147294848
[[10793 12053]
 [ 5175 17672]]
Positive
Precision: 0.5945164003364172 || Recall: 0.7734932376241958 || F1: 0.6722970402

## 6.6 Grid Search

In [29]:
# initialize parameters
learning_rate = [0.05, 0.15, 0.25]
max_depth = [3,8,12]
min_child_weight = [1,3,5]
gamma = [0, 0.2, 0.4]
colsample_bytree = [0.3, 0.5, 0.7]

parameters = []

# loop through all parameters
for rate in learning_rate:
    for depth in max_depth:
        for weight in min_child_weight:
            for gam in gamma:
                for bytree in colsample_bytree:
                    parameter_combination = [rate, depth, weight, gam, bytree]
                    
                    parameters.append(parameter_combination)

In [30]:
def get_cross_validated_two_class_confusion_matrix(model, X_train, y_train, folds=5):
    """Get cross validated confusion matrix for two class problem"""
    # k-fold
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=88)
    kf.get_n_splits(X_train.values)
    
    overall_precision = []
    overall_recall = []
    overall_f1 = []
    
    for train_index, test_index in kf.split(X_train.values, y_train):
        # print("TRAIN:", train_index, "TEST:", test_index) # for debugging
        X_tr, X_te = X_train.values[train_index], X_train.values[test_index]
        y_tr, y_te = y_train[train_index], y_train[test_index]
        
        # fit model
        model.fit(X_tr, y_tr)
        
        # predicted score
        y_predict = model.predict(X_te)
        f1 = f1_score(y_te, y_predict)
        precision = precision_score(y_te, y_predict)
        recall = recall_score(y_te, y_predict)
        
        print(confusion_matrix(y_te, y_predict))
        print('Precision: {} || Recall: {} || F1: {}'.format(precision, recall, f1))
        
        overall_precision.append(precision)
        overall_recall.append(recall)
        overall_f1.append(f1)
    
    print('---------------------------')
    print('Overall Precision: {} (+/- {}) || Overall Recall: {} (+/- {}) || Overall F1: {} (+/- {})'.format(
                                                                            np.mean(overall_precision), np.std(overall_precision, axis=0),
                                                                           np.mean(overall_recall), np.std(overall_recall, axis=0),
                                                                            np.mean(overall_f1), np.std(overall_f1, axis=0)))
    
    return np.mean(overall_f1)

In [34]:
# loop through parameter combination and train model
max_f1 = 0
# best parameter combination
best_parameters = None

for parameter in parameters:
    print(parameter)
    
    xg = xgb.XGBClassifier(random_state=88, learning_rate=parameter[0], 
                          max_depth=parameter[1], min_child_weight=parameter[2], gamma=parameter[3],
                          colsample_bytree=parameter[4])
    f1 = get_cross_validated_two_class_confusion_matrix(xg, X_train_one_hot, y_train)
    
    if f1 > max_f1:
        best_parameters=parameter
        max_f1=f1

[0.05, 3, 1, 0, 0.3]
[[11966 10881]
 [ 5738 17109]]
Precision: 0.6112540192926045 || Recall: 0.7488510526546155 || F1: 0.6730924326769872
[[11952 10895]
 [ 5853 16993]]
Precision: 0.6093301778542742 || Recall: 0.7438063555983542 || F1: 0.6698860724563409
[[11761 11086]
 [ 5805 17041]]
Precision: 0.6058591389056778 || Recall: 0.7459073798476757 || F1: 0.6686284895925293
[[11711 11135]
 [ 5851 16996]]
Precision: 0.6041733319114144 || Recall: 0.7439051078916269 || F1: 0.6667974420338185
[[11826 11020]
 [ 5809 17038]]
Precision: 0.6072421412787796 || Recall: 0.7457434236442422 || F1: 0.6694037913760927
---------------------------
Overall Precision: 0.6075717618485501 (+/- 0.0024996839734482104) || Overall Recall: 0.7456426639273029 (+/- 0.0018311261300458584) || Overall F1: 0.6695616456271537 (+/- 0.002054664430689797)
[0.05, 3, 1, 0, 0.5]
[[11910 10937]
 [ 5709 17138]]
Precision: 0.6104363312555654 || Recall: 0.7501203659123736 || F1: 0.6731078904991948
[[11775 11072]
 [ 5707 17139]]
Prec

[[11867 10980]
 [ 5659 17188]]
Precision: 0.6101959670548139 || Recall: 0.7523088370464394 || F1: 0.6738410271488778
[[11775 11072]
 [ 5707 17139]]
Precision: 0.6075289780582043 || Recall: 0.7501969710233739 || F1: 0.6713672953757566
[[11721 11126]
 [ 5767 17079]]
Precision: 0.6055309342315193 || Recall: 0.747570690711722 || F1: 0.6690956102720809
[[11667 11179]
 [ 5850 16997]]
Precision: 0.6032438955139126 || Recall: 0.7439488773143083 || F1: 0.6662485545734277
[[11817 11029]
 [ 5812 17035]]
Precision: 0.6070054161915621 || Recall: 0.7456121153761982 || F1: 0.669207047592858
---------------------------
Overall Precision: 0.6067010382100024 (+/- 0.002294063509137619) || Overall Recall: 0.7479274982944084 (+/- 0.0030220645373605598) || Overall F1: 0.6699519069926001 (+/- 0.0025350131599140226)
[0.05, 3, 3, 0, 0.7]
[[11934 10913]
 [ 5731 17116]]
Precision: 0.6106532519890114 || Recall: 0.7491574386133847 || F1: 0.6728516392798176
[[11188 11659]
 [ 5065 17781]]
Precision: 0.60397418478260

[[11970 10877]
 [ 5754 17093]]
Precision: 0.6111190561315696 || Recall: 0.7481507418917145 || F1: 0.6727276305173465
[[11129 11718]
 [ 5027 17819]]
Precision: 0.6032772454887091 || Recall: 0.7799614812220957 || F1: 0.6803352232594544
[[11672 11175]
 [ 5699 17147]]
Precision: 0.6054304074571005 || Recall: 0.7505471417315942 || F1: 0.6702235772357724
[[10917 11929]
 [ 5069 17778]]
Precision: 0.5984448109873094 || Recall: 0.7781327964284152 || F1: 0.6765612512843932
[[11850 10996]
 [ 5849 16998]]
Precision: 0.6072015431878259 || Recall: 0.7439926467369895 || F1: 0.6686729214610255
---------------------------
Overall Precision: 0.605094612650503 (+/- 0.004204010448569002) || Overall Recall: 0.7601569616021617 (+/- 0.015576473669080628) || Overall F1: 0.6737041207515985 (+/- 0.004254771392113344)
[0.05, 3, 5, 0.2, 0.3]
[[11966 10881]
 [ 5736 17111]]
Precision: 0.6112817947985139 || Recall: 0.7489385914999781 || F1: 0.6731446330572985
[[11951 10896]
 [ 5853 16993]]
Precision: 0.6093083294488

[[12022 10825]
 [ 5340 17507]]
Precision: 0.6179231963857122 || Recall: 0.7662712828817788 || F1: 0.6841477949940405
[[11962 10885]
 [ 5491 17355]]
Precision: 0.6145538243626062 || Recall: 0.7596515801453209 || F1: 0.6794425087108014
[[11747 11100]
 [ 5473 17373]]
Precision: 0.6101569908334211 || Recall: 0.7604394642388165 || F1: 0.6770591788616303
[[11755 11091]
 [ 5490 17357]]
Precision: 0.610130764904387 || Recall: 0.7597058694795815 || F1: 0.6767521200896773
[[11876 10970]
 [ 5386 17461]]
Precision: 0.6141535647708487 || Recall: 0.7642578894384383 || F1: 0.6810328015913257
---------------------------
Overall Precision: 0.613383668251395 (+/- 0.0029516825372037587) || Overall Recall: 0.7620652172367872 (+/- 0.002703110735912113) || Overall F1: 0.6796868808494951 (+/- 0.002730883148172413)
[0.05, 8, 1, 0.2, 0.5]
[[11871 10976]
 [ 5162 17685]]
Precision: 0.6170405777886326 || Recall: 0.7740622401190528 || F1: 0.6866894462996039
[[11786 11061]
 [ 5294 17552]]
Precision: 0.6134274630412

[[11858 10989]
 [ 5169 17678]]
Precision: 0.6166672480552552 || Recall: 0.7737558541602836 || F1: 0.6863376946072912
[[11801 11046]
 [ 5295 17551]]
Precision: 0.6137357065426443 || Recall: 0.7682307624967172 || F1: 0.6823474525202652
[[11592 11255]
 [ 5324 17522]]
Precision: 0.6088890433332175 || Recall: 0.7669613936794187 || F1: 0.6788447010053659
[[11532 11314]
 [ 5298 17549]]
Precision: 0.6080102553442123 || Recall: 0.768109598634394 || F1: 0.6787468574743764
[[11743 11103]
 [ 5266 17581]]
Precision: 0.6129200948263841 || Recall: 0.7695102201601961 || F1: 0.6823465486794357
---------------------------
Overall Precision: 0.6120444696203426 (+/- 0.003200896192203637) || Overall Recall: 0.7693135658262019 (+/- 0.0023633583066303126) || Overall F1: 0.6817246508573469 (+/- 0.0028005776558753586)
[0.05, 8, 3, 0.2, 0.7]
[[11573 11274]
 [ 4948 17899]]
Precision: 0.613546772700785 || Recall: 0.7834288965728542 || F1: 0.6881584006151481
[[11532 11315]
 [ 5103 17743]]
Precision: 0.610606373459

[[11598 11249]
 [ 4966 17881]]
Precision: 0.6138345348438037 || Recall: 0.7826410469645906 || F1: 0.6880350924447353
[[11524 11323]
 [ 5105 17741]]
Precision: 0.6104115056427195 || Recall: 0.7765473168169482 || F1: 0.6835291851281063
[[11555 11292]
 [ 5271 17575]]
Precision: 0.6088266879135344 || Recall: 0.769281274621378 || F1: 0.6797130315394582
[[11311 11535]
 [ 5047 17800]]
Precision: 0.60678370547128 || Recall: 0.779095723727404 || F1: 0.6822275880571844
[[11459 11387]
 [ 5037 17810]]
Precision: 0.6099941774839881 || Recall: 0.7795334179542172 || F1: 0.6844208746445316
---------------------------
Overall Precision: 0.6099701222710652 (+/- 0.00230606923652851) || Overall Recall: 0.7774197560169076 (+/- 0.004506314309837291) || Overall F1: 0.6835851543628032 (+/- 0.002732757995533995)
[0.05, 8, 5, 0.4, 0.3]
[[12025 10822]
 [ 5335 17512]]
Precision: 0.6180560457401002 || Recall: 0.7664901299951854 || F1: 0.6843164455559679
[[11955 10892]
 [ 5496 17350]]
Precision: 0.614333262516819 |

[[12095 10752]
 [ 5379 17468]]
Precision: 0.6189936215450036 || Recall: 0.7645642753972075 || F1: 0.6841208608298902
[[11974 10873]
 [ 5536 17310]]
Precision: 0.6142000496753361 || Recall: 0.7576818699115819 || F1: 0.6784377510827176
[[11777 11070]
 [ 5504 17342]]
Precision: 0.6103758975080952 || Recall: 0.7590825527444629 || F1: 0.6766553513597876
[[11793 11053]
 [ 5513 17334]]
Precision: 0.6106316271532743 || Recall: 0.7586991727579113 || F1: 0.6766600304485302
[[11963 10883]
 [ 5453 17394]]
Precision: 0.6151289033490116 || Recall: 0.7613253381187902 || F1: 0.6804631875440107
---------------------------
Overall Precision: 0.6138660198461442 (+/- 0.003182483303239438) || Overall Recall: 0.7602706417859908 (+/- 0.002454810959578456) || Overall F1: 0.6792674362529871 (+/- 0.0028034079715172897)
[0.05, 12, 1, 0.4, 0.5]
[[11949 10898]
 [ 5263 17584]]
Precision: 0.6173723755354259 || Recall: 0.7696415284282401 || F1: 0.6851487463227416
[[11953 10894]
 [ 5413 17433]]
Precision: 0.6154199173

[[11978 10869]
 [ 5246 17601]]
Precision: 0.6182297154899895 || Recall: 0.7703856086138224 || F1: 0.6859715104156519
[[11916 10931]
 [ 5407 17439]]
Precision: 0.6146986253084243 || Recall: 0.7633283725816336 || F1: 0.6809981255857545
[[11710 11137]
 [ 5389 17457]]
Precision: 0.6105126949709729 || Recall: 0.7641162566751292 || F1: 0.6787325038880249
[[11683 11163]
 [ 5398 17449]]
Precision: 0.6098490143995526 || Recall: 0.7637326563662625 || F1: 0.6781709710643425
[[11841 11005]
 [ 5308 17539]]
Precision: 0.6144548766816144 || Recall: 0.7676719044075808 || F1: 0.6825708781693293
---------------------------
Overall Precision: 0.6135489853701107 (+/- 0.003064649737964612) || Overall Recall: 0.7658469597288857 (+/- 0.0027473132245457803) || Overall F1: 0.6812887978246206 (+/- 0.002825296443501831)
[0.05, 12, 3, 0.4, 0.7]
[[11960 10887]
 [ 5240 17607]]
Precision: 0.6179195620130554 || Recall: 0.7706482251499103 || F1: 0.6858845756802556
[[11865 10982]
 [ 5369 17477]]
Precision: 0.6141115288

[[11946 10901]
 [ 5252 17595]]
Precision: 0.6174550814149354 || Recall: 0.7701229920777345 || F1: 0.6853904134935629
[[11902 10945]
 [ 5405 17441]]
Precision: 0.6144226026914676 || Recall: 0.7634159152586886 || F1: 0.680863522798251
[[11637 11210]
 [ 5321 17525]]
Precision: 0.6098834174351836 || Recall: 0.7670927076950013 || F1: 0.679513774451833
[[11649 11197]
 [ 5361 17486]]
Precision: 0.6096293972039187 || Recall: 0.7653521250054712 || F1: 0.6786726178924898
[[11809 11037]
 [ 5310 17537]]
Precision: 0.6137397634212921 || Recall: 0.7675843655622182 || F1: 0.6820948639660839
---------------------------
Overall Precision: 0.6130260524333595 (+/- 0.0029492187167724482) || Overall Recall: 0.7667136211198227 (+/- 0.002247409551463559) || Overall F1: 0.681307038520444 (+/- 0.0023514801451207526)
[0.15, 3, 1, 0, 0.3]
[[11712 11135]
 [ 5096 17751]]
Precision: 0.6145191442221145 || Recall: 0.7769510220160196 || F1: 0.6862544217424081
[[11598 11249]
 [ 5248 17598]]
Precision: 0.610046105314244

[[11705 11142]
 [ 5089 17758]]
Precision: 0.6144636678200692 || Recall: 0.7772574079747888 || F1: 0.686339304693992
[[11550 11297]
 [ 5211 17635]]
Precision: 0.6095326973593254 || Recall: 0.7719075549330299 || F1: 0.6811773340028584
[[11517 11330]
 [ 5285 17561]]
Precision: 0.6078363504205462 || Recall: 0.7686684758819925 || F1: 0.6788565243442798
[[11518 11328]
 [ 5373 17474]]
Precision: 0.606693979584751 || Recall: 0.7648268919332954 || F1: 0.6766442719123313
[[11690 11156]
 [ 5322 17525]]
Precision: 0.6110316934555978 || Recall: 0.7670591324900424 || F1: 0.6802126998913213
---------------------------
Overall Precision: 0.6099116777280579 (+/- 0.0027121666586821364) || Overall Recall: 0.7699438926426299 (+/- 0.0043237877240924535) || Overall F1: 0.6806460269689565 (+/- 0.0032279973343179924)
[0.15, 3, 3, 0, 0.5]
[[11637 11210]
 [ 5027 17820]]
Precision: 0.6138477437133999 || Recall: 0.7799711121810303 || F1: 0.6870096574589896
[[11478 11369]
 [ 5157 17689]]
Precision: 0.6087480211989

[[11607 11240]
 [ 5001 17846]]
Precision: 0.6135597882142612 || Recall: 0.7811091171707445 || F1: 0.6872701365220573
[[11481 11366]
 [ 5161 17685]]
Precision: 0.6087570135279337 || Recall: 0.7740961218594065 || F1: 0.6815422856812533
[[11361 11486]
 [ 5094 17752]]
Precision: 0.6071550721663589 || Recall: 0.7770288015407512 || F1: 0.6816680746486445
[[11284 11562]
 [ 5107 17740]]
Precision: 0.6054194252952017 || Recall: 0.7764695583665251 || F1: 0.6803582043759229
[[11444 11402]
 [ 5097 17750]]
Precision: 0.608877607025247 || Recall: 0.7769072525933383 || F1: 0.6827054366430123
---------------------------
Overall Precision: 0.6087537812458005 (+/- 0.002712447108357275) || Overall Recall: 0.777122170306153 (+/- 0.002259801946919781) || Overall F1: 0.6827088275741782 (+/- 0.0023989567264353713)
[0.15, 3, 5, 0, 0.7]
[[11725 11122]
 [ 5110 17737]]
Precision: 0.6146089608094528 || Recall: 0.7763382500984812 || F1: 0.686071248984644
[[11622 11225]
 [ 5265 17581]]
Precision: 0.610324238005971 

[[11765 11082]
 [ 5060 17787]]
Precision: 0.6161280266029305 || Recall: 0.778526721232547 || F1: 0.6878722252301028
[[11658 11189]
 [ 5204 17642]]
Precision: 0.6119107904685929 || Recall: 0.7722139543027225 || F1: 0.6827795731176345
[[11669 11178]
 [ 5311 17535]]
Precision: 0.6106989865217846 || Recall: 0.7675304210802766 || F1: 0.6801916251284935
[[11435 11411]
 [ 5131 17716]]
Precision: 0.6082329110447351 || Recall: 0.7754190922221736 || F1: 0.6817254781236772
[[11672 11174]
 [ 5121 17726]]
Precision: 0.6133564013840831 || Recall: 0.7758567864489867 || F1: 0.6851025180203684
---------------------------
Overall Precision: 0.6120654232044253 (+/- 0.0026363073029800374) || Overall Recall: 0.7739093950573412 (+/- 0.0037671487393647687) || Overall F1: 0.6835342839240554 (+/- 0.0026941353939217115)
[0.15, 8, 1, 0.2, 0.3]
[[12026 10821]
 [ 5300 17547]]
Precision: 0.6185490693739425 || Recall: 0.7680220597890314 || F1: 0.6852289368349116
[[11915 10932]
 [ 5441 17405]]
Precision: 0.6142146310

[[12019 10828]
 [ 5300 17547]]
Precision: 0.6183964757709252 || Recall: 0.7680220597890314 || F1: 0.6851352934286049
[[11945 10902]
 [ 5429 17417]]
Precision: 0.6150287792648046 || Recall: 0.7623654031340278 || F1: 0.680816964721978
[[11740 11107]
 [ 5404 17442]]
Precision: 0.6109495954324144 || Recall: 0.7634596865972162 || F1: 0.6787430683918668
[[11748 11098]
 [ 5424 17423]]
Precision: 0.6108832088636443 || Recall: 0.7625946513765484 || F1: 0.6783600685251518
[[11921 10925]
 [ 5344 17503]]
Precision: 0.6156957928802589 || Recall: 0.7660962051910535 || F1: 0.6827108727450023
---------------------------
Overall Precision: 0.6141907704424094 (+/- 0.0029017404321505756) || Overall Recall: 0.7645076012175754 (+/- 0.002200981436088626) || Overall F1: 0.6811532535625207 (+/- 0.002530090291148254)
[0.15, 8, 3, 0.2, 0.5]
[[11995 10852]
 [ 5237 17610]]
Precision: 0.618719696437355 || Recall: 0.7707795334179542 || F1: 0.6864292814126176
[[11860 10987]
 [ 5371 17475]]
Precision: 0.6139765301103

[[11966 10881]
 [ 5229 17618]]
Precision: 0.61819712972385 || Recall: 0.7711296887994047 || F1: 0.6862462509250965
[[11834 11013]
 [ 5334 17512]]
Precision: 0.6139176161262051 || Recall: 0.7665236802941434 || F1: 0.6817854431488584
[[11593 11254]
 [ 5295 17551]]
Precision: 0.6093039402881444 || Recall: 0.7682307624967172 || F1: 0.6795996205300961
[[11655 11191]
 [ 5356 17491]]
Precision: 0.6098249773377031 || Recall: 0.7655709721188777 || F1: 0.6788798540627607
[[11815 11031]
 [ 5301 17546]]
Precision: 0.6139902718969801 || Recall: 0.76797829036635 || F1: 0.6824051026757934
---------------------------
Overall Precision: 0.6130467870745765 (+/- 0.0032423292954143015) || Overall Recall: 0.7678866788150985 (+/- 0.0018901937723755302) || Overall F1: 0.681783254268521 (+/- 0.002588552817611585)
[0.15, 8, 5, 0.2, 0.7]
[[11734 11113]
 [ 5066 17781]]
Precision: 0.6153872776354953 || Recall: 0.778264104696459 || F1: 0.6873079376123383
[[11663 11184]
 [ 5197 17649]]
Precision: 0.6121111226719385

[[11803 11044]
 [ 5134 17713]]
Precision: 0.6159543763257641 || Recall: 0.7752877839541297 || F1: 0.6864971707619564
[[11865 10982]
 [ 5410 17436]]
Precision: 0.6135547892181012 || Recall: 0.7631970585660509 || F1: 0.6802434456928839
[[11680 11167]
 [ 5359 17487]]
Precision: 0.6102812870803378 || Recall: 0.7654293968309551 || F1: 0.6791067961165048
[[11662 11184]
 [ 5399 17448]]
Precision: 0.6093880972338642 || Recall: 0.7636888869435812 || F1: 0.6778686454670836
[[11941 10905]
 [ 5417 17430]]
Precision: 0.6151402858655374 || Recall: 0.7629010373353176 || F1: 0.681098823805244
---------------------------
Overall Precision: 0.6128637671447209 (+/- 0.0026062002499518695) || Overall Recall: 0.7661008327260068 (+/- 0.0046763014530874) || Overall F1: 0.6809629763687345 (+/- 0.002972668005652294)
[0.15, 12, 1, 0.4, 0.3]
[[11995 10852]
 [ 5317 17530]]
Precision: 0.6176449862588965 || Recall: 0.767277979603449 || F1: 0.6843779890296512
[[11939 10908]
 [ 5478 17368]]
Precision: 0.61423115009195

[[11999 10848]
 [ 5316 17531]]
Precision: 0.6177455160506008 || Recall: 0.7673217490261304 || F1: 0.684457111623004
[[11943 10904]
 [ 5477 17369]]
Precision: 0.6143316945495703 || Recall: 0.7602643788847063 || F1: 0.6795516344216436
[[11742 11105]
 [ 5401 17445]]
Precision: 0.6110332749562172 || Recall: 0.7635910006127987 || F1: 0.6788466028484706
[[11755 11091]
 [ 5475 17372]]
Precision: 0.610336225977585 || Recall: 0.7603624108198013 || F1: 0.6771389592671994
[[11955 10891]
 [ 5404 17443]]
Precision: 0.615620808922143 || Recall: 0.7634700398301746 || F1: 0.6816201324710341
---------------------------
Overall Precision: 0.6138135040912231 (+/- 0.002786278064888722) || Overall Recall: 0.7630019158347222 (+/- 0.002595697017333357) || Overall F1: 0.6803228881262704 (+/- 0.0025173915185739545)
[0.15, 12, 3, 0.4, 0.5]
[[12007 10840]
 [ 5282 17565]]
Precision: 0.6183770462946664 || Recall: 0.7688099093972951 || F1: 0.6854366658862093
[[11896 10951]
 [ 5374 17472]]
Precision: 0.6147134363015

[[11982 10865]
 [ 5271 17576]]
Precision: 0.6179810836468479 || Recall: 0.7692913730467895 || F1: 0.6853844953985337
[[11897 10950]
 [ 5400 17446]]
Precision: 0.6143823073672349 || Recall: 0.7636347719513262 || F1: 0.6809258030521838
[[11738 11109]
 [ 5387 17459]]
Precision: 0.6111383366003921 || Recall: 0.7642037993521842 || F1: 0.6791535379468627
[[11749 11097]
 [ 5475 17372]]
Precision: 0.6102075942252977 || Recall: 0.7603624108198013 || F1: 0.6770597864213891
[[11939 10907]
 [ 5349 17498]]
Precision: 0.6160183066361556 || Recall: 0.765877358077647 || F1: 0.6828221337703895
---------------------------
Overall Precision: 0.6139455256951856 (+/- 0.002919772913327942) || Overall Recall: 0.7646739426495497 (+/- 0.002920927569967276) || Overall F1: 0.6810691513178717 (+/- 0.0028795927558927944)
[0.15, 12, 5, 0.4, 0.7]
[[12003 10844]
 [ 5324 17523]]
Precision: 0.6177248210949342 || Recall: 0.7669715936446798 || F1: 0.6843050728316475
[[11881 10966]
 [ 5379 17467]]
Precision: 0.61432138712

[[11697 11150]
 [ 5063 17784]]
Precision: 0.6146402156632336 || Recall: 0.778395412964503 || F1: 0.686892875765242
[[11607 11240]
 [ 5233 17613]]
Precision: 0.6104391224482723 || Recall: 0.7709445854854241 || F1: 0.681367144432194
[[11552 11295]
 [ 5250 17596]]
Precision: 0.6090478003530512 || Recall: 0.7702004727304561 || F1: 0.6802095212323869
[[11274 11572]
 [ 5061 17786]]
Precision: 0.6058314599087131 || Recall: 0.7784829518098656 || F1: 0.6813906713916291
[[11512 11334]
 [ 5054 17793]]
Precision: 0.610876506334329 || Recall: 0.7787893377686348 || F1: 0.6846884980952014
---------------------------
Overall Precision: 0.6101670209415199 (+/- 0.002850846329278317) || Overall Recall: 0.7753625521517767 (+/- 0.003920293147623064) || Overall F1: 0.6829097421833306 (+/- 0.0024902892523276723)
[0.25, 3, 3, 0, 0.3]
[[11719 11128]
 [ 5107 17740]]
Precision: 0.6145212692254399 || Recall: 0.7764695583665251 || F1: 0.6860678719907183
[[11656 11191]
 [ 5265 17581]]
Precision: 0.6110454608647296 

[[11732 11115]
 [ 5140 17707]]
Precision: 0.6143570883352995 || Recall: 0.7750251674180417 || F1: 0.6854013044572181
[[11654 11193]
 [ 5250 17596]]
Precision: 0.6112056688318455 || Recall: 0.7702004727304561 || F1: 0.681553210031955
[[11638 11209]
 [ 5396 17450]]
Precision: 0.6088837712411459 || Recall: 0.7638098573054364 || F1: 0.6776041161052325
[[11382 11464]
 [ 5145 17702]]
Precision: 0.6069395871905644 || Recall: 0.7748063203046351 || F1: 0.6806759848499414
[[11587 11259]
 [ 5167 17680]]
Precision: 0.6109402536369605 || Recall: 0.7738433930056463 || F1: 0.6828100258757194
---------------------------
Overall Precision: 0.6104652738471632 (+/- 0.0024840139869768876) || Overall Recall: 0.7715370421528431 (+/- 0.004234861396612167) || Overall F1: 0.6816089282640133 (+/- 0.0025587794366062518)
[0.25, 3, 5, 0, 0.5]
[[11720 11127]
 [ 5094 17753]]
Precision: 0.6147160664819945 || Recall: 0.7770385608613822 || F1: 0.6864113519051945
[[11626 11221]
 [ 5229 17617]]
Precision: 0.6108953464179

[[11783 11064]
 [ 5065 17782]]
Precision: 0.6164459543784233 || Recall: 0.7783078741191404 || F1: 0.6879848335364556
[[11662 11185]
 [ 5195 17651]]
Precision: 0.6121167984463864 || Recall: 0.7726078963494704 || F1: 0.6830618010138927
[[11659 11188]
 [ 5315 17531]]
Precision: 0.610432118109962 || Recall: 0.7673553357261665 || F1: 0.6799573354019198
[[11647 11199]
 [ 5331 17516]]
Precision: 0.6099947762493471 || Recall: 0.7666652076859106 || F1: 0.6794150731158605
[[11872 10974]
 [ 5343 17504]]
Precision: 0.6146499051899712 || Recall: 0.7661399746137348 || F1: 0.6820847540185095
---------------------------
Overall Precision: 0.612727910474818 (+/- 0.0024751588774249984) || Overall Recall: 0.7702152576988845 (+/- 0.004660581535526013) || Overall F1: 0.6825007594173276 (+/- 0.0030513394692241683)
[0.25, 8, 1, 0, 0.7]
[[11795 11052]
 [ 5058 17789]]
Precision: 0.6167955341354322 || Recall: 0.7786142600779096 || F1: 0.6883222411391426
[[11674 11173]
 [ 5263 17583]]
Precision: 0.61145500069550

[[11957 10890]
 [ 5251 17596]]
Precision: 0.6177069437618479 || Recall: 0.7701667615004159 || F1: 0.6855628932655408
[[11658 11189]
 [ 5213 17633]]
Precision: 0.611789605162723 || Recall: 0.7718200122559747 || F1: 0.682550127738639
[[11654 11193]
 [ 5337 17509]]
Precision: 0.610027175806564 || Recall: 0.7663923662785608 || F1: 0.679328004966245
[[11638 11208]
 [ 5332 17515]]
Precision: 0.6097900637120078 || Recall: 0.7666214382632293 || F1: 0.6792708939305798
[[11703 11143]
 [ 5165 17682]]
Precision: 0.6134258456201214 || Recall: 0.7739309318510089 || F1: 0.6843938690199721
---------------------------
Overall Precision: 0.6125479268126528 (+/- 0.002895556718696366) || Overall Recall: 0.7697863020298378 (+/- 0.0029323657886483182) || Overall F1: 0.6822211577841955 (+/- 0.0025717885374285524)
[0.25, 8, 3, 0.2, 0.3]
[[12041 10806]
 [ 5306 17541]]
Precision: 0.6187956397502381 || Recall: 0.7677594432529435 || F1: 0.6852756182365121
[[11929 10918]
 [ 5451 17395]]
Precision: 0.61438208596757

[[11986 10861]
 [ 5268 17579]]
Precision: 0.6181082981715893 || Recall: 0.7694226813148335 || F1: 0.6855148478171856
[[11925 10922]
 [ 5430 17416]]
Precision: 0.6145811278142423 || Recall: 0.7623216317955003 || F1: 0.6805251641137856
[[11712 11135]
 [ 5369 17477]]
Precision: 0.6108276247728226 || Recall: 0.7649916834456798 || F1: 0.679272416339539
[[11789 11057]
 [ 5486 17361]]
Precision: 0.6109156168625519 || Recall: 0.7598809471703069 || F1: 0.6773042036477128
[[11956 10890]
 [ 5408 17439]]
Precision: 0.6155882664407497 || Recall: 0.7632949621394494 || F1: 0.6815304048772862
---------------------------
Overall Precision: 0.6140041868123911 (+/- 0.0028041294536299425) || Overall Recall: 0.7639823811731541 (+/- 0.003183330049236838) || Overall F1: 0.6808294073591019 (+/- 0.002734541862940108)
[0.25, 8, 5, 0.2, 0.5]
[[11958 10889]
 [ 5213 17634]]
Precision: 0.6182379132629807 || Recall: 0.7718299995623058 || F1: 0.6865485692038154
[[11672 11175]
 [ 5206 17640]]
Precision: 0.612181155648

[[11969 10878]
 [ 5320 17527]]
Precision: 0.617039253652526 || Recall: 0.767146671335405 || F1: 0.683953796924998
[[11857 10990]
 [ 5426 17420]]
Precision: 0.6131643787398803 || Recall: 0.7624967171496104 || F1: 0.6797253004526299
[[11738 11109]
 [ 5423 17423]]
Precision: 0.6106476938174681 || Recall: 0.7626280311651931 || F1: 0.678228035345868
[[11728 11118]
 [ 5463 17384]]
Precision: 0.6099221107290717 || Recall: 0.760887643891977 || F1: 0.6770920563204736
[[11928 10918]
 [ 5422 17425]]
Precision: 0.6147902480330241 || Recall: 0.762682190221911 || F1: 0.6807970306700528
---------------------------
Overall Precision: 0.6131127369943941 (+/- 0.002626433972147955) || Overall Recall: 0.7631682507528192 (+/- 0.0020980013064087082) || Overall F1: 0.6799592439428045 (+/- 0.002363510852619635)
[0.25, 12, 1, 0.2, 0.7]
[[11823 11024]
 [ 5169 17678]]
Precision: 0.6159152672287646 || Recall: 0.7737558541602836 || F1: 0.6858716948922385
[[11882 10965]
 [ 5438 17408]]
Precision: 0.6135410425404434

[[11819 11028]
 [ 5213 17634]]
Precision: 0.6152396901821227 || Recall: 0.7718299995623058 || F1: 0.6846958783901842
[[11860 10987]
 [ 5382 17464]]
Precision: 0.613827281993603 || Recall: 0.7644226560448218 || F1: 0.6808975183733941
[[11674 11173]
 [ 5437 17409]]
Precision: 0.6090896368343712 || Recall: 0.7620152324258076 || F1: 0.6770241891576573
[[11663 11183]
 [ 5426 17421]]
Precision: 0.6090406936092854 || Recall: 0.7625071125311857 || F1: 0.6771880041204252
[[11936 10910]
 [ 5393 17454]]
Precision: 0.6153574954167255 || Recall: 0.7639515034796691 || F1: 0.6816504266661461
---------------------------
Overall Precision: 0.6125109596072215 (+/- 0.0028645989912214065) || Overall Recall: 0.764945300808758 (+/- 0.0035549945263905276) || Overall F1: 0.6802912033415615 (+/- 0.002895496341968759)
[0.25, 12, 3, 0.4, 0.3]
[[12026 10821]
 [ 5359 17488]]
Precision: 0.6177540711434526 || Recall: 0.7654396638508338 || F1: 0.6837125654859645
[[11948 10899]
 [ 5476 17370]]
Precision: 0.61445399554

[[12046 10801]
 [ 5328 17519]]
Precision: 0.6186087570621469 || Recall: 0.7667965159539546 || F1: 0.6847772978677663
[[11945 10902]
 [ 5479 17367]]
Precision: 0.6143478722275284 || Recall: 0.7601768362076512 || F1: 0.6795265577619095
[[11800 11047]
 [ 5506 17340]]
Precision: 0.6108429915101984 || Recall: 0.7589950100674079 || F1: 0.676907461987391
[[11733 11113]
 [ 5446 17401]]
Precision: 0.6102616258679947 || Recall: 0.7616317240775594 || F1: 0.6775958412024689
[[11982 10864]
 [ 5475 17372]]
Precision: 0.6152429522595269 || Recall: 0.7603624108198013 || F1: 0.6801479944404205
---------------------------
Overall Precision: 0.613860839785479 (+/- 0.0030578392041778683) || Overall Recall: 0.7615924994252747 (+/- 0.002733038524843711) || Overall F1: 0.6797910306519912 (+/- 0.0027638398928197627)
[0.25, 12, 5, 0.4, 0.5]
[[12034 10813]
 [ 5329 17518]]
Precision: 0.6183332745049592 || Recall: 0.7667527465312732 || F1: 0.684591035210442
[[11902 10945]
 [ 5430 17416]]
Precision: 0.614082719227

In [35]:
max_f1

0.6837951685957122

In [36]:
best_parameters

[0.05, 8, 3, 0.4, 0.7]

In [55]:
# model with the best parameters
xg = xgb.XGBClassifier(random_state=88, learning_rate=0.05,
                       max_depth=8, min_child_weight=3, gamma=0.4,
                          colsample_bytree=0.7)

util_ml.get_cross_validated_two_class_confusion_matrix(xg, X_train_one_hot, y_train)

[[11593 11254]
 [ 4945 17902]]
Positive
Precision: 0.6140074084236521 || Recall: 0.7835602048408982 || F1: 0.6884987404572813
Negative
Precision: 0.700991655581086 || Recall: 0.5074189171444828 || F1: 0.5887012822140408
[[11536 11311]
 [ 5097 17749]]
Positive
Precision: 0.6107708189951824 || Recall: 0.7768974875251685 || F1: 0.6838901090432704
Negative
Precision: 0.6935609932062767 || Recall: 0.5049240600516479 || F1: 0.5843971631205674
[[11565 11282]
 [ 5275 17571]]
Positive
Precision: 0.6089834679236128 || Recall: 0.7691061892672678 || F1: 0.6797423547844254
Negative
Precision: 0.6867577197149644 || Recall: 0.5061933733094061 || F1: 0.5828104921006879
[[11299 11547]
 [ 5043 17804]]
Positive
Precision: 0.6065892133147082 || Recall: 0.7792708014181293 || F1: 0.682171730717652
Negative
Precision: 0.6914086403133032 || Recall: 0.49457235402258604 || F1: 0.5766561192201695
[[11482 11364]
 [ 5039 17808]]
Positive
Precision: 0.6104483751542575 || Recall: 0.7794458791088545 || F1: 0.68467290

In [64]:
# test model on testing set
xg = xgb.XGBClassifier(random_state=88, learning_rate=0.05,
                       max_depth=8, min_child_weight=3, gamma=0.4,
                          colsample_bytree=0.7)
xg.fit(X_train_one_hot, y_train)
y_predict = xg.predict(X_test_one_hot)

In [65]:
confusion_matrix(y_test, y_predict)

array([[173353, 168755],
       [  1333,   4680]])

In [66]:
f1_score(y_test, y_predict)

0.052159957202086396

In [67]:
precision_score(y_test, y_predict)

0.026984172744832356

In [68]:
recall_score(y_test,y_predict)

0.7783136537502079

# 7. Support Vector Machines

## 7.1 Only count columns

In [61]:
# fit SVM
svm = SVC(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(svm, X_train_one_hot, y_train)

[[20784  2063]
 [17091  5756]]
Precision: 0.7361555186085177 || Recall: 0.2519367969536482 || F1: 0.37539946520576534
[[21334  1513]
 [18091  4755]]
Precision: 0.7586151882578175 || Recall: 0.2081327146984155 || F1: 0.3266469739644158
[[21264  1583]
 [18053  4793]]
Precision: 0.7517252195734002 || Recall: 0.2097960255624617 || F1: 0.32804051741838347
[[21268  1578]
 [18007  4840]]
Precision: 0.7541290121533188 || Recall: 0.2118440057775638 || F1: 0.33077054501964803
[[21254  1592]
 [18086  4761]]
Precision: 0.7494097276877066 || Recall: 0.20838622138573992 || F1: 0.3260958904109589
---------------------------
Overall Precision: 0.7500069332561521 (+/- 0.007566563800139268) || Overall Recall: 0.2180191528755658 (+/- 0.017009961041785204) || Overall F1: 0.33739067840383435 (+/- 0.019073117057882782)


## 7.2 Only count columns with frequencies

In [62]:
# fit SVM
svm = SVC(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(svm, X_train_one_hot_more, y_train)

[[21638  1209]
 [19519  3328]]
Precision: 0.7335243553008596 || Recall: 0.14566463868341575 || F1: 0.24306164183464796
[[21794  1053]
 [19855  2991]]
Precision: 0.7396142433234422 || Recall: 0.13092007353584872 || F1: 0.22246188174042392
[[21713  1134]
 [19744  3102]]
Precision: 0.7322946175637394 || Recall: 0.1357786921124048 || F1: 0.2290820471161657
[[21764  1082]
 [19686  3161]]
Precision: 0.7449917511194909 || Recall: 0.13835514509563618 || F1: 0.23337024732373568
[[21743  1103]
 [19790  3057]]
Precision: 0.7348557692307692 || Recall: 0.13380312513677944 || F1: 0.226385751842115
---------------------------
Overall Precision: 0.7370561473076602 (+/- 0.0046798514302722775) || Overall Recall: 0.136904334912817 (+/- 0.005010862807076712) || Overall F1: 0.23087231397141764 (+/- 0.007055339116657541)


## 7.3 Scaled counts and frequencies

In [None]:
# fit SVM
svm = SVC(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(svm, X_train_one_hot_more_scaled, y_train)

[[17190  5657]
 [13206  9641]]
Precision: 0.6302130997516016 || Recall: 0.42198100407055633 || F1: 0.5054922008126885
[[17232  5615]
 [13043  9803]]
Precision: 0.6358152808405759 || Recall: 0.4290904315853979 || F1: 0.5123876228308593


## 7.4 Scaled counts

In [None]:
# fit SVM
svm = SVC(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(svm, X_train_count_scaled, y_train)