In [43]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import xgboost as xgb

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
from helper import util_visualizations, util_ml

# 0. Load Data

## 0.1 Training

In [2]:
# read data
train = pd.read_csv('/path/to/9_FINAL/data/machine_learning/count_based/train/downsampled_training_data.csv', sep=";")
del train['Unnamed: 0']

In [3]:
# get label
y_train = train['label']

In [4]:
# get variables
X_train = train.copy()
del X_train['label']
X_train.shape

(10824, 65)

In [5]:
X_train

Unnamed: 0,instance,class,frequency,pidspread,pldspread,id,pids,p1,p10,p11,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,a,bacteria,1,1,1,472415879,['p1'],1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,bee,insect,5163,40,2496,214216573,['p7p6p5p4p2p1p43p26p15ap25p24p20cp20bp20dp27a...,401,7,4,...,49,0,1,239,3,2,46,5,28,1
2,dubai,camp,2,1,1,199041901,"['p8a', 'p8a']",0,0,0,...,0,0,0,0,0,0,2,0,0,0
3,gruiformes,species,1,1,1,220552798,['p5'],0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,gabardine,material,17,8,14,411222195,"['p5p10p8ap28bp20a', 'p5', 'p8b', 'p10', 'p3a'...",0,2,0,...,0,0,0,4,0,0,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10819,server,cosmetic,1,1,1,105572888,['p5'],0,0,0,...,0,0,0,1,0,0,0,0,0,0
10820,sturgeon,fish,657,28,413,377939019,['p5p4p2p1p10p8ap8cp23dp25p20bp28ap20dp20ap28b...,59,4,0,...,6,0,1,69,1,0,89,6,26,4
10821,chocolatier,artisan,28,3,18,138575884,"['p1', 'p1', 'p1', 'p1', 'p3a', 'p5', 'p1', 'p...",10,0,0,...,0,0,0,1,0,0,0,0,0,0
10822,ballad,work,159,19,128,490515111,"['p5p2p1p8ap3a', 'p23c', 'p8a', 'p8ap3a', 'p3a...",13,5,0,...,1,0,1,16,0,0,25,1,0,0


In [7]:
# only take one hot encoded columns
X_train_one_hot = X_train.drop(['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], axis=1)
X_train_one_hot.head()

Unnamed: 0,p1,p10,p11,p12a,p12b,p12c,p13,p14,p15a,p15b,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,401,7,4,0,0,0,0,1,5,1,...,49,0,1,239,3,2,46,5,28,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,2,0,0,0,0,0,0,0,0,...,0,0,0,4,0,0,3,1,1,0


In [8]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_train_one_hot_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_train_one_hot))
X_train_one_hot_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.000447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.179258,0.010234,0.285714,0.0,0.0,0.0,0.0,0.014925,0.02551,0.1,...,0.027857,0.0,0.008621,0.047591,0.044118,0.083333,0.003794,0.00336,0.082353,0.016393
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000165,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000199,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.002924,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000796,0.0,0.0,0.000247,0.000672,0.002941,0.0


In [9]:
# only take one hot encoded columns
X_train_one_hot_more = X_train.drop(['instance', 'class', 'id', 'pids'], axis=1) 
X_train_one_hot_more.head()

Unnamed: 0,frequency,pidspread,pldspread,p1,p10,p11,p12a,p12b,p12c,p13,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5163,40,2496,401,7,4,0,0,0,0,...,49,0,1,239,3,2,46,5,28,1
2,2,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
3,1,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,17,8,14,0,2,0,0,0,0,0,...,0,0,0,4,0,0,3,1,1,0


In [10]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_train_one_hot_more_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_train_one_hot_more))
X_train_one_hot_more_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.0,0.0,0.0,0.000447,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.139994,0.78,0.179872,0.179258,0.010234,0.285714,0.0,0.0,0.0,0.0,...,0.027857,0.0,0.008621,0.047591,0.044118,0.083333,0.003794,0.00336,0.082353,0.016393
2,2.7e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000165,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000199,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000434,0.14,0.000937,0.0,0.002924,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000796,0.0,0.0,0.000247,0.000672,0.002941,0.0


## 0.2 Testing

In [11]:
# read data
test = pd.read_csv('/path/to/9_FINAL/data/machine_learning/count_based/test/test_data.csv', sep=";")
del test['Unnamed: 0']

In [12]:
# get label
y_test = test['label']

In [13]:
# get variables
X_test = test.copy()
del X_test['label']
X_test.shape

(348121, 65)

In [14]:
# only take one hot encoded columns
X_test_one_hot = X_test.drop(['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], axis=1)
X_test_one_hot.head()

Unnamed: 0,p1,p10,p11,p12a,p12b,p12c,p13,p14,p15a,p15b,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,1,8,0,0,0,0,0,0,0,0,...,0,0,1,29,0,0,25,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [15]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_test_one_hot_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_test_one_hot))
X_test_one_hot_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000303,0.0,0.001439,0.0
3,0.000278,0.011628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.008264,0.009979,0.0,0.0,0.007585,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000344,0.0,0.0,0.000303,0.0,0.0,0.0


In [16]:
# only take one hot encoded columns
X_test_one_hot_more = X_test.drop(['instance', 'class',  'id', 'pids'], axis=1)
X_test_one_hot_more.head()

Unnamed: 0,frequency,pidspread,pldspread,p1,p10,p11,p12a,p12b,p12c,p13,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,3,2,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,134,10,93,1,8,0,0,0,0,0,...,0,0,1,29,0,0,25,0,0,0
4,3,2,3,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [17]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_test_one_hot_more_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_test_one_hot_more))
X_test_one_hot_more_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,3.1e-05,0.020833,0.000132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.5e-05,0.020833,0.000132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000303,0.0,0.001439,0.0
3,0.002055,0.1875,0.012168,0.000278,0.011628,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.008264,0.009979,0.0,0.0,0.007585,0.0,0.0,0.0
4,3.1e-05,0.020833,0.000265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000344,0.0,0.0,0.000303,0.0,0.0,0.0


# 1. Naive Bayes

## 1.1 Only count columns

In [18]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_confusion_matrix(nb, X_train_one_hot, y_train)

[[380 135 207]
 [319 261 141]
 [304 106 312]]
Macro: 0.4396395146196956 || Micro: 0.4401847575057737
[[324 178 219]
 [159 416 147]
 [118 305 299]]
Macro: 0.47819206373839496 || Micro: 0.47990762124711317
[[336 199 186]
 [148 456 118]
 [134 333 255]]
Macro: 0.47777562066914286 || Micro: 0.4836027713625866
[[297 170 255]
 [136 228 358]
 [123 107 491]]
Macro: 0.458169688661493 || Micro: 0.469284064665127
[[361 154 207]
 [323 209 189]
 [254  96 371]]
Macro: 0.4292776537167442 || Micro: 0.43484288354898337
---------------------------
Overall Macro: 0.45661090828109413 (+/- 0.01975477727287777) || Overall Micro: 0.46156441966591677 (+/- 0.020262701474377885)


## 1.2 Count columns with frequencies

In [19]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_confusion_matrix(nb, X_train_one_hot_more, y_train)

[[132 130 460]
 [ 12  98 611]
 [  1  28 693]]
Macro: 0.3542115062206855 || Micro: 0.42632794457274825
[[131 139 451]
 [  7  91 624]
 [  2  41 679]]
Macro: 0.345348525370867 || Micro: 0.4161662817551963
[[129 185 407]
 [ 15 195 512]
 [  4 116 602]]
Macro: 0.3846237071725633 || Micro: 0.4277136258660509
[[124 209 389]
 [  6 170 546]
 [  5  74 642]]
Macro: 0.3791633340046734 || Micro: 0.4323325635103926
[[144 133 445]
 [ 10  90 621]
 [  5  27 689]]
Macro: 0.356273320233157 || Micro: 0.4265249537892791
---------------------------
Overall Macro: 0.3639240786003892 (+/- 0.015222541653157264) || Overall Micro: 0.42581307389873346 (+/- 0.0052905256076715)


## 1.3 Scaled count pids

In [20]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_confusion_matrix(nb, X_train_one_hot_scaled, y_train)

[[348 229 145]
 [ 91 568  62]
 [ 57 485 180]]
Macro: 0.487731539793663 || Micro: 0.5062355658198614
[[521  69 131]
 [595  63  64]
 [537  34 151]]
Macro: 0.2878616928979944 || Micro: 0.3394919168591224
[[506  88 127]
 [581  85  56]
 [531  49 142]]
Macro: 0.2946664903578981 || Micro: 0.33856812933025404
[[396  64 262]
 [175  71 476]
 [146  28 547]]
Macro: 0.41873269851475614 || Micro: 0.46836027713625866
[[335 261 126]
 [124 532  65]
 [ 97 459 165]]
Macro: 0.4566478733619039 || Micro: 0.47689463955637706
---------------------------
Overall Macro: 0.38912805898524316 (+/- 0.08286841532299608) || Overall Micro: 0.4259101057403747 (+/- 0.07204213140891669)


## 1.4 Scaled count encoded pids and frequencies

In [21]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_confusion_matrix(nb, X_train_one_hot_more_scaled, y_train)

[[136 469 117]
 [ 48 635  38]
 [ 64 522 136]]
Macro: 0.3633460226412155 || Micro: 0.41893764434180136
[[229 365 127]
 [392 270  60]
 [428 174 120]]
Macro: 0.28156795346219227 || Micro: 0.2859122401847575
[[242 384  95]
 [380 307  35]
 [402 235  85]]
Macro: 0.2771222694777818 || Micro: 0.2928406466512702
[[164 356 202]
 [ 64 285 373]
 [105 180 436]]
Macro: 0.3945916389865058 || Micro: 0.40877598152424943
[[139 466 117]
 [ 46 617  58]
 [ 72 507 142]]
Macro: 0.363844763288287 || Micro: 0.41497227356746763
---------------------------
Overall Macro: 0.33609452957119645 (+/- 0.0477189817966425) || Overall Micro: 0.3642877572539093 (+/- 0.06128971160106635)


# 2. Decision Trees

## 2.1 Only count columns

In [22]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(dt, X_train_one_hot, y_train)

[[389 211 122]
 [116 447 158]
 [ 87 335 300]]
Macro: 0.5248338867212008 || Micro: 0.5247113163972287
[[382 187 152]
 [103 424 195]
 [ 71 350 301]]
Macro: 0.5138518071770889 || Micro: 0.5113163972286374
[[382 198 141]
 [116 449 157]
 [ 93 357 272]]
Macro: 0.5078826014673514 || Micro: 0.5094688221709007
[[396 200 126]
 [ 97 478 147]
 [ 97 330 294]]
Macro: 0.537593810595055 || Micro: 0.5394919168591225
[[402 182 138]
 [126 419 176]
 [114 326 281]]
Macro: 0.5083298775376192 || Micro: 0.5092421441774492
---------------------------
Overall Macro: 0.518498396699663 (+/- 0.011335413095285506) || Overall Micro: 0.5188461193666677 (+/- 0.011811149376039233)


## 2.2 Count columns with frequencies

In [24]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(dt, X_train_one_hot_more, y_train)

[[389 192 141]
 [116 408 197]
 [ 83 312 327]]
Macro: 0.5217027379203434 || Micro: 0.5191685912240185
[[380 207 134]
 [109 433 180]
 [ 91 333 298]]
Macro: 0.5139523640350608 || Micro: 0.5131639722863741
[[386 192 143]
 [123 422 177]
 [107 322 293]]
Macro: 0.5084701573908809 || Micro: 0.5085450346420324
[[404 200 118]
 [107 453 162]
 [ 91 342 288]]
Macro: 0.5282648213208261 || Micro: 0.5288683602771362
[[391 177 154]
 [143 391 187]
 [112 305 304]]
Macro: 0.5024407688977396 || Micro: 0.5018484288354899
---------------------------
Overall Macro: 0.5149661699129702 (+/- 0.009190724992329998) || Overall Micro: 0.5143188774530103 (+/- 0.00922377720440237)


## 2.3 Scaled count pids

In [25]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(dt, X_train_one_hot_scaled, y_train)

[[388 211 123]
 [116 447 158]
 [ 88 335 299]]
Macro: 0.5238144972793632 || Micro: 0.5237875288683603
[[382 187 152]
 [103 423 196]
 [ 72 348 302]]
Macro: 0.513871247135116 || Micro: 0.5113163972286374
[[380 200 141]
 [117 448 157]
 [ 91 356 275]]
Macro: 0.5081418835082525 || Micro: 0.5094688221709007
[[395 201 126]
 [ 98 478 146]
 [ 97 332 292]]
Macro: 0.5360844671803148 || Micro: 0.5381062355658198
[[402 181 139]
 [125 420 176]
 [114 326 281]]
Macro: 0.508770474591517 || Micro: 0.5097042513863216
---------------------------
Overall Macro: 0.5181365139389127 (+/- 0.010584671480778246) || Overall Micro: 0.518476647044008 (+/- 0.011161462477883105)


## 2.4 Scaled count encoded pids and frequencies

In [26]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(dt, X_train_one_hot_more_scaled, y_train)

[[389 193 140]
 [116 409 196]
 [ 84 313 325]]
Macro: 0.5211442971287504 || Micro: 0.5187066974595843
[[381 205 135]
 [109 434 179]
 [ 90 332 300]]
Macro: 0.5158348172742903 || Micro: 0.5150115473441108
[[386 192 143]
 [123 421 178]
 [107 322 293]]
Macro: 0.5080607084145925 || Micro: 0.5080831408775982
[[404 199 119]
 [107 453 162]
 [ 91 342 288]]
Macro: 0.5282518530674225 || Micro: 0.5288683602771362
[[389 178 155]
 [143 392 186]
 [111 306 304]]
Macro: 0.501993004743495 || Micro: 0.5013863216266173
---------------------------
Overall Macro: 0.5150569361257101 (+/- 0.00928849182059582) || Overall Micro: 0.5144112135170094 (+/- 0.009354282308491578)


# 3. Random Forest

## 3.1 Only count encoded columns

In [27]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(rf, X_train_one_hot, y_train)

[[476 140 106]
 [118 447 156]
 [ 94 307 321]]
Macro: 0.5735638998233575 || Micro: 0.5745958429561201
[[474 132 115]
 [120 414 188]
 [ 77 333 312]]
Macro: 0.5549759256081829 || Micro: 0.5542725173210161
[[472 137 112]
 [115 437 170]
 [ 91 329 302]]
Macro: 0.5583649008781859 || Micro: 0.5593533487297921
[[469 145 108]
 [115 450 157]
 [ 86 318 317]]
Macro: 0.570292915137527 || Micro: 0.5709006928406467
[[462 162  98]
 [135 424 162]
 [106 328 287]]
Macro: 0.5399189582983206 || Micro: 0.5420517560073937
---------------------------
Overall Macro: 0.5594233199491148 (+/- 0.011995197284732114) || Overall Micro: 0.5602348315709939 (+/- 0.011721345356174931)


## 3.2 Only count columns with frequencies

In [28]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(rf, X_train_one_hot_more, y_train)

[[476 124 122]
 [125 413 183]
 [ 96 282 344]]
Macro: 0.5696940397820761 || Micro: 0.5695150115473441
[[480 135 106]
 [129 421 172]
 [ 87 320 315]]
Macro: 0.56116124607698 || Micro: 0.561662817551963
[[479 135 107]
 [129 424 169]
 [ 97 309 316]]
Macro: 0.5620389049350221 || Micro: 0.5630484988452656
[[475 147 100]
 [120 447 155]
 [101 316 304]]
Macro: 0.5642506015099975 || Micro: 0.5662817551963049
[[459 137 126]
 [149 386 186]
 [110 296 315]]
Macro: 0.535385917247935 || Micro: 0.5360443622920518
---------------------------
Overall Macro: 0.5585061419104023 (+/- 0.011935180401190782) || Overall Micro: 0.559310489086586 (+/- 0.011946126167163828)


## 3.3 Scaled count pids

In [29]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(rf, X_train_one_hot_scaled, y_train)

[[475 141 106]
 [120 446 155]
 [ 94 305 323]]
Macro: 0.5736427676131253 || Micro: 0.5745958429561201
[[476 132 113]
 [119 415 188]
 [ 77 334 311]]
Macro: 0.5558210386870753 || Micro: 0.5551963048498846
[[472 136 113]
 [117 435 170]
 [ 92 327 303]]
Macro: 0.557887870813486 || Micro: 0.558891454965358
[[470 144 108]
 [117 448 157]
 [ 85 318 318]]
Macro: 0.5703560590279305 || Micro: 0.5709006928406467
[[463 161  98]
 [137 420 164]
 [105 328 288]]
Macro: 0.5391458986157106 || Micro: 0.5411275415896488
---------------------------
Overall Macro: 0.5593707269514656 (+/- 0.012232836921902687) || Overall Micro: 0.5601423674403316 (+/- 0.011935128312716382)


## 3.4 Scaled count pids and frequencies

In [30]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(rf, X_train_one_hot_more_scaled, y_train)

[[473 127 122]
 [124 412 185]
 [ 96 284 342]]
Macro: 0.5670453004406936 || Micro: 0.566743648960739
[[479 135 107]
 [130 419 173]
 [ 86 319 317]]
Macro: 0.5608708979820802 || Micro: 0.5612009237875288
[[475 136 110]
 [132 422 168]
 [ 98 310 314]]
Macro: 0.5583155844424889 || Micro: 0.5593533487297921
[[478 148  96]
 [123 444 155]
 [100 314 307]]
Macro: 0.5657790642348479 || Micro: 0.5676674364896074
[[462 138 122]
 [149 387 185]
 [109 299 313]]
Macro: 0.5361877939710742 || Micro: 0.5369685767097967
---------------------------
Overall Macro: 0.5576397282142369 (+/- 0.011187459964141793) || Overall Micro: 0.5583867869354928 (+/- 0.011167461092402242)


# 4. Neural Network

## 4.1 Only count encoded columns

In [31]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(nnet, X_train_one_hot, y_train)

[[438 160 124]
 [ 76 500 145]
 [ 68 303 351]]
Macro: 0.5962344067005004 || Micro: 0.5953810623556582
[[437 139 145]
 [ 79 447 196]
 [ 63 320 339]]
Macro: 0.5683475353090189 || Micro: 0.5648960739030023
[[444 148 129]
 [105 466 151]
 [ 75 324 323]]
Macro: 0.5697386670903295 || Micro: 0.5695150115473441




[[433 177 112]
 [ 85 487 150]
 [ 66 311 344]]
Macro: 0.5851700497857794 || Micro: 0.5838337182448037
[[417 184 121]
 [ 99 462 160]
 [ 71 328 322]]
Macro: 0.5562217185012462 || Micro: 0.5549907578558225
---------------------------
Overall Macro: 0.575142475477375 (+/- 0.013991348832452084) || Overall Micro: 0.5737233247813261 (+/- 0.014267564436774868)


## 4.2 Only count columns with frequencies

In [32]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(nnet, X_train_one_hot_more, y_train)

[[429 182 111]
 [ 72 510 139]
 [ 66 321 335]]
Macro: 0.5888306325751601 || Micro: 0.5884526558891455
[[394 193 134]
 [ 79 459 184]
 [ 74 330 318]]
Macro: 0.5428397621301868 || Micro: 0.540877598152425
[[447 119 155]
 [132 296 294]
 [104 160 458]]
Macro: 0.5518327449877947 || Micro: 0.5547344110854503
[[376 258  88]
 [ 79 514 129]
 [ 50 349 322]]
Macro: 0.5605914216163531 || Micro: 0.5598152424942263
[[451 103 168]
 [133 250 338]
 [ 61 150 510]]
Macro: 0.5518517131217284 || Micro: 0.5596118299445472
---------------------------
Overall Macro: 0.5591892548862446 (+/- 0.015848287057271256) || Overall Micro: 0.5606983475131588 (+/- 0.015496403913589184)


## 4.3 Scaled count pids

In [33]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(nnet, X_train_one_hot_scaled, y_train)



[[425 226  71]
 [ 71 602  48]
 [ 62 487 173]]
Macro: 0.5322136596328545 || Micro: 0.5542725173210161




[[412  82 227]
 [ 85 162 475]
 [ 66  86 570]]
Macro: 0.5071481615324793 || Micro: 0.5284064665127021




[[395 233  93]
 [ 76 563  83]
 [ 44 467 211]]
Macro: 0.5289786592667468 || Micro: 0.5399538106235566




[[392 242  88]
 [ 87 563  72]
 [ 57 471 193]]
Macro: 0.5153930357128955 || Micro: 0.5302540415704388
[[397 225 100]
 [ 87 523 111]
 [ 60 416 245]]
Macro: 0.5327973389998674 || Micro: 0.538354898336414
---------------------------
Overall Macro: 0.5233061710289687 (+/- 0.010249894896115028) || Overall Micro: 0.5382483468728255 (+/- 0.009170229273719044)




## 4.4 Scaled count pids and frequencies

In [34]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(nnet, X_train_one_hot_more_scaled, y_train)



[[454 196  72]
 [103 528  90]
 [ 87 421 214]]
Macro: 0.5401435097635502 || Micro: 0.5524249422632794




[[451 124 146]
 [108 296 318]
 [ 77 217 428]]
Macro: 0.5435584418462914 || Micro: 0.5427251732101617




[[444  84 193]
 [121 177 424]
 [ 86  96 540]]
Macro: 0.516695231185476 || Micro: 0.5362586605080831




[[405 243  74]
 [ 93 555  74]
 [ 53 470 198]]
Macro: 0.5217383961457419 || Micro: 0.5348729792147806
[[421 101 200]
 [111 219 391]
 [ 68 119 534]]
Macro: 0.5310160620890827 || Micro: 0.5425138632162662
---------------------------
Overall Macro: 0.5306303282060284 (+/- 0.010305716093580521) || Overall Micro: 0.5417591236825142 (+/- 0.006211870092254716)




# 5. Logistic Regression

## 5.1 Only count encoded columns

In [35]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_confusion_matrix(log_reg, X_train_one_hot, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[286 182 254]
 [ 29 288 404]
 [ 12 155 555]]
Macro: 0.5156197507206637 || Micro: 0.5214780600461894


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[295 169 257]
 [ 40 245 437]
 [ 20 171 531]]
Macro: 0.489562014711126 || Micro: 0.49468822170900695


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[294 191 236]
 [ 38 306 378]
 [ 16 228 478]]
Macro: 0.5000009742312096 || Micro: 0.49792147806004616


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[277 184 261]
 [ 36 228 458]
 [ 22  98 601]]
Macro: 0.49439391614698486 || Micro: 0.5108545034642032
[[290 166 266]
 [ 38 199 484]
 [ 19  78 624]]
Macro: 0.49339719967636037 || Micro: 0.5143253234750462
---------------------------
Overall Macro: 0.4985947710972689 (+/- 0.009144178009543438) || Overall Micro: 0.5078535173508983 (+/- 0.010084530426649032)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5.2 Only count columns with frequencies

In [36]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_confusion_matrix(log_reg, X_train_one_hot_more, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[403  83 236]
 [ 87 225 409]
 [ 45 105 572]]
Macro: 0.5426764892830737 || Micro: 0.5542725173210161


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[383  60 278]
 [ 87 166 469]
 [ 39 117 566]]
Macro: 0.49692222436983685 || Micro: 0.5150115473441108


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[394  85 242]
 [ 81 229 412]
 [ 48 157 517]]
Macro: 0.5211898533562315 || Micro: 0.5265588914549654


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[357 116 249]
 [ 63 219 440]
 [ 30  92 599]]
Macro: 0.5289108789539738 || Micro: 0.5427251732101617
[[412  97 213]
 [110 190 421]
 [ 59 100 562]]
Macro: 0.5205599667288682 || Micro: 0.5378927911275416
---------------------------
Overall Macro: 0.5220518825383967 (+/- 0.01487876918272319) || Overall Micro: 0.5352921840915591 (+/- 0.01348955270461129)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5.3 Scaled count pids

In [37]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_confusion_matrix(log_reg, X_train_one_hot_scaled, y_train)

[[252 101 369]
 [ 38  47 636]
 [ 30  29 663]]
Macro: 0.3810579987813895 || Micro: 0.4443418013856813
[[251  96 374]
 [ 41  52 629]
 [ 39  25 658]]
Macro: 0.3818774994486538 || Micro: 0.4438799076212471
[[233 138 350]
 [ 35 103 584]
 [ 21  79 622]]
Macro: 0.40172531324977184 || Micro: 0.44249422632794455
[[265  86 371]
 [ 34  58 630]
 [ 35  23 663]]
Macro: 0.3961174905666809 || Micro: 0.45542725173210163
[[257 108 357]
 [ 52  51 618]
 [ 36  38 647]]
Macro: 0.3817063231902785 || Micro: 0.44131238447319776
---------------------------
Overall Macro: 0.3884969250473549 (+/- 0.008698620190544279) || Overall Micro: 0.4454911143080345 (+/- 0.005081079701967805)


## 5.4 Scaled count pids and frequencies

In [38]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_confusion_matrix(log_reg, X_train_one_hot_more_scaled, y_train)

[[410 123 189]
 [ 84 113 524]
 [ 77  93 552]]
Macro: 0.4683445459605582 || Micro: 0.49653579676674364
[[405 121 195]
 [ 98 111 513]
 [ 74  75 573]]
Macro: 0.47064073584048033 || Micro: 0.5030023094688222
[[401 103 217]
 [ 92 118 512]
 [ 59 120 543]]
Macro: 0.4655516424791575 || Micro: 0.49053117782909933
[[395 114 213]
 [103 119 500]
 [ 74  95 552]]
Macro: 0.4643559839017377 || Micro: 0.492378752886836
[[394 135 193]
 [107 135 479]
 [ 79 110 532]]
Macro: 0.4677272048705643 || Micro: 0.49029574861367836
---------------------------
Overall Macro: 0.4673240226104996 (+/- 0.0021979624928310368) || Overall Micro: 0.4945487571130359 (+/- 0.00478223414227024)


# 6. XGBoost

## 6.1 Only count encoded columns

In [40]:
# fit xgboost
xgb_class = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(xgb_class, X_train_one_hot, y_train)

[[451 156 115]
 [ 87 477 157]
 [ 67 306 349]]
Macro: 0.5913864833233496 || Micro: 0.589838337182448
[[449 144 128]
 [ 91 467 164]
 [ 63 336 323]]
Macro: 0.5736785166567274 || Micro: 0.5722863741339492
[[445 158 118]
 [104 477 141]
 [ 78 338 306]]
Macro: 0.5661979075518636 || Micro: 0.5672055427251732
[[433 170 119]
 [ 95 466 161]
 [ 67 303 351]]
Macro: 0.5792978616637142 || Micro: 0.5773672055427251
[[439 178 105]
 [106 448 167]
 [ 76 319 326]]
Macro: 0.5619630794900085 || Micro: 0.5605360443622921
---------------------------
Overall Macro: 0.5745047697371326 (+/- 0.010343564488090814) || Overall Micro: 0.5734467007893176 (+/- 0.009909990539567409)


## 6.2 Only count columns with frequencies

In [44]:
# fit xgboost
xgb_class = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(xgb_class, X_train_one_hot_more, y_train)

[[456 138 128]
 [ 92 449 180]
 [ 72 285 365]]
Macro: 0.5888654873468456 || Micro: 0.5866050808314087
[[451 147 123]
 [101 448 173]
 [ 65 323 334]]
Macro: 0.5715216039575154 || Micro: 0.5695150115473441
[[450 144 127]
 [112 461 149]
 [ 79 316 327]]
Macro: 0.5718488170791762 || Micro: 0.571824480369515
[[455 164 103]
 [ 99 479 144]
 [ 70 322 329]]
Macro: 0.5837579410852752 || Micro: 0.5833718244803695
[[445 158 119]
 [114 431 176]
 [ 83 312 326]]
Macro: 0.5565924529419598 || Micro: 0.555452865064695
---------------------------
Overall Macro: 0.5745172604821545 (+/- 0.011212984132077133) || Overall Micro: 0.5733538524586665 (+/- 0.01107678513850328)


## 6.3 Scaled count pids

In [45]:
# fit xgboost
xgb_class = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(xgb_class, X_train_one_hot_scaled, y_train)

[[451 156 115]
 [ 87 477 157]
 [ 67 306 349]]
Macro: 0.5913864833233496 || Micro: 0.589838337182448
[[449 144 128]
 [ 91 467 164]
 [ 63 336 323]]
Macro: 0.5736785166567274 || Micro: 0.5722863741339492
[[445 158 118]
 [104 477 141]
 [ 78 338 306]]
Macro: 0.5661979075518636 || Micro: 0.5672055427251732
[[433 170 119]
 [ 95 466 161]
 [ 67 303 351]]
Macro: 0.5792978616637142 || Micro: 0.5773672055427251
[[439 178 105]
 [106 448 167]
 [ 76 319 326]]
Macro: 0.5619630794900085 || Micro: 0.5605360443622921
---------------------------
Overall Macro: 0.5745047697371326 (+/- 0.010343564488090814) || Overall Micro: 0.5734467007893176 (+/- 0.009909990539567409)


## 6.4 Scaled count pids and frequencies

In [46]:
# fit xgboost
xgb_class = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(xgb_class, X_train_one_hot_more_scaled, y_train)

[[456 138 128]
 [ 92 449 180]
 [ 72 285 365]]
Macro: 0.5888654873468456 || Micro: 0.5866050808314087
[[451 147 123]
 [101 448 173]
 [ 65 323 334]]
Macro: 0.5715216039575154 || Micro: 0.5695150115473441
[[450 144 127]
 [112 461 149]
 [ 79 316 327]]
Macro: 0.5718488170791762 || Micro: 0.571824480369515
[[455 164 103]
 [ 99 479 144]
 [ 70 322 329]]
Macro: 0.5837579410852752 || Micro: 0.5833718244803695
[[445 158 119]
 [114 431 176]
 [ 83 312 326]]
Macro: 0.5565924529419598 || Micro: 0.555452865064695
---------------------------
Overall Macro: 0.5745172604821545 (+/- 0.011212984132077133) || Overall Micro: 0.5733538524586665 (+/- 0.01107678513850328)
