In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score, KFold # for cross validation
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics # for f1 macro in cross validation

import xgboost as xgb

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
from helper import util_visualizations, util_ml

# 0. Load Data

## 0.1 Training

In [2]:
# read data
train = pd.read_csv('/path/to/9_FINAL/data/machine_learning/one_hot_encoded/train/downsampled_training_data.csv', sep=";")
del train['Unnamed: 0']

In [3]:
# get label
y_train = train['label']

In [4]:
# get variables
X_train = train.copy()
del X_train['label']
X_train.shape

(10824, 65)

In [5]:
X_train

Unnamed: 0,instance,class,frequency,pidspread,pldspread,id,pids,p1,p10,p11,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,a,bacteria,1,1,1,472415879,['p1'],1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,bee,insect,5163,40,2496,214216573,['p7p6p5p4p2p1p43p26p15ap25p24p20cp20bp20dp27a...,1,1,1,...,1,0,1,1,1,1,1,1,1,1
2,dubai,camp,2,1,1,199041901,"['p8a', 'p8a']",0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,gruiformes,species,1,1,1,220552798,['p5'],0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,gabardine,material,17,8,14,411222195,"['p5p10p8ap28bp20a', 'p5', 'p8b', 'p10', 'p3a'...",0,1,0,...,0,0,0,1,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10819,server,cosmetic,1,1,1,105572888,['p5'],0,0,0,...,0,0,0,1,0,0,0,0,0,0
10820,sturgeon,fish,657,28,413,377939019,['p5p4p2p1p10p8ap8cp23dp25p20bp28ap20dp20ap28b...,1,1,0,...,1,0,1,1,1,0,1,1,1,1
10821,chocolatier,artisan,28,3,18,138575884,"['p1', 'p1', 'p1', 'p1', 'p3a', 'p5', 'p1', 'p...",1,0,0,...,0,0,0,1,0,0,0,0,0,0
10822,ballad,work,159,19,128,490515111,"['p5p2p1p8ap3a', 'p23c', 'p8a', 'p8ap3a', 'p3a...",1,1,0,...,1,0,1,1,0,0,1,1,0,0


In [6]:
# only take one hot encoded columns
X_train_one_hot = X_train.drop(['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], axis=1)
X_train_one_hot.head()

Unnamed: 0,p1,p10,p11,p12a,p12b,p12c,p13,p14,p15a,p15b,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,1,1,1,...,1,0,1,1,1,1,1,1,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,1,1,0


In [7]:
# only take one hot encoded columns
X_train_one_hot_more = X_train.drop(['instance', 'class', 'id', 'pids'], axis=1) 
X_train_one_hot_more.head()

Unnamed: 0,frequency,pidspread,pldspread,p1,p10,p11,p12a,p12b,p12c,p13,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5163,40,2496,1,1,1,0,0,0,0,...,1,0,1,1,1,1,1,1,1,1
2,2,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,17,8,14,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,1,1,0


In [8]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_train_one_hot_more_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_train_one_hot_more))
X_train_one_hot_more_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.139994,0.78,0.179872,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,2.7e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000434,0.14,0.000937,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0


In [9]:
# only take one hot encoded columns
X_train_one_hot_frequency = X_train[['frequency', 'pidspread', 'pldspread']]# TODO: frequency
X_train_one_hot_frequency.head()

Unnamed: 0,frequency,pidspread,pldspread
0,1,1,1
1,5163,40,2496
2,2,1,1
3,1,1,1
4,17,8,14


In [10]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_train_one_hot_frequency_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_train_one_hot_frequency))
X_train_one_hot_frequency_scaled.head()

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,0.139994,0.78,0.179872
2,2.7e-05,0.0,0.0
3,0.0,0.0,0.0
4,0.000434,0.14,0.000937


## 0.2 Testing

In [11]:
# read test data
test = pd.read_csv('/path/to/9_FINAL/data/machine_learning/one_hot_encoded/test/test_data.csv', sep=";")
del test['Unnamed: 0']

In [12]:
# get label
y_test = test['label']

In [13]:
# get variables
X_test = test.copy()
del X_test['label']
X_test.shape

(348121, 65)

In [14]:
# only take one hot encoded columns
X_test_one_hot = X_test.drop(['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], axis=1)
X_test_one_hot.head()

Unnamed: 0,p1,p10,p11,p12a,p12b,p12c,p13,p14,p15a,p15b,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [15]:
# only take one hot encoded columns
X_test_one_hot_more = X_test.drop(['instance', 'class',  'id', 'pids'], axis=1)
X_test_one_hot_more.head()

Unnamed: 0,frequency,pidspread,pldspread,p1,p10,p11,p12a,p12b,p12c,p13,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,3,2,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,134,10,93,1,1,0,0,0,0,0,...,0,0,1,1,0,0,1,0,0,0
4,3,2,3,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [16]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_test_one_hot_more_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_test_one_hot_more))
X_test_one_hot_more_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,3.1e-05,0.020833,0.000132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.5e-05,0.020833,0.000132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.002055,0.1875,0.012168,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,3.1e-05,0.020833,0.000265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [17]:
# only take one hot encoded columns
X_test_one_hot_frequency = X_test[['frequency', 'pidspread', 'pldspread']]
X_test_one_hot_frequency.head()

Unnamed: 0,frequency,pidspread,pldspread
0,3,2,2
1,1,1,1
2,2,2,2
3,134,10,93
4,3,2,3


In [18]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_test_one_hot_frequency_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_test_one_hot_frequency))
X_test_one_hot_frequency_scaled.head()

Unnamed: 0,0,1,2
0,3.1e-05,0.020833,0.000132
1,0.0,0.0,0.0
2,1.5e-05,0.020833,0.000132
3,0.002055,0.1875,0.012168
4,3.1e-05,0.020833,0.000265


# 1. Naive Bayes

## 1.1 Only one hot encoded columns

In [19]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_confusion_matrix(nb, X_train_one_hot, y_train)

[[324 244 154]
 [ 80 501 140]
 [106 313 303]]
Macro: 0.5162169229734085 || Micro: 0.5210161662817552
[[314 230 177]
 [ 81 483 158]
 [115 344 263]]
Macro: 0.4838802936285343 || Micro: 0.4896073903002309
[[307 260 154]
 [ 79 511 132]
 [106 357 259]]
Macro: 0.48915174291680197 || Micro: 0.497459584295612
[[330 267 125]
 [ 90 490 142]
 [125 304 292]]
Macro: 0.5089336648058641 || Micro: 0.5136258660508083
[[316 257 149]
 [ 93 475 153]
 [123 315 283]]
Macro: 0.49156731245234414 || Micro: 0.49630314232902034
---------------------------
Overall Macro: 0.49794998735539064 (+/- 0.012413106242158127) || Overall Micro: 0.5036024298514853 (+/- 0.011752299958773228)


## 1.2 Only one hot encoded columns with frequencies

In [20]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_confusion_matrix(nb, X_train_one_hot_more, y_train)

[[119 108 495]
 [ 12  34 675]
 [  2  17 703]]
Macro: 0.29914882523644315 || Micro: 0.39538106235565823
[[115 127 479]
 [  6  46 670]
 [  6   9 707]]
Macro: 0.30716117532604276 || Micro: 0.4009237875288684
[[120 127 474]
 [ 16  74 632]
 [  3  25 694]]
Macro: 0.32851492351232875 || Micro: 0.4101616628175519
[[126 139 457]
 [ 13  62 647]
 [  8  19 694]]
Macro: 0.3242118728412895 || Micro: 0.40739030023094686
[[125 120 477]
 [ 15  52 654]
 [  5  19 697]]
Macro: 0.3164222839818463 || Micro: 0.40388170055452866
---------------------------
Overall Macro: 0.3150918161795901 (+/- 0.01079073132936679) || Overall Micro: 0.40354770269751084 (+/- 0.005141994245958679)


## 1.3 Only with frequencies

In [21]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_confusion_matrix(nb, X_train_one_hot_frequency, y_train)

[[108  98 516]
 [ 10  26 685]
 [  0  18 704]]
Macro: 0.2844567913782739 || Micro: 0.3870669745958429
[[ 98 126 497]
 [  6  38 678]
 [  3  10 709]]
Macro: 0.2885551125492815 || Micro: 0.3903002309468822
[[107 117 497]
 [ 15  43 664]
 [  4  12 706]]
Macro: 0.29807920691772233 || Micro: 0.39538106235565823
[[116 131 475]
 [ 12  43 667]
 [  6  22 693]]
Macro: 0.3023211585737557 || Micro: 0.3935334872979215
[[116 109 497]
 [ 10  35 676]
 [  5  16 700]]
Macro: 0.2970478078166092 || Micro: 0.3932532347504621
---------------------------
Overall Macro: 0.29409201544712854 (+/- 0.0065703582476208024) || Overall Micro: 0.39190699798935336 (+/- 0.0029166580494660783)


## 1.4 Scaled one hot encoded pids and frequencies

In [22]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_confusion_matrix(nb, X_train_one_hot_more_scaled, y_train)

[[322 245 155]
 [ 79 501 141]
 [ 98 314 310]]
Macro: 0.5189697584165969 || Micro: 0.523325635103926
[[314 230 177]
 [ 81 482 159]
 [115 344 263]]
Macro: 0.4835065892770982 || Micro: 0.48914549653579675
[[307 260 154]
 [ 79 511 132]
 [106 357 259]]
Macro: 0.48915174291680197 || Micro: 0.497459584295612
[[329 268 125]
 [ 90 490 142]
 [124 304 293]]
Macro: 0.5089801900732355 || Micro: 0.5136258660508083
[[312 259 151]
 [ 90 478 153]
 [117 317 287]]
Macro: 0.49297061850014945 || Micro: 0.49768946395563773
---------------------------
Overall Macro: 0.4987157798367764 (+/- 0.013201355417420313) || Overall Micro: 0.5042492091883561 (+/- 0.012402283009364232)


## 1.5 Frequencies normalized

In [23]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_confusion_matrix(nb, X_train_one_hot_frequency_scaled, y_train)

[[ 85 213 424]
 [ 13 465 243]
 [  3 482 237]]
Macro: 0.3308307129476383 || Micro: 0.36351039260969975
[[206  67 448]
 [392  10 320]
 [445   5 272]]
Macro: 0.189058593565974 || Micro: 0.22540415704387992
[[220  68 433]
 [396  18 308]
 [429   0 293]]
Macro: 0.20913935412559045 || Micro: 0.24526558891454966
[[100  51 571]
 [ 16  11 695]
 [ 12   5 704]]
Macro: 0.26213436035185866 || Micro: 0.3764434180138569
[[ 87 206 429]
 [  8 380 333]
 [  5 424 292]]
Macro: 0.32658182913072414 || Micro: 0.35073937153419593
---------------------------
Overall Macro: 0.2635489700243571 (+/- 0.058328719968135934) || Overall Micro: 0.31227258562323645 (+/- 0.06365369385129427)


# 2. Decision Trees

## 2.1 Only one hot encoded columns

In [24]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(dt, X_train_one_hot, y_train)

[[392 200 130]
 [113 456 152]
 [ 80 335 307]]
Macro: 0.533634011055515 || Micro: 0.5334872979214781
[[392 195 134]
 [114 436 172]
 [ 80 347 295]]
Macro: 0.5195814381747486 || Micro: 0.5187066974595843
[[382 206 133]
 [102 475 145]
 [ 79 363 280]]
Macro: 0.5234848070570332 || Micro: 0.5251732101616629
[[384 212 126]
 [ 88 483 151]
 [ 78 332 311]]
Macro: 0.5437536935872119 || Micro: 0.5441108545034642
[[362 208 152]
 [113 435 173]
 [ 96 336 289]]
Macro: 0.501553878588028 || Micro: 0.5018484288354899
---------------------------
Overall Macro: 0.5244015656925073 (+/- 0.014183326712314622) || Overall Micro: 0.524665297776336 (+/- 0.014229053123652035)


## 2.2 Only one hot encoded columns with frequencies

In [25]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(dt, X_train_one_hot_more, y_train)

[[390 184 148]
 [116 421 184]
 [ 85 315 322]]
Macro: 0.5250614763232201 || Micro: 0.523325635103926
[[387 192 142]
 [129 416 177]
 [ 93 337 292]]
Macro: 0.5063881981879659 || Micro: 0.5057736720554272
[[388 190 143]
 [134 419 169]
 [107 314 301]]
Macro: 0.5117245456013485 || Micro: 0.5117782909930716
[[377 217 128]
 [119 453 150]
 [ 85 362 274]]
Macro: 0.5085591468466265 || Micro: 0.5099307159353349
[[390 198 134]
 [127 408 186]
 [110 312 299]]
Macro: 0.5074464289496561 || Micro: 0.5069316081330869
---------------------------
Overall Macro: 0.5118359591817634 (+/- 0.006849967755087564) || Overall Micro: 0.5115479844441693 (+/- 0.006261553599670071)


## 2.3 Only with frequencies

In [26]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(dt, X_train_one_hot_frequency, y_train)

[[327 219 176]
 [112 253 356]
 [ 88 181 453]]
Macro: 0.4742141460136881 || Micro: 0.4771362586605081
[[348 177 196]
 [110 256 356]
 [ 76 218 428]]
Macro: 0.47680847959746303 || Micro: 0.4766743648960739
[[363 186 172]
 [116 234 372]
 [ 95 190 437]]
Macro: 0.475060363609982 || Micro: 0.4775981524249423
[[340 204 178]
 [106 246 370]
 [ 82 192 447]]
Macro: 0.47522761109857886 || Micro: 0.4771362586605081
[[355 172 195]
 [126 238 357]
 [ 83 205 433]]
Macro: 0.4720023742377755 || Micro: 0.47412199630314233
---------------------------
Overall Macro: 0.4746625949114975 (+/- 0.0015722699998068814) || Overall Micro: 0.4765334061890349 (+/- 0.0012405896780541183)


## 2.4 Scaled one hot encoded pids and frequencies

In [27]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(dt, X_train_one_hot_more_scaled, y_train)

[[391 183 148]
 [116 421 184]
 [ 85 315 322]]
Macro: 0.5255224219705296 || Micro: 0.5237875288683603
[[386 192 143]
 [127 416 179]
 [ 92 337 293]]
Macro: 0.5065323270604912 || Micro: 0.5057736720554272
[[388 189 144]
 [135 418 169]
 [107 314 301]]
Macro: 0.5112711543871202 || Micro: 0.5113163972286374
[[377 217 128]
 [119 453 150]
 [ 85 361 275]]
Macro: 0.5090680201522811 || Micro: 0.5103926096997691
[[392 196 134]
 [127 408 186]
 [110 312 299]]
Macro: 0.5083507845574983 || Micro: 0.5078558225508318
---------------------------
Overall Macro: 0.5121489416255841 (+/- 0.006856893179625532) || Overall Micro: 0.5118252060806052 (+/- 0.006289442820806848)


## 2.5 Frequencies normalized

In [28]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(dt, X_train_one_hot_frequency_scaled, y_train)

[[325 221 176]
 [112 251 358]
 [ 88 181 453]]
Macro: 0.47224740234366686 || Micro: 0.47528868360277143
[[348 177 196]
 [109 257 356]
 [ 76 218 428]]
Macro: 0.4773506304088979 || Micro: 0.4771362586605081
[[364 186 171]
 [114 236 372]
 [ 95 191 436]]
Macro: 0.4762647586253575 || Micro: 0.47852193995381065
[[337 207 178]
 [104 247 371]
 [ 82 192 447]]
Macro: 0.47438405664332084 || Micro: 0.4762124711316397
[[353 172 197]
 [126 238 357]
 [ 80 208 433]]
Macro: 0.4712156201094606 || Micro: 0.4731977818853974
---------------------------
Overall Macro: 0.47429249362614073 (+/- 0.0023194810879419232) || Overall Micro: 0.4760714270468254 (+/- 0.0017904913917507917)


# 3. Random Forest

## 3.1 Only one hot encoded columns

In [29]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(rf, X_train_one_hot, y_train)

[[462 153 107]
 [123 444 154]
 [ 83 312 327]]
Macro: 0.5695900395779475 || Micro: 0.5695150115473441
[[460 140 121]
 [112 421 189]
 [ 89 321 312]]
Macro: 0.5516417504924891 || Micro: 0.5510392609699769
[[458 136 127]
 [115 449 158]
 [ 82 329 311]]
Macro: 0.5621748640956835 || Micro: 0.5625866050808314
[[446 174 102]
 [110 461 151]
 [ 81 315 325]]
Macro: 0.5693950974715651 || Micro: 0.5690531177829099
[[438 187  97]
 [124 443 154]
 [ 98 328 295]]
Macro: 0.5424086505347846 || Micro: 0.5434380776340111
---------------------------
Overall Macro: 0.559042080434494 (+/- 0.01058011919411825) || Overall Micro: 0.5591264146030147 (+/- 0.010295253968442023)


## 3.2 Only one hot encoded columns with frequencies

In [30]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(rf, X_train_one_hot_more, y_train)

[[473 133 116]
 [141 400 180]
 [ 93 281 348]]
Macro: 0.5642302918327688 || Micro: 0.563972286374134
[[474 129 118]
 [132 416 174]
 [ 95 308 319]]
Macro: 0.5578467632781211 || Micro: 0.5584295612009238
[[477 133 111]
 [143 415 164]
 [ 98 300 324]]
Macro: 0.5607200937179062 || Micro: 0.561662817551963
[[468 152 102]
 [146 434 142]
 [ 96 307 318]]
Macro: 0.5622685858261106 || Micro: 0.5635103926096998
[[463 140 119]
 [138 385 198]
 [104 290 327]]
Macro: 0.5431122558799722 || Micro: 0.5429759704251387
---------------------------
Overall Macro: 0.5576355981069758 (+/- 0.007555669843087196) || Overall Micro: 0.5581102056323719 (+/- 0.007814014649989896)


## 3.3 Only with frequencies

In [33]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(rf, X_train_one_hot_frequency, y_train)

[[378 174 170]
 [107 255 359]
 [ 80 184 458]]
Macro: 0.5019024632298744 || Micro: 0.5039260969976905
[[379 146 196]
 [114 239 369]
 [ 74 220 428]]
Macro: 0.48261522782080396 || Micro: 0.48314087759815244
[[395 163 163]
 [119 232 371]
 [ 92 186 444]]
Macro: 0.49126068431387404 || Micro: 0.49468822170900695
[[384 161 177]
 [111 250 361]
 [ 86 184 451]]
Macro: 0.49884838582596275 || Micro: 0.5011547344110855
[[389 155 178]
 [129 232 360]
 [ 83 200 438]]
Macro: 0.486334191376343 || Micro: 0.48937153419593343
---------------------------
Overall Macro: 0.4921921905133716 (+/- 0.007286925337509397) || Overall Micro: 0.49445629298237376 (+/- 0.007595965953456922)


## 3.4 Scaled one hot encoded pids and frequencies

In [34]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(rf, X_train_one_hot_more_scaled, y_train)

[[474 133 115]
 [140 401 180]
 [ 93 281 348]]
Macro: 0.5651421687542882 || Micro: 0.5648960739030023
[[473 130 118]
 [131 416 175]
 [ 95 306 321]]
Macro: 0.5584418953134768 || Micro: 0.558891454965358
[[475 134 112]
 [144 414 164]
 [100 300 322]]
Macro: 0.558329848377619 || Micro: 0.5593533487297921
[[466 153 103]
 [145 434 143]
 [ 96 307 318]]
Macro: 0.5614248952703076 || Micro: 0.5625866050808314
[[459 143 120]
 [136 385 200]
 [103 290 328]]
Macro: 0.5419952147055084 || Micro: 0.5415896487985212
---------------------------
Overall Macro: 0.55706680448424 (+/- 0.007935281173559001) || Overall Micro: 0.557463426295501 (+/- 0.00823503607808668)


## 3.5 Frequencies normalized

In [35]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(rf, X_train_one_hot_frequency_scaled, y_train)

[[380 171 171]
 [108 255 358]
 [ 81 183 458]]
Macro: 0.5027118468581019 || Micro: 0.5048498845265589
[[380 146 195]
 [112 242 368]
 [ 74 220 428]]
Macro: 0.4847128264586094 || Micro: 0.48498845265588914
[[394 164 163]
 [118 233 371]
 [ 94 186 442]]
Macro: 0.49050708532017745 || Micro: 0.49376443418013855
[[382 162 178]
 [111 249 362]
 [ 87 183 451]]
Macro: 0.49735938876584423 || Micro: 0.4997690531177829
[[388 156 178]
 [129 235 357]
 [ 83 201 437]]
Macro: 0.48706387926212313 || Micro: 0.4898336414048059
---------------------------
Overall Macro: 0.4924710053329712 (+/- 0.00666414977831259) || Overall Micro: 0.4946410931770351 (+/- 0.007036604103424382)


# 4. Neural Network

## 4.1 Only one hot encoded columns

In [36]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(nnet, X_train_one_hot, y_train)



[[409 186 127]
 [ 90 481 150]
 [ 74 320 328]]
Macro: 0.5630800785025155 || Micro: 0.5625866050808314




[[438 150 133]
 [109 414 199]
 [ 82 311 329]]
Macro: 0.5477127732921337 || Micro: 0.5454965357967667




[[397 166 158]
 [ 96 438 188]
 [ 72 310 340]]
Macro: 0.5452750732234604 || Micro: 0.5427251732101617




[[400 206 116]
 [ 99 482 141]
 [ 74 313 334]]
Macro: 0.5621320740458517 || Micro: 0.561662817551963
[[399 187 136]
 [100 451 170]
 [ 82 323 316]]
Macro: 0.5397624773844881 || Micro: 0.5388170055452866
---------------------------
Overall Macro: 0.5515924952896899 (+/- 0.00935904563946058) || Overall Micro: 0.5502576274370019 (+/- 0.009923464524608431)




## 4.2 Only one hot encoded columns with frequencies

In [38]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(nnet, X_train_one_hot_more, y_train)

[[501 133  88]
 [145 444 132]
 [112 299 311]]
Macro: 0.5764927420592897 || Micro: 0.5801385681293303
[[456 122 143]
 [118 409 195]
 [ 77 307 338]]
Macro: 0.5575433106789855 || Micro: 0.5556581986143188
[[257 318 146]
 [ 44 533 145]
 [ 21 372 329]]
Macro: 0.5103980497919498 || Micro: 0.5168591224018476
[[513 120  89]
 [160 430 132]
 [116 290 315]]
Macro: 0.5769300060052878 || Micro: 0.5810623556581986
[[466 138 118]
 [144 407 170]
 [ 88 294 339]]
Macro: 0.560366753004777 || Micro: 0.5600739371534196
---------------------------
Overall Macro: 0.5563461723080578 (+/- 0.024324500630392067) || Overall Micro: 0.558758436391423 (+/- 0.023330106631198843)


## 4.3 Only with frequencies

In [39]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(nnet, X_train_one_hot_frequency, y_train)

[[474 125 123]
 [151 219 351]
 [143 154 425]]
Macro: 0.5066400651986128 || Micro: 0.5163972286374134
[[412 149 160]
 [102 231 389]
 [ 78 221 423]]
Macro: 0.4920621599289466 || Micro: 0.492378752886836
[[452 102 167]
 [133 202 387]
 [ 92 153 477]]
Macro: 0.5111704180552583 || Micro: 0.5224018475750577
[[489  44 189]
 [203  74 445]
 [144  82 495]]
Macro: 0.4411278662454457 || Micro: 0.4886836027713626
[[404  26 292]
 [133  49 539]
 [ 68  37 616]]
Macro: 0.431601659870493 || Micro: 0.493992606284658
---------------------------
Overall Macro: 0.47652043385975124 (+/- 0.033525181416299545) || Overall Micro: 0.5027708076310655 (+/- 0.013817076681912333)


## 4.4 Scaled one hot encoded pids and frequencies

In [40]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(nnet, X_train_one_hot_more_scaled, y_train)



[[434 187 101]
 [112 463 146]
 [ 82 313 327]]
Macro: 0.5658246092489957 || Micro: 0.5653579676674365




[[417 174 130]
 [100 445 177]
 [ 81 320 321]]
Macro: 0.5478915148333582 || Micro: 0.5464203233256351




[[413 169 139]
 [110 444 168]
 [ 80 320 322]]
Macro: 0.5457022995163032 || Micro: 0.5445727482678984




[[409 193 120]
 [105 473 144]
 [ 83 307 331]]
Macro: 0.5604394670327036 || Micro: 0.5602771362586605
[[405 188 129]
 [105 435 181]
 [ 86 320 315]]
Macro: 0.5351525189481875 || Micro: 0.5337338262476895
---------------------------
Overall Macro: 0.5510020819159097 (+/- 0.010933833012340986) || Overall Micro: 0.550072400353464 (+/- 0.011386992987563824)




## 4.5 Frequencies normalized

In [41]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(nnet, X_train_one_hot_frequency_scaled, y_train)

[[478  64 180]
 [146  57 518]
 [134  54 534]]
Macro: 0.43991640831136736 || Micro: 0.49376443418013855
[[470 126 125]
 [154 183 385]
 [112 167 443]]
Macro: 0.4932085653910425 || Micro: 0.5062355658198614
[[433 152 136]
 [120 217 385]
 [ 84 212 426]]
Macro: 0.4937551125545528 || Micro: 0.4969976905311778
[[470 121 131]
 [163 186 373]
 [115 176 430]]
Macro: 0.48926897916487766 || Micro: 0.5016166281755197
[[475  58 189]
 [174  74 473]
 [137  63 521]]
Macro: 0.44627147825527763 || Micro: 0.4944547134935305
---------------------------
Overall Macro: 0.47248410873542357 (+/- 0.02413067628591187) || Overall Micro: 0.4986138064400456 (+/- 0.004701796475872791)


# 5. Logistic Regression

## 5.1 Only one hot encoded columns

In [42]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_confusion_matrix(log_reg, X_train_one_hot, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[456 152 114]
 [110 457 154]
 [ 88 283 351]]
Macro: 0.5843088772667501 || Micro: 0.5838337182448037


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[447 152 122]
 [124 438 160]
 [ 91 325 306]]
Macro: 0.5495738752972253 || Micro: 0.5501154734411086


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[446 156 119]
 [125 448 149]
 [ 87 321 314]]
Macro: 0.5574866871788088 || Micro: 0.5579676674364896


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[420 194 108]
 [100 473 149]
 [ 55 316 350]]
Macro: 0.5765318542806038 || Micro: 0.5741339491916859
[[455 170  97]
 [138 420 163]
 [ 89 298 334]]
Macro: 0.5593987774782433 || Micro: 0.5586876155268022
---------------------------
Overall Macro: 0.5654600143003263 (+/- 0.012888345318944868) || Overall Micro: 0.564947684768178 (+/- 0.012239063005370705)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5.2 Only one hot encoded columns with frequencies

In [43]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_confusion_matrix(log_reg, X_train_one_hot_more, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[406 226  90]
 [ 69 519 133]
 [ 55 378 289]]
Macro: 0.559954828914633 || Micro: 0.5607390300230947


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[391 245  85]
 [ 75 519 128]
 [ 61 411 250]]
Macro: 0.531907749961489 || Micro: 0.535796766743649


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[364 235 122]
 [ 74 518 130]
 [ 39 404 279]]
Macro: 0.5347892585756789 || Micro: 0.5362586605080831


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[381 249  92]
 [ 81 521 120]
 [ 62 383 276]]
Macro: 0.5412886660196555 || Micro: 0.5441108545034642
[[375 206 141]
 [ 86 444 191]
 [ 56 314 351]]
Macro: 0.5441099479574559 || Micro: 0.5406654343807763
---------------------------
Overall Macro: 0.5424100902857825 (+/- 0.009801440611300815) || Overall Micro: 0.5435141492318134 (+/- 0.009136387260251065)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5.3 Only with frequencies

In [44]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_confusion_matrix(log_reg, X_train_one_hot_frequency, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[413 309   0]
 [ 85 636   0]
 [ 78 644   0]]
Macro: 0.3956709956709957 || Micro: 0.48452655889145496


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[403 318   0]
 [ 99 623   0]
 [ 74 648   0]]
Macro: 0.3868648717357601 || Micro: 0.47390300230946886


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[402 184 135]
 [ 94 250 378]
 [ 59 249 414]]
Macro: 0.49602947616402454 || Micro: 0.492378752886836


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[394 328   0]
 [102 620   0]
 [ 74 647   0]]
Macro: 0.38169397191218674 || Micro: 0.46836027713625866
[[393 223 106]
 [109 321 291]
 [ 79 284 358]]
Macro: 0.5009263747557401 || Micro: 0.4953789279112754
---------------------------
Overall Macro: 0.43223713804774144 (+/- 0.05429182370021794) || Overall Micro: 0.48290950382705883 (+/- 0.010397741569360757)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5.4 Scaled one hot encoded pids and frequencies

In [45]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_confusion_matrix(log_reg, X_train_one_hot_more_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[456 152 114]
 [111 456 154]
 [ 89 282 351]]
Macro: 0.5838088925450436 || Micro: 0.5833718244803695


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[447 152 122]
 [124 438 160]
 [ 91 325 306]]
Macro: 0.5495738752972253 || Micro: 0.5501154734411086


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[446 156 119]
 [126 448 148]
 [ 87 321 314]]
Macro: 0.5574536459043163 || Micro: 0.5579676674364896


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[424 190 108]
 [104 469 149]
 [ 57 314 350]]
Macro: 0.5764382807199122 || Micro: 0.5741339491916859
[[454 170  98]
 [138 420 163]
 [ 89 298 334]]
Macro: 0.558948927341896 || Micro: 0.5582255083179297
---------------------------
Overall Macro: 0.5652447243616787 (+/- 0.012773703600202782) || Overall Micro: 0.5647628845735166 (+/- 0.012145521097614666)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5.5 Frequencies normalized

In [46]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_confusion_matrix(log_reg, X_train_one_hot_frequency_scaled, y_train)

[[413 129 180]
 [ 85 117 519]
 [ 78 110 534]]
Macro: 0.4666417971506703 || Micro: 0.4914549653579677
[[403 129 189]
 [ 99 114 509]
 [ 74  81 567]]
Macro: 0.47003897416056745 || Micro: 0.5006928406466513
[[402 109 210]
 [ 94 117 511]
 [ 59 140 523]]
Macro: 0.4590707780782742 || Micro: 0.4812933025404157
[[395 118 209]
 [102 122 498]
 [ 74 107 540]]
Macro: 0.4626711337684785 || Micro: 0.4882217090069284
[[393 140 189]
 [109 135 477]
 [ 79 119 523]]
Macro: 0.4643399254001463 || Micro: 0.4856746765249538
---------------------------
Overall Macro: 0.46455252171162736 (+/- 0.003690543683002763) || Overall Micro: 0.4894674988153834 (+/- 0.0065224734633731)


# 6. XGBoost

## 6.1 Only one hot encoded columns

In [47]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(xg, X_train_one_hot, y_train)

[[459 150 113]
 [126 435 160]
 [ 93 284 345]]
Macro: 0.572603156723123 || Micro: 0.5722863741339492
[[453 153 115]
 [119 437 166]
 [ 93 320 309]]
Macro: 0.5534184849105941 || Micro: 0.553810623556582
[[441 154 126]
 [107 470 145]
 [ 66 336 320]]
Macro: 0.5689886726024831 || Micro: 0.5685912240184757
[[427 187 108]
 [103 472 147]
 [ 69 302 350]]
Macro: 0.5784288525527338 || Micro: 0.576905311778291
[[427 186 109]
 [116 440 165]
 [ 75 312 334]]
Macro: 0.5567956720045663 || Micro: 0.5549907578558225
---------------------------
Overall Macro: 0.5660469677587001 (+/- 0.009486973852111563) || Overall Micro: 0.5653168582686241 (+/- 0.009301716660814766)


## 6.2 Only one hot encoded columns with frequencies

In [48]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(xg, X_train_one_hot_more, y_train)

[[461 134 127]
 [108 431 182]
 [ 69 273 380]]
Macro: 0.5898282675280982 || Micro: 0.5875288683602772
[[456 145 120]
 [107 438 177]
 [ 66 312 344]]
Macro: 0.5740228060068747 || Micro: 0.571824480369515
[[453 140 128]
 [125 430 167]
 [ 88 310 324]]
Macro: 0.5577529345598373 || Micro: 0.5575057736720554
[[430 184 108]
 [108 471 143]
 [ 75 317 329]]
Macro: 0.5686799454059495 || Micro: 0.5681293302540416
[[439 163 120]
 [126 423 172]
 [ 85 311 325]]
Macro: 0.5495363766521737 || Micro: 0.5485212569316081
---------------------------
Overall Macro: 0.5679640660305867 (+/- 0.013849989619088747) || Overall Micro: 0.5667019419174995 (+/- 0.013244965229575165)


## 6.3 Only with frequencies

In [49]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(xg, X_train_one_hot_frequency, y_train)

[[415 155 152]
 [110 267 344]
 [ 83 198 441]]
Macro: 0.5179720036079778 || Micro: 0.5187066974595843
[[410 144 167]
 [121 253 348]
 [ 84 223 415]]
Macro: 0.4977476476412474 || Micro: 0.49792147806004616
[[415 155 151]
 [112 258 352]
 [ 80 199 443]]
Macro: 0.5143272188845626 || Micro: 0.515473441108545
[[405 158 159]
 [109 252 361]
 [ 83 192 446]]
Macro: 0.5078385012409767 || Micro: 0.5094688221709007
[[400 138 184]
 [129 233 359]
 [100 190 431]]
Macro: 0.48806769069419514 || Micro: 0.49168207024029575
---------------------------
Overall Macro: 0.5051906124137919 (+/- 0.010976748395790515) || Overall Micro: 0.5066505018078744 (+/- 0.010309110925271)


## 5.4 Scaled one hot encoded pids and frequencies

In [50]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(xg, X_train_one_hot_more_scaled, y_train)

[[462 134 126]
 [108 431 182]
 [ 70 273 379]]
Macro: 0.5897676086885398 || Micro: 0.5875288683602772
[[455 146 120]
 [107 438 177]
 [ 66 312 344]]
Macro: 0.5735839325281199 || Micro: 0.5713625866050809
[[453 140 128]
 [125 430 167]
 [ 88 310 324]]
Macro: 0.5577529345598373 || Micro: 0.5575057736720554
[[430 184 108]
 [108 471 143]
 [ 75 317 329]]
Macro: 0.5686799454059495 || Micro: 0.5681293302540416
[[439 163 120]
 [126 423 172]
 [ 85 311 325]]
Macro: 0.5495363766521737 || Micro: 0.5485212569316081
---------------------------
Overall Macro: 0.567864159566924 (+/- 0.013793381676732821) || Overall Micro: 0.5666095631646126 (+/- 0.013210481135862186)


## 6.5 Frequencies normalized

In [51]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(xg, X_train_one_hot_frequency_scaled, y_train)

[[415 155 152]
 [110 267 344]
 [ 84 198 440]]
Macro: 0.5175205091083818 || Micro: 0.5182448036951501
[[410 144 167]
 [121 253 348]
 [ 84 223 415]]
Macro: 0.4977476476412474 || Micro: 0.49792147806004616
[[415 155 151]
 [112 258 352]
 [ 80 199 443]]
Macro: 0.5143272188845626 || Micro: 0.515473441108545
[[405 158 159]
 [109 252 361]
 [ 83 192 446]]
Macro: 0.5078385012409767 || Micro: 0.5094688221709007
[[401 138 183]
 [129 233 359]
 [100 190 431]]
Macro: 0.48851486254527837 || Micro: 0.4921441774491682
---------------------------
Overall Macro: 0.5051897478840893 (+/- 0.010733066664170366) || Overall Micro: 0.5066505444967621 (+/- 0.010068210623604195)
