In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score, KFold # for cross validation
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics # for f1 macro in cross validation

import xgboost as xgb

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
from helper import util_visualizations, util_ml

# 0. Load Data

## 0.1 Training

In [2]:
# read data
train = pd.read_csv('/path/to/9_FINAL/data/machine_learning/one_hot_encoded/train/upsampled_training_data.csv', sep=";")
del train['Unnamed: 0']

In [3]:
# get label
y_train = train['label']

In [4]:
# get variables
X_train = train.copy()
del X_train['label']
X_train.shape

(331875, 65)

In [5]:
X_train

Unnamed: 0,instance,class,frequency,pidspread,pldspread,id,pids,p1,p10,p11,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,glomalin,biomolecule,2,1,1,82829299,['p5'],0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,endocrinologist,anesthesiologist,1,1,1,187218903,['p1'],1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,election,event,1657,37,1039,362225264,['p6p5p4p2p1p8dp10p12ap8ap8cp8bp43p16p15ap23dp...,1,1,0,...,1,0,1,1,1,0,1,1,1,1
3,econometrician,person,1,1,1,54195916,['p8a'],0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,restaurant,place,13738,44,5915,475994031,['p7p6p5p4p2p1p12ap12cp43p26p15ap25p20cp20bp20...,1,1,1,...,1,0,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331870,dejene,runner,1,1,1,397559175,['p8a'],0,0,0,...,0,0,0,0,0,0,1,0,0,0
331871,unless,suite,1,1,1,113361288,['p21a'],0,0,0,...,0,0,0,0,0,0,0,0,0,0
331872,port,place,1017,32,586,87144960,['p6p5p4p1p10p8ap8cp8bp43p26p34p23dp25p20cp20a...,1,1,0,...,1,0,1,1,1,1,1,1,1,0
331873,youngberry,berry,3,3,3,473121725,"['p4p28b', 'p1']",1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [6]:
# only take one hot encoded columns
X_train_one_hot = X_train.drop(['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], axis=1)
X_train_one_hot.head()

Unnamed: 0,p1,p10,p11,p12a,p12b,p12c,p13,p14,p15a,p15b,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,1,0,0,0,0,1,1,...,1,0,1,1,1,0,1,1,1,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1,1,1,1,0,1,0,1,1,0,...,1,0,1,1,1,1,1,1,1,1


In [7]:
# only take one hot encoded columns
X_train_one_hot_more = X_train.drop(['instance', 'class', 'id', 'pids'], axis=1) 
X_train_one_hot_more.head()

Unnamed: 0,frequency,pidspread,pldspread,p1,p10,p11,p12a,p12b,p12c,p13,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,2,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1657,37,1039,1,1,0,1,0,0,0,...,1,0,1,1,1,0,1,1,1,1
3,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,13738,44,5915,1,1,1,1,0,1,0,...,1,0,1,1,1,1,1,1,1,1


In [8]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_train_one_hot_more_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_train_one_hot_more))
X_train_one_hot_more_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,1.5e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.025538,0.72,0.05334,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.211847,0.86,0.303905,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# only take one hot encoded columns
X_train_one_hot_frequency = X_train[['frequency', 'pidspread', 'pldspread']]
X_train_one_hot_frequency.head()

Unnamed: 0,frequency,pidspread,pldspread
0,2,1,1
1,1,1,1
2,1657,37,1039
3,1,1,1
4,13738,44,5915


In [10]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_train_one_hot_frequency_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_train_one_hot_frequency))
X_train_one_hot_frequency_scaled.head()

Unnamed: 0,0,1,2
0,1.5e-05,0.0,0.0
1,0.0,0.0,0.0
2,0.025538,0.72,0.05334
3,0.0,0.0,0.0
4,0.211847,0.86,0.303905


## 0.2 Testing

In [11]:
# read test data
test = pd.read_csv('/path/to/9_FINAL/data/machine_learning/one_hot_encoded/test/test_data.csv', sep=";")
del test['Unnamed: 0']

In [12]:
# get label
y_test = test['label']

In [13]:
# get variables
X_test = test.copy()
del X_test['label']
X_test.shape

(348121, 65)

In [14]:
# only take one hot encoded columns
X_test_one_hot = X_test.drop(['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], axis=1)
X_test_one_hot.head()

Unnamed: 0,p1,p10,p11,p12a,p12b,p12c,p13,p14,p15a,p15b,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [15]:
# only take one hot encoded columns
X_test_one_hot_more = X_test.drop(['instance', 'class',  'id', 'pids'], axis=1)
X_test_one_hot_more.head()

Unnamed: 0,frequency,pidspread,pldspread,p1,p10,p11,p12a,p12b,p12c,p13,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,3,2,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,134,10,93,1,1,0,0,0,0,0,...,0,0,1,1,0,0,1,0,0,0
4,3,2,3,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [16]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_test_one_hot_more_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_test_one_hot_more))
X_test_one_hot_more_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,3.1e-05,0.020833,0.000132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.5e-05,0.020833,0.000132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.002055,0.1875,0.012168,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,3.1e-05,0.020833,0.000265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [17]:
# only take one hot encoded columns
X_test_one_hot_frequency = X_test[['frequency', 'pidspread', 'pldspread']]
X_test_one_hot_frequency.head()

Unnamed: 0,frequency,pidspread,pldspread
0,3,2,2
1,1,1,1
2,2,2,2
3,134,10,93
4,3,2,3


In [18]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_test_one_hot_frequency_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_test_one_hot_frequency))
X_test_one_hot_frequency_scaled.head()

Unnamed: 0,0,1,2
0,3.1e-05,0.020833,0.000132
1,0.0,0.0,0.0
2,1.5e-05,0.020833,0.000132
3,0.002055,0.1875,0.012168
4,3.1e-05,0.020833,0.000265


# 1. Naive Bayes

## 1.1 Only one hot encoded columns

In [19]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_confusion_matrix(nb, X_train_one_hot, y_train)

[[ 9787  7711  4627]
 [ 2500 15166  4459]
 [ 3511  9823  8791]]
Macro: 0.5029768086561331 || Micro: 0.5083841807909605
[[ 9863  7597  4665]
 [ 2536 15164  4425]
 [ 3664 10038  8423]]
Macro: 0.49790903768790273 || Micro: 0.503954802259887
[[ 9670  7779  4676]
 [ 2370 15045  4710]
 [ 3207 10041  8877]]
Macro: 0.501425382107478 || Micro: 0.5060941619585687
[[ 9864  7587  4674]
 [ 2471 15195  4459]
 [ 3375 10048  8702]]
Macro: 0.5033184689014804 || Micro: 0.5086403013182674
[[ 9752  7741  4632]
 [ 2545 15038  4542]
 [ 3643  9981  8501]]
Macro: 0.49585159369360093 || Micro: 0.5015593220338983
---------------------------
Overall Macro: 0.5002962582093191 (+/- 0.002934208357232934) || Overall Micro: 0.5057265536723164 (+/- 0.002690463368783801)


## 1.2 Only one hot encoded columns with frequencies

In [20]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_confusion_matrix(nb, X_train_one_hot_more, y_train)

[[ 3450  3536 15139]
 [  382  1298 20445]
 [  130   511 21484]]
Macro: 0.300525286893329 || Micro: 0.39520903954802256
[[ 3467  3631 15027]
 [  374  1346 20405]
 [  114   527 21484]]
Macro: 0.302308230967024 || Micro: 0.39618832391713754
[[ 3451  3569 15105]
 [  392  1243 20490]
 [  127   514 21484]]
Macro: 0.2991847771783367 || Micro: 0.3943954802259887
[[ 3557  3612 14956]
 [  399  1326 20400]
 [  122   534 21469]]
Macro: 0.30381874557423366 || Micro: 0.3970169491525424
[[ 3484  3510 15131]
 [  366  1189 20570]
 [  139   482 21504]]
Macro: 0.29868585533354214 || Micro: 0.39438041431261767
---------------------------
Overall Macro: 0.3009045791892931 (+/- 0.001922858350632753) || Overall Micro: 0.39543804143126177 (+/- 0.001030904869742153)


## 1.3 Only with frequencies

In [21]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_confusion_matrix(nb, X_train_one_hot_frequency, y_train)

[[ 3016  3465 15644]
 [  345  1022 20758]
 [  117   458 21550]]
Macro: 0.2831124283496078 || Micro: 0.3855065913370998
[[ 3029  3629 15467]
 [  341  1038 20746]
 [  110   475 21540]]
Macro: 0.28401762349089976 || Micro: 0.3857928436911488
[[ 3055  3427 15643]
 [  352   977 20796]
 [  111   467 21547]]
Macro: 0.28286045687073263 || Micro: 0.38537099811676084
[[ 3100  3602 15423]
 [  362  1034 20729]
 [  114   490 21521]]
Macro: 0.28550327501465994 || Micro: 0.3865160075329566
[[ 3090  3425 15610]
 [  320   931 20874]
 [  138   421 21566]]
Macro: 0.2826357302870683 || Micro: 0.38549152542372883
---------------------------
Overall Macro: 0.2836259028025937 (+/- 0.0010495691134231415) || Overall Micro: 0.38573559322033896 (+/- 0.000414068752061486)


## 1.4 Scaled one hot encoded pids and frequencies

In [22]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_confusion_matrix(nb, X_train_one_hot_more_scaled, y_train)

[[ 9792  7724  4609]
 [ 2491 15168  4466]
 [ 3496  9825  8804]]
Macro: 0.5033274598589564 || Micro: 0.5086854990583805
[[ 9809  7609  4707]
 [ 2476 15165  4484]
 [ 3423 10042  8660]]
Macro: 0.5012829035363654 || Micro: 0.5067269303201507
[[ 9661  7816  4648]
 [ 2367 15048  4710]
 [ 3206 10041  8878]]
Macro: 0.5013431076120302 || Micro: 0.5060188323917137
[[ 9806  7592  4727]
 [ 2358 15187  4580]
 [ 3147 10038  8940]]
Macro: 0.5065331663511936 || Micro: 0.5112316384180791
[[ 9670  7718  4737]
 [ 2380 15024  4721]
 [ 3177  9967  8981]]
Macro: 0.5028548115930411 || Micro: 0.5073446327683616
---------------------------
Overall Macro: 0.5030682897903173 (+/- 0.0019121699334201812) || Overall Micro: 0.5080015065913371 (+/- 0.0018378586553561378)


## 1.5 Frequencies normalized

In [27]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_confusion_matrix(nb, X_train_one_hot_frequency_scaled, y_train)

[[ 2433 14020  5858]
 [  185 19829  1976]
 [   46 19795  2233]]
Macro: 0.28604199672593233 || Micro: 0.36903954802259886
[[ 3079   999 18042]
 [  356   130 21779]
 [  108    53 21829]]
Macro: 0.25767344252504176 || Micro: 0.37722033898305085
[[ 6946 13185  1901]
 [12225  9410   455]
 [13467  7963   823]]
Macro: 0.2254321791100435 || Micro: 0.25881732580037664
[[ 2774  2026 17346]
 [  378   359 21428]
 [  139   105 21820]]
Macro: 0.2583959798338964 || Micro: 0.375939736346516
[[ 9055  9442  3519]
 [15697  5405  1013]
 [16881  3953  1410]]
Macro: 0.21626172209230676 || Micro: 0.2390960451977401
---------------------------
Overall Macro: 0.24876106405744416 (+/- 0.025149847449467248) || Overall Micro: 0.3240225988700565 (+/- 0.06167035200585484)


# 2. Decision Trees

## 2.1 Only one hot encoded columns

In [23]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(dt, X_train_one_hot, y_train)

[[16158  4020  1947]
 [ 1318 15933  4874]
 [ 1065 10842 10218]]
Macro: 0.6395429000127781 || Micro: 0.6374237288135594
[[16409  3713  2003]
 [ 1441 15746  4938]
 [ 1151 11039  9935]]
Macro: 0.6353021125894841 || Micro: 0.6341242937853108
[[16357  3794  1974]
 [ 1429 15509  5187]
 [ 1161 10963 10001]]
Macro: 0.6325590364281758 || Micro: 0.6307645951035782
[[16359  3795  1971]
 [ 1364 15751  5010]
 [ 1183 11005  9937]]
Macro: 0.6348196189881493 || Micro: 0.6334764595103578
[[16399  3808  1918]
 [ 1517 15649  4959]
 [ 1154 10980  9991]]
Macro: 0.6346882398788629 || Micro: 0.6333559322033898
---------------------------
Overall Macro: 0.63538238157949 (+/- 0.00228412648610791) || Overall Micro: 0.6338290018832391 (+/- 0.002132975807124874)


## 2.2 Only one hot encoded columns with frequencies

In [24]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(dt, X_train_one_hot_more, y_train)

[[17599  2956  1570]
 [ 1076 15617  5432]
 [  853 10884 10388]]
Macro: 0.6587751658643349 || Micro: 0.6569340866290019
[[17640  2860  1625]
 [  996 15586  5543]
 [  764 11106 10255]]
Macro: 0.6571424521349695 || Micro: 0.6550809792843691
[[17618  2864  1643]
 [  978 15530  5617]
 [  762 11099 10264]]
Macro: 0.6562931557363901 || Micro: 0.6540414312617703
[[17636  2852  1637]
 [  973 15537  5615]
 [  852 11064 10209]]
Macro: 0.6554748783402307 || Micro: 0.6535894538606403
[[17727  2836  1562]
 [ 1042 15489  5594]
 [  784 11065 10276]]
Macro: 0.6571860320519455 || Micro: 0.65524670433145
---------------------------
Overall Macro: 0.6569743368255742 (+/- 0.0010980388224918183) || Overall Micro: 0.6549785310734463 (+/- 0.0011588185037592499)


## 2.3 Only with frequencies

In [25]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(dt, X_train_one_hot_frequency, y_train)

[[14819  2582  4724]
 [ 1979  7652 12494]
 [ 1627  5444 15054]]
Macro: 0.5630740412878196 || Micro: 0.5653483992467043
[[14976  2627  4522]
 [ 1984  7505 12636]
 [ 1660  5481 14984]]
Macro: 0.5616944703714081 || Micro: 0.5644444444444444
[[14954  2485  4686]
 [ 2084  7532 12509]
 [ 1722  5446 14957]]
Macro: 0.5611390892797372 || Micro: 0.5641129943502825
[[14897  2544  4684]
 [ 1928  7504 12693]
 [ 1665  5394 15066]]
Macro: 0.5617542207204406 || Micro: 0.5644745762711865
[[14881  2588  4656]
 [ 1958  7441 12726]
 [ 1611  5442 15072]]
Macro: 0.5605487238762286 || Micro: 0.5633747645951036
---------------------------
Overall Macro: 0.5616421091071268 (+/- 0.0008383627730086719) || Overall Micro: 0.5643510357815443 (+/- 0.0006369443405813906)


## 2.4 Scaled one hot encoded pids and frequencies

In [26]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(dt, X_train_one_hot_more_scaled, y_train)

[[17599  2956  1570]
 [ 1077 15613  5435]
 [  850 10892 10383]]
Macro: 0.6586458575120114 || Micro: 0.6567984934086629
[[17640  2860  1625]
 [  997 15585  5543]
 [  765 11105 10255]]
Macro: 0.6571236936107154 || Micro: 0.6550659133709981
[[17618  2864  1643]
 [  980 15532  5613]
 [  762 11093 10270]]
Macro: 0.656413057962379 || Micro: 0.6541619585687383
[[17636  2852  1637]
 [  970 15542  5613]
 [  852 11066 10207]]
Macro: 0.6555163315878625 || Micro: 0.6536346516007533
[[17727  2836  1562]
 [ 1041 15491  5593]
 [  787 11056 10282]]
Macro: 0.6573047720046635 || Micro: 0.655367231638418
---------------------------
Overall Macro: 0.6570007425355264 (+/- 0.0010356450347401057) || Overall Micro: 0.6550056497175142 (+/- 0.0010899644292390747)


## 2.5 Frequencies normalized

In [27]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(dt, X_train_one_hot_frequency_scaled, y_train)

[[14819  2582  4724]
 [ 1973  7656 12496]
 [ 1625  5443 15057]]
Macro: 0.5632017505210434 || Micro: 0.5654538606403013
[[14976  2627  4522]
 [ 1987  7509 12629]
 [ 1664  5478 14983]]
Macro: 0.5617343818157754 || Micro: 0.5644896421845574
[[14954  2485  4686]
 [ 2083  7527 12515]
 [ 1726  5447 14952]]
Macro: 0.5609819290833067 || Micro: 0.5639623352165725
[[14897  2544  4684]
 [ 1926  7510 12689]
 [ 1665  5393 15067]]
Macro: 0.5618773408530301 || Micro: 0.5645800376647834
[[14881  2588  4656]
 [ 1956  7445 12724]
 [ 1608  5439 15078]]
Macro: 0.5607060668671727 || Micro: 0.5635254237288135
---------------------------
Overall Macro: 0.5617002938280657 (+/- 0.0008707944645117427) || Overall Micro: 0.5644022598870057 (+/- 0.0006492272502749616)


# 3. Random Forest

## 3.1 Only one hot encoded columns

In [28]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(rf, X_train_one_hot, y_train)

[[16158  4020  1947]
 [ 1123 16225  4777]
 [  916 10607 10602]]
Macro: 0.6504850589105517 || Micro: 0.647608286252354
[[16387  3725  2013]
 [ 1226 16038  4861]
 [  992 10776 10357]]
Macro: 0.6466858310661608 || Micro: 0.6445499058380414
[[16346  3794  1985]
 [ 1212 15858  5055]
 [  964 10695 10466]]
Macro: 0.6456332139271375 || Micro: 0.6428625235404897
[[16352  3795  1978]
 [ 1180 16064  4881]
 [ 1011 10764 10350]]
Macro: 0.6465057754816393 || Micro: 0.6443088512241054
[[16395  3812  1918]
 [ 1282 15970  4873]
 [  960 10726 10439]]
Macro: 0.6472578963317758 || Micro: 0.6448813559322034
---------------------------
Overall Macro: 0.647313555143453 (+/- 0.0016692289328033912) || Overall Micro: 0.6448421845574388 (+/- 0.0015454308149832432)


## 3.2 Only one hot encoded columns with frequencies

In [29]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(rf, X_train_one_hot_more, y_train)

[[17596  2979  1550]
 [  813 16171  5141]
 [  687 10578 10860]]
Macro: 0.6749661920958349 || Micro: 0.6723465160075329
[[17635  2860  1630]
 [  759 16104  5262]
 [  626 10718 10781]]
Macro: 0.6735634555685045 || Micro: 0.6707344632768362
[[17609  2848  1668]
 [  722 15984  5419]
 [  618 10792 10715]]
Macro: 0.6706362185754889 || Micro: 0.6675404896421846
[[17631  2855  1639]
 [  732 16060  5333]
 [  671 10665 10789]]
Macro: 0.6729782324599514 || Micro: 0.6701318267419962
[[17677  2857  1591]
 [  800 16041  5284]
 [  574 10749 10802]]
Macro: 0.6736907802951767 || Micro: 0.6707344632768362
---------------------------
Overall Macro: 0.6731669757989913 (+/- 0.0014219073678004525) || Overall Micro: 0.6702975517890772 (+/- 0.0015626192202735393)


## 3.3 Only with frequencies

In [30]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(rf, X_train_one_hot_frequency, y_train)

[[14805  2577  4743]
 [ 1992  7673 12460]
 [ 1615  5361 15149]]
Macro: 0.5644241604021085 || Micro: 0.5668851224105461
[[14835  2641  4649]
 [ 1939  7598 12588]
 [ 1537  5435 15153]]
Macro: 0.5638979028211896 || Micro: 0.5662674199623352
[[14893  2515  4717]
 [ 2076  7554 12495]
 [ 1698  5374 15053]]
Macro: 0.5619546096548045 || Micro: 0.5649717514124294
[[14854  2587  4684]
 [ 1913  7512 12700]
 [ 1636  5327 15162]]
Macro: 0.5625944338632674 || Micro: 0.5653935969868173
[[14851  2530  4744]
 [ 1943  7430 12752]
 [ 1558  5319 15248]]
Macro: 0.5622932293972855 || Micro: 0.5654086629001883
---------------------------
Overall Macro: 0.563032867227731 (+/- 0.0009576962570558639) || Overall Micro: 0.5657853107344633 (+/- 0.0006927044177821421)


## 3.4 Scaled one hot encoded pids and frequencies

In [31]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(rf, X_train_one_hot_more_scaled, y_train)

[[17596  2979  1550]
 [  811 16169  5145]
 [  688 10572 10865]]
Macro: 0.6750220711826125 || Micro: 0.672391713747646
[[17635  2860  1630]
 [  761 16107  5257]
 [  624 10718 10783]]
Macro: 0.6736371781654694 || Micro: 0.6708097928436911
[[17609  2848  1668]
 [  722 15973  5430]
 [  614 10789 10722]]
Macro: 0.6706142370724711 || Micro: 0.6674802259887006
[[17631  2855  1639]
 [  735 16069  5321]
 [  670 10672 10783]]
Macro: 0.6729973916241647 || Micro: 0.6701770244821093
[[17677  2857  1591]
 [  802 16038  5285]
 [  577 10742 10806]]
Macro: 0.6737015623517734 || Micro: 0.6707495291902071
---------------------------
Overall Macro: 0.6731944880792982 (+/- 0.0014484268086844146) || Overall Micro: 0.6703216572504708 (+/- 0.0016000709302232971)


## 3.5 Frequencies normalized

In [32]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(rf, X_train_one_hot_frequency_scaled, y_train)

[[14805  2577  4743]
 [ 1986  7679 12460]
 [ 1615  5369 15141]]
Macro: 0.5644452052817351 || Micro: 0.5668549905838042
[[14835  2641  4649]
 [ 1937  7605 12583]
 [ 1539  5433 15153]]
Macro: 0.5640206201254973 || Micro: 0.5663728813559322
[[14893  2515  4717]
 [ 2071  7561 12493]
 [ 1697  5376 15052]]
Macro: 0.5620804185984477 || Micro: 0.5650621468926553
[[14854  2587  4684]
 [ 1910  7513 12702]
 [ 1634  5323 15168]]
Macro: 0.5626990965588666 || Micro: 0.5654990583804144
[[14851  2530  4744]
 [ 1944  7423 12758]
 [ 1560  5313 15252]]
Macro: 0.5622111697350447 || Micro: 0.5653634651600753
---------------------------
Overall Macro: 0.5630913020599182 (+/- 0.0009640569939661027) || Overall Micro: 0.5658305084745763 (+/- 0.0006726217477409517)


# 4. Neural Network

## 4.1 Only one hot encoded columns

In [33]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(nnet, X_train_one_hot, y_train)



[[15705  4475  1945]
 [ 1143 16422  4560]
 [ 1020 10612 10493]]
Macro: 0.6447236661247663 || Micro: 0.6421092278719397




[[16325  3917  1883]
 [ 1537 15919  4669]
 [ 1351 10539 10235]]
Macro: 0.640775691681232 || Micro: 0.639984934086629




[[15894  4105  2126]
 [ 1315 15958  4852]
 [  930 10658 10537]]
Macro: 0.6416892055369937 || Micro: 0.6386290018832391




[[16372  3875  1878]
 [ 1584 15975  4566]
 [ 1386 10625 10114]]
Macro: 0.639990482331512 || Micro: 0.639713747645951




[[15729  4147  2249]
 [ 1236 15671  5218]
 [  826 10300 10999]]
Macro: 0.6433250262422159 || Micro: 0.6387796610169492
---------------------------
Overall Macro: 0.642100814383344 (+/- 0.0017179592148779397) || Overall Micro: 0.6398433145009416 (+/- 0.0012471664643784375)


## 4.2 Only one hot encoded columns with frequencies

In [34]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(nnet, X_train_one_hot_more, y_train)



[[16523  3826  1776]
 [ 2110 15608  4407]
 [ 1415 10369 10341]]
Macro: 0.6399485864067874 || Micro: 0.639879472693032
[[16326  3915  1884]
 [ 2033 15468  4624]
 [ 1330 10440 10355]]
Macro: 0.6358648212232884 || Micro: 0.6350131826741996




[[16696  3628  1801]
 [ 2505 14761  4859]
 [ 1722 10078 10325]]
Macro: 0.6290737326802597 || Micro: 0.6294839924670433
[[16410  3843  1872]
 [ 2111 15324  4690]
 [ 1432 10223 10470]]
Macro: 0.6365612784379812 || Micro: 0.6358418079096045
[[15250  4306  2569]
 [ 3536 13959  4630]
 [ 2391  9690 10044]]
Macro: 0.5906952166606901 || Micro: 0.5913822975517891
---------------------------
Overall Macro: 0.6264287270818013 (+/- 0.018211946010416237) || Overall Micro: 0.6263201506591336 (+/- 0.017780604326992)


## 4.3 Only with frequencies

In [35]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(nnet, X_train_one_hot_frequency, y_train)

[[13329  3576  5220]
 [ 3916  6493 11716]
 [ 2684  4994 14447]]
Macro: 0.50769731939546 || Micro: 0.5162937853107344
[[13967  3601  4557]
 [ 4395  6217 11513]
 [ 3103  4955 14067]]
Macro: 0.5053814047598079 || Micro: 0.5160225988700565
[[13916  3323  4886]
 [ 4373  6069 11683]
 [ 3232  4608 14285]]
Macro: 0.5043153744953494 || Micro: 0.5163088512241054
[[13373  3442  5310]
 [ 3847  6195 12083]
 [ 2735  4709 14681]]
Macro: 0.5056883809407525 || Micro: 0.5159924670433145
[[13361  3668  5096]
 [ 3828  6297 12000]
 [ 2711  4896 14518]]
Macro: 0.505560684542201 || Micro: 0.5148926553672316
---------------------------
Overall Macro: 0.5057286328267142 (+/- 0.001097580984545039) || Overall Micro: 0.5159020715630885 (+/- 0.0005216375338758338)


## 4.4 Scaled one hot encoded pids and frequencies

In [36]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(nnet, X_train_one_hot_more_scaled, y_train)



[[16066  4083  1976]
 [ 1434 16089  4602]
 [ 1156 10405 10564]]
Macro: 0.6455136512496349 || Micro: 0.6436007532956686




[[16454  3600  2071]
 [ 1857 15192  5076]
 [ 1399 10109 10617]]
Macro: 0.6381341415272982 || Micro: 0.6367306967984934




[[16081  3975  2069]
 [ 1360 15793  4972]
 [ 1096 10509 10520]]
Macro: 0.6412948805071546 || Micro: 0.6387043314500942




[[15561  4400  2164]
 [ 1075 16149  4901]
 [  824 10659 10642]]
Macro: 0.6420474757765838 || Micro: 0.6380715630885122
[[17287  3181  1657]
 [ 2768 14736  4621]
 [ 2347  9675 10103]]
Macro: 0.6313391042040658 || Micro: 0.6346666666666667
---------------------------
Overall Macro: 0.6396658506529475 (+/- 0.004779194232998117) || Overall Micro: 0.6383548022598869 (+/- 0.0029652136535680764)




## 4.5 Frequencies normalized

In [37]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(nnet, X_train_one_hot_frequency_scaled, y_train)

[[13974  2114  6037]
 [ 4484  3274 14367]
 [ 3312  2305 16508]]
Macro: 0.47184754258200007 || Micro: 0.5085649717514125
[[14536  3037  4552]
 [ 4924  4476 12725]
 [ 3723  3721 14681]]
Macro: 0.4843041099547077 || Micro: 0.5076158192090395
[[14194  4213  3718]
 [ 4725  6584 10816]
 [ 3630  5853 12642]]
Macro: 0.4959659357911403 || Micro: 0.5035028248587571
[[13949  1354  6822]
 [ 4236  1979 15910]
 [ 3090  1311 17724]]
Macro: 0.45236751934827035 || Micro: 0.5069981167608286
[[14403  4858  2864]
 [ 4723  8326  9076]
 [ 3799  7326 11000]]
Macro: 0.5060592416996376 || Micro: 0.5081581920903955
---------------------------
Overall Macro: 0.4821088698751512 (+/- 0.018763264291258446) || Overall Micro: 0.5069679849340866 (+/- 0.001810794935495775)


# 5. Logistic Regression

## 5.1 Only one hot encoded columns

In [38]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_confusion_matrix(log_reg, X_train_one_hot, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[13713  5005  3407]
 [ 3692 13660  4773]
 [ 2483  9225 10417]]
Macro: 0.5702164719042462 || Micro: 0.5693408662900188


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[13896  4878  3351]
 [ 3766 13591  4768]
 [ 2519  9340 10266]]
Macro: 0.5693748221395779 || Micro: 0.5687834274952919


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[13692  4972  3461]
 [ 3777 13445  4903]
 [ 2635  9214 10276]]
Macro: 0.5643532127708527 || Micro: 0.5636610169491525


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[13705  4962  3458]
 [ 3678 13601  4846]
 [ 2494  9308 10323]]
Macro: 0.5677656467931936 || Micro: 0.5669152542372882
[[13782  5008  3335]
 [ 3866 13400  4859]
 [ 2537  9227 10361]]
Macro: 0.5664842549905044 || Micro: 0.5656195856873822
---------------------------
Overall Macro: 0.5676388817196749 (+/- 0.002088172071927255) || Overall Micro: 0.5668640301318266 (+/- 0.002079873213735806)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5.2 Only one hot encoded columns with frequencies

In [39]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_confusion_matrix(log_reg, X_train_one_hot_more, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11827  6326  3972]
 [ 2406 14733  4986]
 [ 1706 10218 10201]]
Macro: 0.5557969223408928 || Micro: 0.5538380414312618


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[12155  7732  2238]
 [ 2520 16145  3460]
 [ 1981 12588  7556]]
Macro: 0.5350391137765506 || Micro: 0.5402033898305085


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11430  7787  2908]
 [ 2251 15965  3909]
 [ 1658 12148  8319]]
Macro: 0.5356638960616394 || Micro: 0.5380640301318267


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11626  7944  2555]
 [ 2221 16271  3633]
 [ 1751 12500  7874]]
Macro: 0.5348769764451782 || Micro: 0.5389227871939737
[[11828  7831  2466]
 [ 2340 16114  3671]
 [ 1877 12312  7936]]
Macro: 0.5367501517625918 || Micro: 0.5405348399246704
---------------------------
Overall Macro: 0.5396254120773706 (+/- 0.008112462613547394) || Overall Micro: 0.5423126177024482 (+/- 0.005830714799971811)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5.3 Only with frequencies

In [40]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_confusion_matrix(log_reg, X_train_one_hot_frequency, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[12197  5974  3954]
 [ 2935  7623 11567]
 [ 2262  6701 13162]]
Macro: 0.49825353788267734 || Micro: 0.4969039548022599
[[12415  5871  3839]
 [ 3023  7598 11504]
 [ 2178  6729 13218]]
Macro: 0.5018026542728498 || Micro: 0.5006553672316384


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[12247  6198  3680]
 [ 2909  8074 11142]
 [ 2240  7100 12785]]
Macro: 0.501723878746676 || Micro: 0.49877212806026366


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[12296  6036  3793]
 [ 2885  7583 11657]
 [ 2192  6865 13068]]
Macro: 0.49821038865520845 || Micro: 0.49637664783427493
[[12309  9816     0]
 [ 2901 19221     3]
 [ 2268 19849     8]]
Macro: 0.38789837564061375 || Micro: 0.47514877589453863
---------------------------
Overall Macro: 0.47757776703960503 (+/- 0.0448675064794449) || Overall Micro: 0.4935713747645951 (+/- 0.009333794297556083)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5.4 Scaled one hot encoded pids and frequencies

In [41]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_confusion_matrix(log_reg, X_train_one_hot_more_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[13720  5005  3400]
 [ 3691 13661  4773]
 [ 2484  9224 10417]]
Macro: 0.5703339303051748 || Micro: 0.5694613935969868


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[13879  4896  3350]
 [ 3765 13591  4769]
 [ 2520  9338 10267]]
Macro: 0.5691490154170724 || Micro: 0.5685423728813559


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[13684  4972  3469]
 [ 3772 13450  4903]
 [ 2633  9215 10277]]
Macro: 0.5643282000069204 || Micro: 0.5636308851224106


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[13711  4971  3443]
 [ 3676 13606  4843]
 [ 2490  9317 10318]]
Macro: 0.567857650657729 || Micro: 0.5670056497175141
[[13795  4988  3342]
 [ 3865 13401  4859]
 [ 2535  9229 10361]]
Macro: 0.5666832748549141 || Micro: 0.5658305084745763
---------------------------
Overall Macro: 0.5676704142483622 (+/- 0.0020717372850723864) || Overall Micro: 0.5668941619585687 (+/- 0.0020542658783662834)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5.5 Frequencies normalized

In [42]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_confusion_matrix(log_reg, X_train_one_hot_frequency_scaled, y_train)

[[12197  3887  6041]
 [ 2935  3857 15333]
 [ 2263  3171 16691]]
Macro: 0.46844735389672226 || Micro: 0.4933333333333334
[[12415  3724  5986]
 [ 3024  3782 15319]
 [ 2179  3183 16763]]
Macro: 0.4707501800159725 || Micro: 0.49657250470809794
[[12247  3873  6005]
 [ 2909  3963 15253]
 [ 2240  3242 16643]]
Macro: 0.47100383602936685 || Micro: 0.49496045197740113
[[12296  3810  6019]
 [ 2886  3775 15464]
 [ 2192  3213 16720]]
Macro: 0.468741382994613 || Micro: 0.4940263653483992
[[12309  3912  5904]
 [ 2903  3886 15336]
 [ 2271  3190 16664]]
Macro: 0.47048362208498135 || Micro: 0.4950508474576271
---------------------------
Overall Macro: 0.46988527500433125 (+/- 0.0010708281576815313) || Overall Micro: 0.49478870056497176 (+/- 0.001093565317974913)


# 6. XGBoost

## 6.1 Only one hot encoded columns

In [20]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(xg, X_train_one_hot, y_train)

[[14896  4674  2555]
 [ 2284 15267  4574]
 [ 1628  9986 10511]]
Macro: 0.6143616905352153 || Micro: 0.6127909604519775
[[15080  4436  2609]
 [ 2382 15129  4614]
 [ 1616 10154 10355]]
Macro: 0.6124147027130493 || Micro: 0.6111337099811676
[[14994  4521  2610]
 [ 2456 14949  4720]
 [ 1670 10134 10321]]
Macro: 0.6079999242708163 || Micro: 0.6066139359698681
[[15006  4488  2631]
 [ 2198 15287  4640]
 [ 1599 10200 10326]]
Macro: 0.6133447121125338 || Micro: 0.6119623352165725
[[15139  4508  2478]
 [ 2489 15026  4610]
 [ 1708 10128 10289]]
Macro: 0.6105041232195602 || Micro: 0.6094764595103578
---------------------------
Overall Macro: 0.611725030570235 (+/- 0.002254685052723252) || Overall Micro: 0.6103954802259887 (+/- 0.00218554429297643)


## 6.2 Only one hot encoded columns with frequencies

In [22]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(xg, X_train_one_hot_more, y_train)

[[15801  4028  2296]
 [ 2224 15223  4678]
 [ 1562  9908 10655]]
Macro: 0.6291149443018684 || Micro: 0.6279322033898305
[[15978  3734  2413]
 [ 2352 15095  4678]
 [ 1567 10124 10434]]
Macro: 0.6259402641993891 || Micro: 0.6253408662900188
[[15760  3927  2438]
 [ 2419 14945  4761]
 [ 1648 10093 10384]]
Macro: 0.6197996292589966 || Micro: 0.6190433145009416
[[15867  3846  2412]
 [ 2253 15201  4671]
 [ 1541 10163 10421]]
Macro: 0.6258654894003866 || Micro: 0.6250696798493409
[[16063  3753  2309]
 [ 2469 14959  4697]
 [ 1597 10030 10498]]
Macro: 0.6260940083177626 || Micro: 0.6255367231638418
---------------------------
Overall Macro: 0.6253628670956807 (+/- 0.003038034979123631) || Overall Micro: 0.6245845574387948 (+/- 0.0029538474269806578)


## 6.3 Only with frequencies

In [23]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(xg, X_train_one_hot_frequency, y_train)

[[14012  3456  4657]
 [ 3269  7183 11673]
 [ 2375  5263 14487]]
Macro: 0.5319326203976592 || Micro: 0.5375819209039548
[[14258  3397  4470]
 [ 3539  6860 11726]
 [ 2426  5289 14410]]
Macro: 0.5280421546945703 || Micro: 0.5352617702448211
[[14147  3416  4562]
 [ 3435  7063 11627]
 [ 2514  5257 14354]]
Macro: 0.5294389560607274 || Micro: 0.5358041431261771
[[14084  3375  4666]
 [ 3234  7048 11843]
 [ 2425  5220 14480]]
Macro: 0.5304164806847768 || Micro: 0.5365273069679849
[[14203  3227  4695]
 [ 3345  6644 12136]
 [ 2515  4914 14696]]
Macro: 0.5270523332512148 || Micro: 0.535487758945386
---------------------------
Overall Macro: 0.5293765090177897 (+/- 0.0017205487811413102) || Overall Micro: 0.5361325800376647 (+/- 0.0008412339790338856)


## 6.4 Scaled one hot encoded pids and frequencies

In [24]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(xg, X_train_one_hot_more_scaled, y_train)

[[15801  4028  2296]
 [ 2224 15223  4678]
 [ 1562  9908 10655]]
Macro: 0.6291149443018684 || Micro: 0.6279322033898305
[[15978  3734  2413]
 [ 2352 15095  4678]
 [ 1567 10124 10434]]
Macro: 0.6259402641993891 || Micro: 0.6253408662900188
[[15760  3927  2438]
 [ 2419 14945  4761]
 [ 1648 10093 10384]]
Macro: 0.6197996292589966 || Micro: 0.6190433145009416
[[15867  3846  2412]
 [ 2253 15201  4671]
 [ 1541 10163 10421]]
Macro: 0.6258654894003866 || Micro: 0.6250696798493409
[[16063  3753  2309]
 [ 2469 14959  4697]
 [ 1597 10030 10498]]
Macro: 0.6260940083177626 || Micro: 0.6255367231638418
---------------------------
Overall Macro: 0.6253628670956807 (+/- 0.003038034979123631) || Overall Micro: 0.6245845574387948 (+/- 0.0029538474269806578)


## 6.5 Frequencies normalized

In [25]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_confusion_matrix(xg, X_train_one_hot_frequency_scaled, y_train)

[[14012  3456  4657]
 [ 3269  7183 11673]
 [ 2375  5263 14487]]
Macro: 0.5319326203976592 || Micro: 0.5375819209039548
[[14258  3397  4470]
 [ 3537  6864 11724]
 [ 2426  5289 14410]]
Macro: 0.5281175573756202 || Micro: 0.5353220338983051
[[14147  3416  4562]
 [ 3436  7064 11625]
 [ 2516  5257 14352]]
Macro: 0.529425890358636 || Micro: 0.535789077212806
[[14084  3375  4666]
 [ 3234  7047 11844]
 [ 2423  5220 14482]]
Macro: 0.5304276728350086 || Micro: 0.536542372881356
[[14203  3227  4695]
 [ 3344  6645 12136]
 [ 2515  4914 14696]]
Macro: 0.5270724636709253 || Micro: 0.5355028248587571
---------------------------
Overall Macro: 0.5293952409275698 (+/- 0.001704870613815671) || Overall Micro: 0.5361476459510358 (+/- 0.0008293316625799685)
