In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold # for cross validation
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics # for f1 macro in cross validation

import xgboost as xgb

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
from helper import util_visualizations, util_ml

# 0. Load Data

## 0.1 Training

In [2]:
# read data
train = pd.read_csv('/path/to/9_FINAL/data/machine_learning/two_class/one-hot-ecoding/train/one_hot_train_up.csv', sep=";")
del train['Unnamed: 0']

In [3]:
# get label
y_train = train['label']

In [4]:
##### get variables
X_train = train.copy()
del X_train['label']
X_train.shape

(1142330, 65)

In [5]:
X_train

Unnamed: 0,instance,class,frequency,pidspread,pldspread,id,pids,p1,p10,p11,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,lattimer,name,2,2,2,377124302,"['p21a', 'p8a']",0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,bethphage,place,7,3,7,14506363,"['p8a', 'p8b', 'p10']",0,1,0,...,0,0,0,0,0,0,1,1,0,0
2,winter,convention,8,4,7,21547293,"['p2p8a', 'p2', 'p8b', 'p5', 'p5', 'p5']",0,0,0,...,0,0,0,1,0,0,1,1,0,0
3,sheepskin,sheep,2,2,2,17107854,"['p8a', 'p3a']",0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,hexapetalum,species,1,1,1,139090198,['p5'],0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1142325,agon,company,2,2,2,10250288,"['p8a', 'p5']",0,0,0,...,0,0,0,1,0,0,1,0,0,0
1142326,geoda,software,4,2,4,295062544,"['p5', 'p5', 'p5', 'p8a']",0,0,0,...,0,0,0,1,0,0,1,0,0,0
1142327,haute-marne,department,4,1,4,313309813,"['p8a', 'p8a']",0,0,0,...,0,0,0,0,0,0,1,0,0,0
1142328,cannabis,product,96,10,78,345295809,"['p8a', 'p5p8a', 'p5p4p15ap8a', 'p1p8ap3a', 'p...",1,0,0,...,1,0,0,1,0,0,1,1,0,0


In [6]:
# only take top 10 patterns
X_train_top_10 = X_train[['p8a', 'p5', 'p1', 'p3a', 'p8b', 'p4', 'p2', 'p8c', 'p20a', 'p10']]
X_train_top_10.head()

Unnamed: 0,p8a,p5,p1,p3a,p8b,p4,p2,p8c,p20a,p10
0,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,1,0,0,0,0,1
2,1,1,0,0,1,0,1,0,0,0
3,1,0,0,1,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0


In [7]:
# only take one hot encoded columns
X_train_one_hot = X_train.drop(['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], axis=1)
X_train_one_hot.head()

Unnamed: 0,p1,p10,p11,p12a,p12b,p12c,p13,p14,p15a,p15b,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [8]:
# only take one hot encoded columns
X_train_one_hot_more = X_train.drop(['instance', 'class', 'id', 'pids'], axis=1) 
X_train_one_hot_more.head()

Unnamed: 0,frequency,pidspread,pldspread,p1,p10,p11,p12a,p12b,p12c,p13,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,2,2,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,7,3,7,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
2,8,4,7,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0
3,2,2,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [9]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_train_one_hot_more_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_train_one_hot_more))
X_train_one_hot_more_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,1.5e-05,0.02,5.1e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,9.3e-05,0.04,0.000308,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,0.000108,0.06,0.000308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,1.5e-05,0.02,5.1e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# only take one hot encoded columns
X_train_one_hot_frequency = X_train[['frequency', 'pidspread', 'pldspread']]
X_train_one_hot_frequency.head()

Unnamed: 0,frequency,pidspread,pldspread
0,2,2,2
1,7,3,7
2,8,4,7
3,2,2,2
4,1,1,1


In [11]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_train_one_hot_frequency_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_train_one_hot_frequency))
X_train_one_hot_frequency_scaled.head()

Unnamed: 0,0,1,2
0,1.5e-05,0.02,5.1e-05
1,9.3e-05,0.04,0.000308
2,0.000108,0.06,0.000308
3,1.5e-05,0.02,5.1e-05
4,0.0,0.0,0.0


## 0.2 Testing

In [12]:
# read test data
test = pd.read_csv('/path/to/9_FINAL/data/machine_learning/two_class/one-hot-ecoding/test/one_hot_test.csv', sep=";")
del test['Unnamed: 0']

In [13]:
# get label
y_test = test['label']

In [14]:
# get variables
X_test = test.copy()
del X_test['label']
X_test.shape

(348121, 65)

In [15]:
# only take top 10 patterns
X_test_top_10 = X_test[['p8a', 'p5', 'p1', 'p3a', 'p8b', 'p4', 'p2', 'p8c', 'p20a', 'p10']]
X_test_top_10.head()

Unnamed: 0,p8a,p5,p1,p3a,p8b,p4,p2,p8c,p20a,p10
0,0,1,0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0


In [16]:
# only take one hot encoded columns
X_test_one_hot = X_test.drop(['instance', 'class', 'frequency', 'pidspread', 'pldspread', 'id', 'pids'], axis=1)
X_test_one_hot.head()

Unnamed: 0,p1,p10,p11,p12a,p12b,p12c,p13,p14,p15a,p15b,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# only take one hot encoded columns
X_test_one_hot_more = X_test.drop(['instance', 'class',  'id', 'pids'], axis=1)
X_test_one_hot_more.head()

Unnamed: 0,frequency,pidspread,pldspread,p1,p10,p11,p12a,p12b,p12c,p13,...,p4,p42,p43,p5,p6,p7,p8a,p8b,p8c,p8d
0,2,2,2,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,11,3,10,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,36,3,33,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4,13,2,13,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_test_one_hot_more_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_test_one_hot_more))
X_test_one_hot_more_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,1.5e-05,0.020833,0.000132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.000155,0.041667,0.00119,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.000541,0.041667,0.004232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,0.000185,0.020833,0.001587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# only take one hot encoded columns
X_test_one_hot_frequency = X_test[['frequency', 'pidspread', 'pldspread']]
X_test_one_hot_frequency.head()

Unnamed: 0,frequency,pidspread,pldspread
0,2,2,2
1,11,3,10
2,1,1,1
3,36,3,33
4,13,2,13


In [20]:
# scale
min_max_scaler = preprocessing.MinMaxScaler()
X_test_one_hot_frequency_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_test_one_hot_frequency))
X_test_one_hot_frequency_scaled.head()

Unnamed: 0,0,1,2
0,1.5e-05,0.020833,0.000132
1,0.000155,0.041667,0.00119
2,0.0,0.0,0.0
3,0.000541,0.041667,0.004232
4,0.000185,0.020833,0.001587


# 1. Naive Bayes

## 1.1 Only one hot encoded columns

In [39]:
def get_cross_validated_two_class_confusion_matrix(model, X_train, y_train, folds=5):
    """Get cross validated confusion matrix for two class problem"""
    # k-fold
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=88)
    kf.get_n_splits(X_train.values)
    
    overall_precision = []
    overall_recall = []
    overall_f1 = []
    
    # negative scores
    overall_negative_precision = []
    overall_negative_recall = []
    overall_negative_f1 = []
    
    for train_index, test_index in kf.split(X_train.values, y_train):
        # print("TRAIN:", train_index, "TEST:", test_index) # for debugging
        X_tr, X_te = X_train.values[train_index], X_train.values[test_index]
        y_tr, y_te = y_train[train_index], y_train[test_index]
        
        # fit model
        model.fit(X_tr, y_tr)
        
        # predicted score
        y_predict = model.predict(X_te)
        f1 = f1_score(y_te, y_predict)
        precision = precision_score(y_te, y_predict)
        recall = recall_score(y_te, y_predict)
        conf_matrix = confusion_matrix(y_te, y_predict)
        print(conf_matrix)
        print('Positive')
        print('Precision: {} || Recall: {} || F1: {}'.format(precision, recall, f1))
        
        overall_precision.append(precision)
        overall_recall.append(recall)
        overall_f1.append(f1)
        
        # for negative class (noise), flip labels
        
        precision = conf_matrix[0,0] / (conf_matrix[0,0] + conf_matrix[1,0])
        recall = conf_matrix[0,0] / (conf_matrix[0,0] + conf_matrix[0,1])
        f1 = (2*precision*recall)/(precision+recall)
        
        print('Negative')
        print('Precision: {} || Recall: {} || F1: {}'.format(precision, recall, f1))
        
        overall_negative_precision.append(precision)
        overall_negative_recall.append(recall)
        overall_negative_f1.append(f1)
    
    print('---------------------------')
    print('Positive')
    print('Overall Precision: {} (+/- {}) || Overall Recall: {} (+/- {}) || Overall F1: {} (+/- {})'.format(
                                                                            np.mean(overall_precision), np.std(overall_precision, axis=0),
                                                                           np.mean(overall_recall), np.std(overall_recall, axis=0),
                                                                            np.mean(overall_f1), np.std(overall_f1, axis=0)))
    
    print('Negative')
    print('Overall Precision: {} (+/- {}) || Overall Recall: {} (+/- {}) || Overall F1: {} (+/- {})'.format(
                                                                            np.mean(overall_negative_precision), np.std(overall_negative_precision, axis=0),
                                                                           np.mean(overall_negative_recall), np.std(overall_negative_recall, axis=0),
                                                                            np.mean(overall_negative_f1), np.std(overall_negative_f1, axis=0)))

In [21]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_two_class_confusion_matrix(nb, X_train_one_hot, y_train)

[[16611  6236]
 [11783 11064]]
Positive
Precision: 0.6395375722543353 || Recall: 0.4842648925460673 || F1: 0.5511744339552146
Negative
Precision: 0.5850179615411707 || Recall: 0.7270538801593207 || F1: 0.6483480025760621
[[16474  6373]
 [11893 10953]]
Positive
Precision: 0.63217130324368 || Recall: 0.4794274708920599 || F1: 0.5453051876929205
Negative
Precision: 0.5807452321359325 || Recall: 0.7210574692519806 || F1: 0.6433397117975553
[[16654  6193]
 [11937 10909]]
Positive
Precision: 0.6378786106888084 || Recall: 0.47750153199684847 || F1: 0.5461600080104135
Negative
Precision: 0.5824909936693365 || Recall: 0.7289359653346172 || F1: 0.6475368404681364
[[16538  6308]
 [11774 11073]]
Positive
Precision: 0.6370749669178989 || Recall: 0.48465881735019917 || F1: 0.5505120811375162
Negative
Precision: 0.5841339361401526 || Recall: 0.7238903965683271 || F1: 0.6465459947613277
[[16589  6257]
 [11928 10919]]
Positive
Precision: 0.6357126222636237 || Recall: 0.47791832625727665 || F1: 0.545636

## 1.2 Only one hot encoded columns with frequencies

In [22]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_two_class_confusion_matrix(nb, X_train_one_hot_more, y_train)

[[22328   519]
 [21257  1590]]
Positive
Precision: 0.7539118065433855 || Recall: 0.06959338206329059 || F1: 0.12742426670940857
Negative
Precision: 0.5122863370425605 || Recall: 0.9772836696283976 || F1: 0.6722061657032756
[[22292   555]
 [21271  1575]]
Positive
Precision: 0.7394366197183099 || Recall: 0.06893985818086316 || F1: 0.12612107623318386
Negative
Precision: 0.5117186603310149 || Recall: 0.9757079704118703 || F1: 0.6713446770064749
[[22278   569]
 [21273  1573]]
Positive
Precision: 0.734360410830999 || Recall: 0.0688523155038081 || F1: 0.12590043220745958
Negative
Precision: 0.5115381965970931 || Recall: 0.9750951984943319 || F1: 0.6710443085635109
[[22300   546]
 [21275  1572]]
Positive
Precision: 0.7422096317280453 || Recall: 0.06880553245502692 || F1: 0.12593631083516924
Negative
Precision: 0.5117613310384395 || Recall: 0.9761008491639674 || F1: 0.6714743831017298
[[22303   543]
 [21233  1614]]
Positive
Precision: 0.7482614742698191 || Recall: 0.07064384820764213 || F1: 0.

## 1.3 Only with frequencies

In [23]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_two_class_confusion_matrix(nb, X_train_one_hot_frequency, y_train)

[[22386   461]
 [21521  1326]]
Positive
Precision: 0.7420257414661444 || Recall: 0.05803825447542347 || F1: 0.10765608508565398
Negative
Precision: 0.5098503655453572 || Recall: 0.9798222961439138 || F1: 0.6707013811906403
[[22331   516]
 [21510  1336]]
Positive
Precision: 0.7213822894168467 || Recall: 0.05847850827278298 || F1: 0.10818689772451211
Negative
Precision: 0.5093633813097329 || Recall: 0.9774149778964415 || F1: 0.6697156909788868
[[22335   512]
 [21508  1338]]
Positive
Precision: 0.7232432432432433 || Recall: 0.05856605094983804 || F1: 0.10835762876579204
Negative
Precision: 0.5094313801519057 || Recall: 0.9775900555871668 || F1: 0.6698155645524068
[[22339   507]
 [21491  1356]]
Positive
Precision: 0.7278582930756844 || Recall: 0.05935133715586292 || F1: 0.10975313638203157
Negative
Precision: 0.5096737394478668 || Recall: 0.9778079313665412 || F1: 0.6700761893334934
[[22335   511]
 [21452  1395]]
Positive
Precision: 0.7318992654774397 || Recall: 0.06105834464043419 || F1: 

## 1.4 Scaled one hot encoded pids and frequencies

In [24]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_two_class_confusion_matrix(nb, X_train_one_hot_more_scaled, y_train)

[[16610  6237]
 [11780 11067]]
Positive
Precision: 0.6395631067961165 || Recall: 0.4843962008141113 || F1: 0.5512689596772186
Negative
Precision: 0.5850651637900669 || Recall: 0.7270101107366393 || F1: 0.6483595838944511
[[16493  6354]
 [11907 10939]]
Positive
Precision: 0.6325680911351413 || Recall: 0.4788146721526744 || F1: 0.5450559306410224
Negative
Precision: 0.5807394366197183 || Recall: 0.7218890882829255 || F1: 0.6436669463578355
[[16648  6199]
 [11934 10912]]
Positive
Precision: 0.6377184267430308 || Recall: 0.47763284601243106 || F1: 0.5461871511875266
Negative
Precision: 0.5824644881393884 || Recall: 0.7286733487985293 || F1: 0.6474168270819965
[[16545  6301]
 [11771 11076]]
Positive
Precision: 0.6373942567761984 || Recall: 0.4847901256182431 || F1: 0.5507159904534606
Negative
Precision: 0.5842986297499647 || Recall: 0.7241967959380198 || F1: 0.6467690864313357
[[16591  6255]
 [11931 10916]]
Positive
Precision: 0.6357230213732455 || Recall: 0.47778701798923273 || F1: 0.54555

## 1.5 Frequencies normalized

In [25]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_two_class_confusion_matrix(nb, X_train_one_hot_frequency_scaled, y_train)

[[19739  3108]
 [18604  4243]]
Positive
Precision: 0.5772003809005577 || Recall: 0.18571366043681883 || F1: 0.28101198754884427
Negative
Precision: 0.5148006154969617 || Recall: 0.8639646343064735 || F1: 0.6451707795391404
[[ 8887 13960]
 [ 9857 12989]]
Positive
Precision: 0.48198448922037923 || Recall: 0.5685459161341154 || F1: 0.5216989657596145
Negative
Precision: 0.47412505335040545 || Recall: 0.3889788593688449 || F1: 0.42735207136159264
[[ 8831 14016]
 [ 9832 13014]]
Positive
Precision: 0.48146503884572694 || Recall: 0.5696401995973037 || F1: 0.5218541984120619
Negative
Precision: 0.47318223222418687 || Recall: 0.3865277716986913 || F1: 0.42548783425680553
[[19703  3143]
 [18757  4090]]
Positive
Precision: 0.5654638462601963 || Recall: 0.17901693876657768 || F1: 0.27194148936170215
Negative
Precision: 0.5122984919396776 || Recall: 0.8624266830079664 || F1: 0.6427755847714742
[[19748  3098]
 [18538  4309]]
Positive
Precision: 0.5817469960847846 || Recall: 0.18860244233378562 || F1

## 1.6 Top 10 patterns

In [26]:
# fit NB model
nb = MultinomialNB()
util_ml.get_cross_validated_two_class_confusion_matrix(nb, X_train_top_10, y_train)

[[17113  5734]
 [12895  9952]]
Positive
Precision: 0.6344511028943006 || Recall: 0.43559329452444523 || F1: 0.5165442607635014
Negative
Precision: 0.5702812583311117 || Recall: 0.7490261303453407 || F1: 0.6475451707501655
[[14315  8532]
 [11951 10895]]
Positive
Precision: 0.5608174190559531 || Recall: 0.476888733257463 || F1: 0.515459040049204
Negative
Precision: 0.5450011421609685 || Recall: 0.6265592856830219 || F1: 0.5829413800826665
[[14484  8363]
 [12031 10815]]
Positive
Precision: 0.5639274168317864 || Recall: 0.47338702617526046 || F1: 0.5147058823529411
Negative
Precision: 0.5462568357533472 || Recall: 0.633956318116164 || F1: 0.58684818281269
[[17075  5771]
 [12862  9985]]
Positive
Precision: 0.6337268342218837 || Recall: 0.4370376854729286 || F1: 0.517317306945056
Negative
Precision: 0.5703644319738117 || Recall: 0.7473956053576118 || F1: 0.6469886137582177
[[17129  5717]
 [12900  9947]]
Positive
Precision: 0.6350229826353422 || Recall: 0.43537444741103865 || F1: 0.5165796785

# 2. Decision Trees

## 2.1 Only one hot encoded columns

In [27]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(dt, X_train_one_hot, y_train)

[[11215 11632]
 [ 5896 16951]]
Positive
Precision: 0.5930448168491761 || Recall: 0.7419354838709677 || F1: 0.6591872447987557
Negative
Precision: 0.6554263339372334 || Recall: 0.49087407537094585 || F1: 0.5613394063766955
[[11281 11566]
 [ 6006 16840]]
Positive
Precision: 0.5928325001760192 || Recall: 0.7371093408036418 || F1: 0.6571450870209944
Negative
Precision: 0.6525712963498582 || Recall: 0.49376285726791264 || F1: 0.5621667414162556
[[11225 11622]
 [ 5797 17049]]
Positive
Precision: 0.5946426702940253 || Recall: 0.746257550555896 || F1: 0.6618786031795328
Negative
Precision: 0.6594407237692398 || Recall: 0.491311769597759 || F1: 0.5630941332865134
[[11263 11583]
 [ 5893 16954]]
Positive
Precision: 0.5941058976066159 || Recall: 0.7420667921390117 || F1: 0.6598941304686283
Negative
Precision: 0.6565050128235019 || Recall: 0.49299658583559486 || F1: 0.5631218439078045
[[11350 11496]
 [ 5782 17065]]
Positive
Precision: 0.5974930849760163 || Recall: 0.7469251980566376 || F1: 0.663904

## 2.2 Only one hot encoded columns with frequencies

In [28]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(dt, X_train_one_hot_more, y_train)

[[11457 11390]
 [ 6488 16359]]
Positive
Precision: 0.5895347580092977 || Recall: 0.7160239856436293 || F1: 0.6466519092418372
Negative
Precision: 0.6384508219559766 || Recall: 0.501466275659824 || F1: 0.5617277897626985
[[11532 11315]
 [ 6486 16360]]
Positive
Precision: 0.5911472448057814 || Recall: 0.7160990983104263 || F1: 0.6476514716652482
Negative
Precision: 0.64002664002664 || Recall: 0.5047489823609227 || F1: 0.5643949590113789
[[11602 11245]
 [ 6602 16244]]
Positive
Precision: 0.5909272800029103 || Recall: 0.7110216230412326 || F1: 0.6454355816032582
Negative
Precision: 0.6373324544056251 || Recall: 0.5078128419486146 || F1: 0.5652481060144698
[[11595 11251]
 [ 6528 16319]]
Positive
Precision: 0.591911498005078 || Recall: 0.7142732087363768 || F1: 0.6473610091834104
Negative
Precision: 0.6397947359708658 || Recall: 0.5075286702267355 || F1: 0.5660377358490566
[[11749 11097]
 [ 6646 16201]]
Positive
Precision: 0.5934867023225144 || Recall: 0.7091084168599816 || F1: 0.64616611825

## 2.3 Only with frequencies

In [29]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(dt, X_train_one_hot_frequency, y_train)

[[16471  6376]
 [13580  9267]]
Positive
Precision: 0.5924055488077734 || Recall: 0.40561123998774457 || F1: 0.4815276695245518
Negative
Precision: 0.548101560680177 || Recall: 0.7209261609839366 || F1: 0.6227456614616809
[[16169  6678]
 [13446  9400]]
Positive
Precision: 0.5846498320686653 || Recall: 0.4114505821588024 || F1: 0.4829924982016237
Negative
Precision: 0.5459733243288873 || Recall: 0.7077077953341795 || F1: 0.6164080667912012
[[16150  6697]
 [13470  9376]]
Positive
Precision: 0.5833385180115722 || Recall: 0.41040007003414164 || F1: 0.4818212184280171
Negative
Precision: 0.5452397029034436 || Recall: 0.7068761763032345 || F1: 0.615625059561248
[[16512  6334]
 [13852  8995]]
Positive
Precision: 0.586796268510666 || Recall: 0.3937059570184269 || F1: 0.4712384744341995
Negative
Precision: 0.5438018706362798 || Recall: 0.7227523417666112 || F1: 0.6206352189438075
[[16355  6491]
 [13538  9309]]
Positive
Precision: 0.5891772151898734 || Recall: 0.4074495557403598 || F1: 0.48174502

## 2.4 Scaled one hot encoded pids and frequencies

In [30]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(dt, X_train_one_hot_more_scaled, y_train)

[[11452 11395]
 [ 6491 16356]]
Positive
Precision: 0.5893841663363483 || Recall: 0.7158926773755854 || F1: 0.6465077671054191
Negative
Precision: 0.6382433260881681 || Recall: 0.5012474285464175 || F1: 0.5615101740622701
[[11534 11313]
 [ 6482 16364]]
Positive
Precision: 0.5912490515590563 || Recall: 0.7162741836645364 || F1: 0.6477841775033152
Negative
Precision: 0.640208703374778 || Recall: 0.5048365212062853 || F1: 0.5645204708415927
[[11600 11247]
 [ 6605 16241]]
Positive
Precision: 0.5908396391152503 || Recall: 0.71089030902565 || F1: 0.6453292009377358
Negative
Precision: 0.6371875858280692 || Recall: 0.507725303103252 || F1: 0.5651368995420443
[[11597 11249]
 [ 6529 16318]]
Positive
Precision: 0.5919396379729387 || Recall: 0.7142294393136954 || F1: 0.6473598603562503
Negative
Precision: 0.6397991834933245 || Recall: 0.5076162129037906 || F1: 0.5660939177975202
[[11746 11100]
 [ 6647 16200]]
Positive
Precision: 0.5934065934065934 || Recall: 0.7090646474373004 || F1: 0.64610046463

## 2.5 Frequencies normalized

In [31]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(dt, X_train_one_hot_frequency_scaled, y_train)

[[16475  6372]
 [13575  9272]]
Positive
Precision: 0.5926872922526208 || Recall: 0.40583008710115115 || F1: 0.4817749603803486
Negative
Precision: 0.5482529118136439 || Recall: 0.7211012386746619 || F1: 0.6229086715692762
[[16167  6680]
 [13448  9398]]
Positive
Precision: 0.584525438487374 || Recall: 0.41136303948174735 || F1: 0.48288973384030415
Negative
Precision: 0.5459057909842985 || Recall: 0.7076202564888169 || F1: 0.6163318211276734
[[16154  6693]
 [13470  9376]]
Positive
Precision: 0.5834837264297716 || Recall: 0.41040007003414164 || F1: 0.4818707439290762
Negative
Precision: 0.54530110721037 || Recall: 0.7070512539939598 || F1: 0.6157305940424235
[[16521  6325]
 [13848  8999]]
Positive
Precision: 0.5872487601148525 || Recall: 0.3938810347091522 || F1: 0.47150978491524975
Negative
Precision: 0.5440086930751753 || Recall: 0.723146283813359 || F1: 0.6209151555012684
[[16355  6491]
 [13542  9305]]
Positive
Precision: 0.5890731830843251 || Recall: 0.4072744780496345 || F1: 0.481587

## 2.6 Top 10 patterns

In [32]:
# fit DT model
dt = DecisionTreeClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(dt, X_train_top_10, y_train)

[[10676 12171]
 [ 5474 17373]]
Positive
Precision: 0.588038180341186 || Recall: 0.7604061802424826 || F1: 0.6632055123971674
Negative
Precision: 0.6610526315789473 || Recall: 0.46728235654571715 || F1: 0.5475292971254199
[[10574 12273]
 [ 5361 17485]]
Positive
Precision: 0.5875730895893542 || Recall: 0.7653418541539 || F1: 0.6647783438521786
Negative
Precision: 0.6635707561970505 || Recall: 0.462817875432223 || F1: 0.5453045227167244
[[10745 12102]
 [ 5441 17405]]
Positive
Precision: 0.5898600332124581 || Recall: 0.7618401470716974 || F1: 0.664909365270376
Negative
Precision: 0.6638452984060299 || Recall: 0.4703024467107279 || F1: 0.5505597827479312
[[10741 12105]
 [ 5479 17368]]
Positive
Precision: 0.5892851084043023 || Recall: 0.760187333129076 || F1: 0.6639143730886851
Negative
Precision: 0.6622071516646116 || Recall: 0.47014794712422303 || F1: 0.5498899298622844
[[10797 12049]
 [ 5398 17449]]
Positive
Precision: 0.5915316292630008 || Recall: 0.7637326563662625 || F1: 0.666692138695

# 3. Random Forest

## 3.1 Only one hot encoded columns

In [33]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(rf, X_train_one_hot, y_train)

[[11289 11558]
 [ 5420 17427]]
Positive
Precision: 0.6012420217353803 || Recall: 0.7627697290672736 || F1: 0.6724417348356228
Negative
Precision: 0.6756239152552517 || Recall: 0.49411301264936314 || F1: 0.5707857215087471
[[11295 11552]
 [ 5416 17430]]
Positive
Precision: 0.6014077703402112 || Recall: 0.7629344305348857 || F1: 0.6726094003241491
Negative
Precision: 0.6759021004129017 || Recall: 0.494375629185451 || F1: 0.5710602153799484
[[11240 11607]
 [ 5265 17581]]
Positive
Precision: 0.602336576675346 || Recall: 0.7695439026525431 || F1: 0.6757504708459853
Negative
Precision: 0.6810057558315662 || Recall: 0.4919683109379787 || F1: 0.5712543199837365
[[11271 11575]
 [ 5327 17520]]
Positive
Precision: 0.6021653205018044 || Recall: 0.7668402853766358 || F1: 0.6745985907358206
Negative
Precision: 0.6790577177973249 || Recall: 0.4933467565438151 || F1: 0.5714937633100091
[[11369 11477]
 [ 5289 17558]]
Positive
Precision: 0.6047184432581367 || Recall: 0.7685035234385258 || F1: 0.67684360

## 3.2 Only one hot encoded columns with frequencies

In [34]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(rf, X_train_one_hot_more, y_train)

[[11557 11290]
 [ 5833 17014]]
Positive
Precision: 0.6011164499717354 || Recall: 0.7446929574998906 || F1: 0.6652460362456257
Negative
Precision: 0.6645773433007476 || Recall: 0.5058432179279555 || F1: 0.5744464050500783
[[11592 11255]
 [ 5729 17117]]
Positive
Precision: 0.6033060764133653 || Recall: 0.7492340015757681 || F1: 0.6683978288882815
Negative
Precision: 0.669245424629063 || Recall: 0.5073751477218016 || F1: 0.5771758613821948
[[11749 11098]
 [ 5830 17016]]
Positive
Precision: 0.6052500533542008 || Recall: 0.7448130963844874 || F1: 0.6678178963893251
Negative
Precision: 0.6683542863644121 || Recall: 0.514246947082768 || F1: 0.5812595854153267
[[11684 11162]
 [ 5745 17102]]
Positive
Precision: 0.605080667987546 || Recall: 0.7485446666958463 || F1: 0.6692101504568488
Negative
Precision: 0.6703769579436571 || Recall: 0.5114243193556859 || F1: 0.5802110490378647
[[11861 10985]
 [ 5944 16903]]
Positive
Precision: 0.6061029833620195 || Recall: 0.7398345515822646 || F1: 0.6663250221

## 3.3 Only with frequencies

In [35]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(rf, X_train_one_hot_frequency, y_train)

[[16229  6618]
 [13152  9695]]
Positive
Precision: 0.5943112854778397 || Recall: 0.42434455289534734 || F1: 0.49514811031664957
Negative
Precision: 0.5523637725060413 || Recall: 0.7103339606950584 || F1: 0.6214674121161063
[[16049  6798]
 [13107  9739]]
Positive
Precision: 0.588921811694987 || Recall: 0.42628906591963583 || F1: 0.4945788792118427
Negative
Precision: 0.5504527370009603 || Recall: 0.7024554646124218 || F1: 0.617233621137242
[[15996  6851]
 [13070  9776]]
Positive
Precision: 0.5879593432369038 || Recall: 0.4279086054451545 || F1: 0.49532591898259565
Negative
Precision: 0.5503337232505333 || Recall: 0.7001356852103121 || F1: 0.6162618226648432
[[16340  6506]
 [13470  9377]]
Positive
Precision: 0.5903796511993956 || Recall: 0.4104258764826892 || F1: 0.4842241156726052
Negative
Precision: 0.5481382086548138 || Recall: 0.7152236715398756 || F1: 0.6206320267395928
[[16185  6661]
 [13148  9699]]
Positive
Precision: 0.5928484107579463 || Recall: 0.42451963058607256 || F1: 0.4947

## 3.4 Scaled one hot encoded pids and frequencies

In [36]:
# fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(rf, X_train_one_hot_more_scaled, y_train)

[[11563 11284]
 [ 5839 17008]]
Positive
Precision: 0.6011593383288563 || Recall: 0.7444303409638027 || F1: 0.6651674846985667
Negative
Precision: 0.6644638547293414 || Recall: 0.5061058344640434 || F1: 0.5745732813237595
[[11596 11251]
 [ 5738 17108]]
Positive
Precision: 0.6032652773370006 || Recall: 0.7488400595290204 || F1: 0.668215994531784
Negative
Precision: 0.6689742702203761 || Recall: 0.5075502254125268 || F1: 0.5771882232896145
[[11752 11095]
 [ 5830 17016]]
Positive
Precision: 0.6053146455124329 || Recall: 0.7448130963844874 || F1: 0.667857212944247
Negative
Precision: 0.6684108747582755 || Recall: 0.5143782553508119 || F1: 0.5813648618565881
[[11684 11162]
 [ 5738 17109]]
Positive
Precision: 0.6051784514166461 || Recall: 0.7488510526546155 || F1: 0.6693923862435933
Negative
Precision: 0.6706463092641488 || Recall: 0.5114243193556859 || F1: 0.580311910201649
[[11857 10989]
 [ 5943 16904]]
Positive
Precision: 0.6060301867852149 || Recall: 0.739878321004946 || F1: 0.66629877808

## 3.5 Frequencies normalized

In [37]:
## fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(rf, X_train_one_hot_frequency_scaled, y_train)

[[16231  6616]
 [13151  9696]]
Positive
Precision: 0.594409024031388 || Recall: 0.42438832231802864 || F1: 0.4952118286983835
Negative
Precision: 0.5524130419985025 || Recall: 0.7104214995404211 || F1: 0.6215320990254456
[[16044  6803]
 [13099  9747]]
Positive
Precision: 0.5889425981873112 || Recall: 0.4266392366278561 || F1: 0.49482180932074327
Negative
Precision: 0.5505267131043475 || Recall: 0.7022366174990152 || F1: 0.6171956145412579
[[16001  6846]
 [13073  9773]]
Positive
Precision: 0.5880618569107647 || Recall: 0.42777729142957194 || F1: 0.49527429367794246
Negative
Precision: 0.5503542684185183 || Recall: 0.7003545323237187 || F1: 0.6163594691935826
[[16349  6497]
 [13470  9377]]
Positive
Precision: 0.5907143757087061 || Recall: 0.4104258764826892 || F1: 0.4843366648588621
Negative
Precision: 0.5482745900264931 || Recall: 0.7156176135866235 || F1: 0.6208677489793981
[[16187  6659]
 [13143  9704]]
Positive
Precision: 0.5930452850944203 || Recall: 0.42473847769947914 || F1: 0.494

## 3.6 Top 10 patterns

In [38]:
## fit random forest model
rf = RandomForestClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(rf, X_train_top_10, y_train)

[[10637 12210]
 [ 5424 17423]]
Positive
Precision: 0.5879593696217056 || Recall: 0.7625946513765484 || F1: 0.6639862804878048
Negative
Precision: 0.6622875287964635 || Recall: 0.4655753490611459 || F1: 0.5467770124396011
[[10557 12290]
 [ 5338 17508]]
Positive
Precision: 0.587556211826297 || Recall: 0.7663485949400333 || F1: 0.6651470253020287
Negative
Precision: 0.6641711229946524 || Recall: 0.4620737952466407 || F1: 0.5449899334056064
[[10712 12135]
 [ 5402 17444]]
Positive
Precision: 0.5897427228777173 || Recall: 0.7635472292742712 || F1: 0.6654840247973296
Negative
Precision: 0.6647635596375823 || Recall: 0.4688580557622445 || F1: 0.5498832165498833
[[10717 12129]
 [ 5451 17396]]
Positive
Precision: 0.5891955969517358 || Recall: 0.7614128769641528 || F1: 0.664324448178416
Negative
Precision: 0.6628525482434439 || Recall: 0.4690974349995623 || F1: 0.5493925257599835
[[10781 12065]
 [ 5370 17477]]
Positive
Precision: 0.5915984022747275 || Recall: 0.7649582002013393 || F1: 0.667201130

# 4. Neural Network

## 4.1 Only one hot encoded columns

In [39]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(nnet, X_train_one_hot, y_train)

[[11105 11742]
 [ 5218 17629]]
Positive
Precision: 0.6002179020121889 || Recall: 0.7716111524488992 || F1: 0.6752077827569039
Negative
Precision: 0.6803283710102309 || Recall: 0.4860594388760012 || F1: 0.5670155731427112
[[11185 11662]
 [ 5309 17537]]
Positive
Precision: 0.6006027603685058 || Recall: 0.7676179637573317 || F1: 0.6739168027668364
Negative
Precision: 0.6781253789256699 || Recall: 0.4895609926905064 || F1: 0.5686179812409445
[[11423 11424]
 [ 5381 17465]]
Positive
Precision: 0.6045553670947419 || Recall: 0.7644664273833494 || F1: 0.6751715473083985
Negative
Precision: 0.6797786241371102 || Recall: 0.4999781152886593 || F1: 0.5761771455953191




[[13860  8986]
 [ 8242 14605]]
Positive
Precision: 0.6190920266203213 || Recall: 0.6392524182606032 || F1: 0.6290107239760541
Negative
Precision: 0.6270925708080717 || Recall: 0.6066707519915959 || F1: 0.6167126457239477




[[11507 11339]
 [ 5379 17468]]
Positive
Precision: 0.6063803936543202 || Recall: 0.7645642753972075 || F1: 0.6763464591319163
Negative
Precision: 0.6814520904891627 || Recall: 0.5036767924363127 || F1: 0.5792308466727071
---------------------------
Positive
Overall Precision: 0.6061696899500155 (+/- 0.006870978154166128) || Overall Recall: 0.7415024474494782 (+/- 0.05119143494880295) || Overall F1: 0.6659306631880219 (+/- 0.018475973742967455)
Negative
Overall Precision: 0.6693554070740491 (+/- 0.021158633588476156) || Overall Recall: 0.517189218256615 (+/- 0.04520649203091241) || Overall F1: 0.5815508384751259 (+/- 0.018161191374227)


## 4.2 Only one hot encoded columns with frequencies

In [40]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(nnet, X_train_one_hot_more, y_train)

[[11468 11379]
 [ 5281 17566]]
Positive
Precision: 0.6068751079633788 || Recall: 0.7688536788199763 || F1: 0.6783286994130369
Negative
Precision: 0.6846975938862022 || Recall: 0.5019477393093185 || F1: 0.5792504293362966
[[15978  6869]
 [10097 12749]]
Positive
Precision: 0.6498623712916709 || Recall: 0.5580407948875077 || F1: 0.6004615674453655
Negative
Precision: 0.612770853307766 || Recall: 0.6993478356020484 || F1: 0.6532030579289481
[[11450 11397]
 [ 5100 17746]]
Positive
Precision: 0.6089283876059431 || Recall: 0.7767661735095859 || F1: 0.6826828752236049
Negative
Precision: 0.6918429003021148 || Recall: 0.5011598897010549 || F1: 0.5812625326801533
[[11765 11081]
 [ 5481 17366]]
Positive
Precision: 0.6104685907125532 || Recall: 0.7600997942837134 || F1: 0.6771162319179632
Negative
Precision: 0.6821871738374116 || Recall: 0.514969797776416 || F1: 0.586900129701686
[[12341 10505]
 [ 5926 16921]]
Positive
Precision: 0.6169692992051338 || Recall: 0.7406224011905282 || F1: 0.6731645217

## 4.3 Only with frequencies

In [41]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(nnet, X_train_one_hot_frequency, y_train)

[[15835  7012]
 [12422 10425]]
Positive
Precision: 0.5978666054940643 || Recall: 0.45629623145270715 || F1: 0.5175752159666369
Negative
Precision: 0.5603921152280851 || Recall: 0.6930888081586204 || F1: 0.619716656230432
[[15260  7587]
 [11966 10880]]
Positive
Precision: 0.5891590404505334 || Recall: 0.47623216317955 || F1: 0.5267107205964223
Negative
Precision: 0.5604936457797693 || Recall: 0.6679213901168644 || F1: 0.6095101152317616
[[15845  7002]
 [12661 10185]]
Positive
Precision: 0.5925990574271252 || Recall: 0.44581108290291516 || F1: 0.5088302150725651
Negative
Precision: 0.5558478916719287 || Recall: 0.6935265023854336 || F1: 0.6171012404338598
[[15706  7140]
 [12637 10210]]
Positive
Precision: 0.5884726224783862 || Recall: 0.4468858055762244 || F1: 0.5079981093116401
Negative
Precision: 0.5541403521151607 || Recall: 0.6874726429134203 || F1: 0.6136474633221981
[[16028  6818]
 [12703 10144]]
Positive
Precision: 0.5980426836457965 || Recall: 0.4439970236792577 || F1: 0.50963349

## 4.4 Scaled one hot encoded pids and frequencies

In [42]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(nnet, X_train_one_hot_more_scaled, y_train)

[[11464 11383]
 [ 5572 17275]]
Positive
Precision: 0.6027985204829367 || Recall: 0.7561167768197138 || F1: 0.6708086593534608
Negative
Precision: 0.6729279173514909 || Recall: 0.5017726616185932 || F1: 0.574881528470777




[[11897 10950]
 [ 6068 16778]]
Positive
Precision: 0.6050923254472014 || Recall: 0.7343955178149347 || F1: 0.6635029857238897
Negative
Precision: 0.6622321180072362 || Recall: 0.5207248216396025 || F1: 0.5830147995687541




[[11531 11316]
 [ 5489 17357]]
Positive
Precision: 0.6053430056150385 || Recall: 0.7597391228223759 || F1: 0.6738096624546284
Negative
Precision: 0.677497062279671 || Recall: 0.5047052129382413 || F1: 0.5784734241352497
[[11512 11334]
 [ 5534 17313]]
Positive
Precision: 0.6043564771180229 || Recall: 0.7577800148816037 || F1: 0.6724278556725055
Negative
Precision: 0.6753490554968907 || Recall: 0.5038956491289504 || F1: 0.5771583274842074
[[11826 11020]
 [ 5679 17168]]
Positive
Precision: 0.6090534979423868 || Recall: 0.751433448592813 || F1: 0.672793181150191
Negative
Precision: 0.6755784061696658 || Recall: 0.5176398494265955 || F1: 0.5861564769150702
---------------------------
Positive
Overall Precision: 0.6053287653211172 (+/- 0.0020628728991827854) || Overall Recall: 0.7518929761862883 (+/- 0.00916986312321478) || Overall F1: 0.670668468870935 (+/- 0.0037105456046080316)
Negative
Overall Precision: 0.6727169118609909 (+/- 0.005439514560998009) || Overall Recall: 0.5097476389503965 

## 4.5 Frequencies normalized

In [43]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(nnet, X_train_one_hot_frequency_scaled, y_train)

[[17162  5685]
 [14366  8481]]
Positive
Precision: 0.5986869970351546 || Recall: 0.3712084737602311 || F1: 0.45827141815037953
Negative
Precision: 0.5443415376807916 || Recall: 0.7511708320567252 || F1: 0.6312459770114943
[[17168  5679]
 [14428  8418]]
Positive
Precision: 0.597148329431794 || Recall: 0.3684671277247658 || F1: 0.45572909617518875
Negative
Precision: 0.5433599189770857 || Recall: 0.751433448592813 || F1: 0.6306779567621181
[[18860  3987]
 [16061  6785]]
Positive
Precision: 0.6298737467508355 || Recall: 0.2969885319093058 || F1: 0.40365280504491646
Negative
Precision: 0.5400761719309298 || Recall: 0.8254913117695978 || F1: 0.652956654203019
[[18895  3951]
 [16303  6544]]
Positive
Precision: 0.623535016674607 || Recall: 0.28642710202652427 || F1: 0.39253794013556476
Negative
Precision: 0.5368202738791977 || Recall: 0.8270594414777204 || F1: 0.6510578182068776
[[19700  3146]
 [17080  5767]]
Positive
Precision: 0.6470324245484125 || Recall: 0.25241826060314265 || F1: 0.36316

## 4.6 Top 10 patterns

In [44]:
# fit nnet
nnet = MLPClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(nnet, X_train_top_10, y_train)

[[10468 12379]
 [ 5240 17607]]
Positive
Precision: 0.5871740145401187 || Recall: 0.7706482251499103 || F1: 0.6665152461529725
Negative
Precision: 0.6664120193531958 || Recall: 0.4581783166280037 || F1: 0.5430164699779536
[[10448 12399]
 [ 5191 17655]]
Positive
Precision: 0.5874426033140348 || Recall: 0.7727829817035805 || F1: 0.6674858223062382
Negative
Precision: 0.6680734062280197 || Recall: 0.4573029281743774 || F1: 0.5429506833653797
[[10571 12276]
 [ 5237 17609]]
Positive
Precision: 0.5892253638949305 || Recall: 0.7707695001313141 || F1: 0.6678803739735639
Negative
Precision: 0.6687120445344129 || Recall: 0.4626865671641791 || F1: 0.5469408873366963
[[10555 12291]
 [ 5256 17591]]
Positive
Precision: 0.5886821497891708 || Recall: 0.7699479143870093 || F1: 0.6672229702820079
Negative
Precision: 0.6675732085257099 || Recall: 0.4620064781581021 || F1: 0.5460847970613343
[[10771 12075]
 [ 5300 17547]]
Positive
Precision: 0.5923637836742961 || Recall: 0.7680220597890314 || F1: 0.6688520

# 5. Logistic Regression

## 5.1 Only one hot encoded columns

In [45]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(log_reg, X_train_one_hot, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11500 11347]
 [ 5541 17306]]
Positive
Precision: 0.6039856210518968 || Recall: 0.7574736289228345 || F1: 0.6720776699029126
Negative
Precision: 0.674843025644035 || Recall: 0.5033483608351206 || F1: 0.5766145206578419


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11728 11119]
 [ 5716 17130]]
Positive
Precision: 0.6063931466600587 || Recall: 0.7498030289766261 || F1: 0.6705157060377728
Negative
Precision: 0.6723228617289613 || Recall: 0.5133277892064604 || F1: 0.5821647514333226


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11701 11146]
 [ 5643 17203]]
Positive
Precision: 0.6068291650499136 || Recall: 0.752998336689136 || F1: 0.6720578181463034
Negative
Precision: 0.6746425276752768 || Recall: 0.5121460147940649 || F1: 0.5822696623622204


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11527 11319]
 [ 5577 17270]]
Positive
Precision: 0.6040784917275875 || Recall: 0.7558979297063072 || F1: 0.6715141146278871
Negative
Precision: 0.6739359214218896 || Recall: 0.5045522192068633 || F1: 0.5770713391739675
[[11707 11139]
 [ 5603 17244]]
Positive
Precision: 0.6075467709544445 || Recall: 0.754759924716593 || F1: 0.6731992972867461
Negative
Precision: 0.67631426920855 || Recall: 0.5124310601418192 || F1: 0.5830760035860146
---------------------------
Positive
Overall Precision: 0.6057666390887803 (+/- 0.0014637082615679314) || Overall Recall: 0.7541865698022994 (+/- 0.0026339921921769696) || Overall F1: 0.6718729212003244 (+/- 0.0008724858266309528)
Negative
Overall Precision: 0.6744117211357425 (+/- 0.0012999002573974314) || Overall Recall: 0.5091610888368656 (+/- 0.004289367630870331) || Overall F1: 0.5802392554426734 (+/- 0.002794695791802736)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5.2 Only one hot encoded columns with frequencies

In [46]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(log_reg, X_train_one_hot_more, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11638 11209]
 [ 5655 17192]]
Positive
Precision: 0.6053307982113306 || Recall: 0.7524839147371646 || F1: 0.6709334998438964
Negative
Precision: 0.6729890707222576 || Recall: 0.509388541165142 || F1: 0.5798704534130542


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11628 11219]
 [ 5654 17192]]
Positive
Precision: 0.6051177360881349 || Recall: 0.7525168519653331 || F1: 0.6708156934662582
Negative
Precision: 0.6728387918065039 || Recall: 0.5089508469383289 || F1: 0.5795310124847367


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11631 11216]
 [ 5566 17280]]
Positive
Precision: 0.6064008983717013 || Recall: 0.756368729755756 || F1: 0.6731331073974524
Negative
Precision: 0.6763388963191255 || Recall: 0.5090821552063728 || F1: 0.5809109979023074


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11591 11255]
 [ 5658 17189]]
Positive
Precision: 0.6043102235972437 || Recall: 0.7523526064691207 || F1: 0.670254040669903
Negative
Precision: 0.6719809844048931 || Recall: 0.5073535848726254 || F1: 0.5781768300286819
[[11641 11205]
 [ 5518 17329]]
Positive
Precision: 0.6073105768556809 || Recall: 0.7584803256445047 || F1: 0.6745294953387438
Negative
Precision: 0.6784194883151698 || Recall: 0.509542151799002 || F1: 0.5819772528433945
---------------------------
Positive
Overall Precision: 0.6056940466248183 (+/- 0.0010480323273261676) || Overall Recall: 0.7544404857143758 (+/- 0.0025269035706283077) || Overall F1: 0.6719331673432507 (+/- 0.0016277791345285417)
Negative
Overall Precision: 0.67451344631359 (+/- 0.0024548313107572326) || Overall Recall: 0.5088634559962941 (+/- 0.0007837754472404327) || Overall F1: 0.5800933093344349 (+/- 0.0012850364335095568)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5.3 Only with frequencies

In [47]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(log_reg, X_train_one_hot_frequency, y_train)

[[18775  4072]
 [17339  5508]]
Positive
Precision: 0.5749478079331941 || Recall: 0.2410819801286821 || F1: 0.33971690258118237
Negative
Precision: 0.5198814864041645 || Recall: 0.821770910841686 || F1: 0.6368616543138684
[[18805  4042]
 [17358  5488]]
Positive
Precision: 0.5758656873032529 || Recall: 0.24021710583909656 || F1: 0.33901655547319
Negative
Precision: 0.5200066366175372 || Recall: 0.8230839935221255 || F1: 0.6373496017624131
[[18747  4100]
 [17402  5444]]
Positive
Precision: 0.5704107292539815 || Recall: 0.23829116694388514 || F1: 0.3361531336832355
Negative
Precision: 0.5186035574981327 || Recall: 0.8205453670066092 || F1: 0.6355346125161028
[[18746  4100]
 [17280  5567]]
Positive
Precision: 0.5758766939071066 || Recall: 0.24366437606687968 || F1: 0.3424371040167313
Negative
Precision: 0.5203464164769889 || Recall: 0.8205375120371181 || F1: 0.6368392444625629
[[18730  4116]
 [17418  5429]]
Positive
Precision: 0.5687794656888423 || Recall: 0.23762419573685822 || F1: 0.33520

## 5.4 Scaled one hot encoded pids and frequencies

In [48]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(log_reg, X_train_one_hot_more_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11500 11347]
 [ 5542 17305]]
Positive
Precision: 0.6039717995253385 || Recall: 0.7574298595001532 || F1: 0.6720518845026117
Negative
Precision: 0.6748034268278371 || Recall: 0.5033483608351206 || F1: 0.576600065180877


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11734 11113]
 [ 5716 17130]]
Positive
Precision: 0.606521970045675 || Recall: 0.7498030289766261 || F1: 0.6705944528176319
Negative
Precision: 0.6724355300859599 || Recall: 0.5135904057425482 || F1: 0.582375859245105


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11598 11249]
 [ 5514 17332]]
Positive
Precision: 0.606416850355131 || Recall: 0.7586448393591876 || F1: 0.6740428179749937
Negative
Precision: 0.6777699859747546 || Recall: 0.5076377642578894 || F1: 0.5804950073825671


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11534 11312]
 [ 5586 17261]]
Positive
Precision: 0.6041017744024079 || Recall: 0.7555040049021753 || F1: 0.6713730066122131
Negative
Precision: 0.673714953271028 || Recall: 0.5048586185765561 || F1: 0.5771906120202173


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[11710 11136]
 [ 5599 17248]]
Positive
Precision: 0.6076662908680946 || Recall: 0.7549350024073183 || F1: 0.6733423122718666
Negative
Precision: 0.6765266624299497 || Recall: 0.5125623741574017 || F1: 0.5832399452123024
---------------------------
Positive
Overall Precision: 0.6057357370393295 (+/- 0.0014553642311881635) || Overall Recall: 0.7552633470290921 (+/- 0.003036743858226065) || Overall F1: 0.6722808948358633 (+/- 0.0012610926660551176)
Negative
Overall Precision: 0.675050111717906 (+/- 0.0019108875739651573) || Overall Recall: 0.5083995047139032 (+/- 0.004072029300900211) || Overall F1: 0.5799802978082138 (+/- 0.002677209784588974)


## 5.5 Frequencies normalized

In [49]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(log_reg, X_train_one_hot_frequency_scaled, y_train)

[[18772  4075]
 [17309  5538]]
Positive
Precision: 0.576094871528139 || Recall: 0.24239506280912154 || F1: 0.3412199630314233
Negative
Precision: 0.5202738283307003 || Recall: 0.8216396025736421 || F1: 0.637116481129514
[[18798  4049]
 [17325  5521]]
Positive
Precision: 0.5769070010449321 || Recall: 0.24166156001050512 || F1: 0.34063425468904246
Negative
Precision: 0.5203886720372062 || Recall: 0.8227776075633563 || F1: 0.6375445141597423
[[18725  4122]
 [17336  5510]]
Positive
Precision: 0.5720514950166113 || Recall: 0.24118007528670227 || F1: 0.33930660754972597
Negative
Precision: 0.5192590333046782 || Recall: 0.8195824397076202 || F1: 0.6357370815508929
[[18722  4124]
 [17213  5634]]
Positive
Precision: 0.577372412379586 || Recall: 0.24659692738652778 || F1: 0.3455911669989265
Negative
Precision: 0.5209962432169194 || Recall: 0.8194869999124573 || F1: 0.637008557186846
[[18689  4157]
 [17348  5499]]
Positive
Precision: 0.5694904722452361 || Recall: 0.24068805532455026 || F1: 0.3383

## 5.6 Top 10 patterns

In [50]:
# fit LogReg
log_reg = LogisticRegression(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(log_reg, X_train_top_10, y_train)

[[11606 11241]
 [ 6547 16300]]
Positive
Precision: 0.5918448858066155 || Recall: 0.7134415897054318 || F1: 0.6469794395490991
Negative
Precision: 0.6393433592243706 || Recall: 0.50798791963934 || F1: 0.5661463414634146
[[11689 11158]
 [ 6563 16283]]
Positive
Precision: 0.593382165372982 || Recall: 0.7127287052438064 || F1: 0.6476027601567006
Negative
Precision: 0.6404229673460443 || Recall: 0.5116207817218891 || F1: 0.5688216258303123
[[11530 11317]
 [ 6364 16482]]
Positive
Precision: 0.5928990251447894 || Recall: 0.7214392016107852 || F1: 0.6508836015401324
Negative
Precision: 0.6443500614731195 || Recall: 0.5046614435155601 || F1: 0.5660145799072188
[[11594 11252]
 [ 6543 16304]]
Positive
Precision: 0.5916678763245754 || Recall: 0.713616667396157 || F1: 0.6469456183163701
Negative
Precision: 0.6392457407509511 || Recall: 0.507484898888208 || F1: 0.5657955737744919
[[11629 11217]
 [ 6403 16444]]
Positive
Precision: 0.5944832074039261 || Recall: 0.7197443865715412 || F1: 0.651144373168

# 6. XGBoost

## 6.1 Only one hot encoded columns

In [51]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(xg, X_train_one_hot, y_train)

[[11388 11459]
 [ 5219 17628]]
Positive
Precision: 0.606043937154055 || Recall: 0.7715673830262179 || F1: 0.6788616320714753
Negative
Precision: 0.6857349310531704 || Recall: 0.49844618549481334 || F1: 0.5772798702286206
[[11419 11428]
 [ 5298 17548]]
Positive
Precision: 0.6056046383213695 || Recall: 0.7680994484811345 || F1: 0.6772413260777277
Negative
Precision: 0.683077107136448 || Recall: 0.4998030375979341 || F1: 0.5772419371145486
[[11457 11390]
 [ 5219 17627]]
Positive
Precision: 0.6074714822345522 || Recall: 0.7715573842248096 || F1: 0.6797524246572701
Negative
Precision: 0.6870352602542577 || Recall: 0.501466275659824 || F1: 0.5797636819067379
[[11457 11389]
 [ 5225 17622]]
Positive
Precision: 0.6074247699148598 || Recall: 0.77130476649013 || F1: 0.6796251301631379
Negative
Precision: 0.6867881548974943 || Recall: 0.5014882255099361 || F1: 0.5796903460837887
[[11544 11302]
 [ 5173 17674]]
Positive
Precision: 0.6099530646051905 || Recall: 0.7735807764695584 || F1: 0.68209096347

## 6.2 Only one hot encoded columns with frequencies

In [52]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(xg, X_train_one_hot_more, y_train)

[[11583 11264]
 [ 5349 17498]]
Positive
Precision: 0.6083721577080871 || Recall: 0.765877358077647 || F1: 0.6780987812203298
Negative
Precision: 0.6840892983699504 || Recall: 0.5069812229176697 || F1: 0.5823675808843862
[[11590 11257]
 [ 5326 17520]]
Positive
Precision: 0.6088195433853425 || Recall: 0.7668738510023636 || F1: 0.6787672161633381
Negative
Precision: 0.6851501537006385 || Recall: 0.5072876088764389 || F1: 0.5829540024646027
[[11593 11254]
 [ 5223 17623]]
Positive
Precision: 0.6102780759774215 || Recall: 0.7713822988706994 || F1: 0.6814376582951491
Negative
Precision: 0.6894029495718363 || Recall: 0.5074189171444828 || F1: 0.5845750447520359
[[11601 11245]
 [ 5290 17557]]
Positive
Precision: 0.6095757239080619 || Recall: 0.7684597540158445 || F1: 0.6798582741195377
Negative
Precision: 0.6868154638564916 || Recall: 0.5077912982579007 || F1: 0.5838890706394544
[[11781 11065]
 [ 5314 17533]]
Positive
Precision: 0.6130848311070705 || Recall: 0.767409287871493 || F1: 0.681621148

## 6.3 Only with frequencies

In [53]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(xg, X_train_one_hot_frequency, y_train)

[[16109  6738]
 [12700 10147]]
Positive
Precision: 0.6009475866153391 || Recall: 0.4441283319473016 || F1: 0.5107721735628712
Negative
Precision: 0.5591655385469818 || Recall: 0.7050816299733007 || F1: 0.6237029580300449
[[16117  6730]
 [12793 10053]]
Positive
Precision: 0.5989989870702497 || Recall: 0.44003326621728095 || F1: 0.5073557243432839
Negative
Precision: 0.5574887582151504 || Recall: 0.7054317853547512 || F1: 0.6227949842533377
[[15857  6990]
 [12675 10171]]
Positive
Precision: 0.5926810791911893 || Recall: 0.44519828416352974 || F1: 0.5084610193216187
Negative
Precision: 0.5557619514930604 || Recall: 0.6940517354576093 || F1: 0.6172560773857023
[[16110  6736]
 [12987  9860]]
Positive
Precision: 0.5941190648349 || Recall: 0.4315665076377643 || F1: 0.49996197043835416
Negative
Precision: 0.5536653263223013 || Recall: 0.7051562636785433 || F1: 0.6202953237202318
[[15740  7106]
 [12396 10451]]
Positive
Precision: 0.5952611493991 || Recall: 0.4574342364424213 || F1: 0.5173250173

## 6.4 Scaled one hot encoded pids and frequencies

In [54]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(xg, X_train_one_hot_more_scaled, y_train)

[[11583 11264]
 [ 5349 17498]]
Positive
Precision: 0.6083721577080871 || Recall: 0.765877358077647 || F1: 0.6780987812203298
Negative
Precision: 0.6840892983699504 || Recall: 0.5069812229176697 || F1: 0.5823675808843862
[[11590 11257]
 [ 5326 17520]]
Positive
Precision: 0.6088195433853425 || Recall: 0.7668738510023636 || F1: 0.6787672161633381
Negative
Precision: 0.6851501537006385 || Recall: 0.5072876088764389 || F1: 0.5829540024646027
[[11595 11252]
 [ 5223 17623]]
Positive
Precision: 0.6103203463203464 || Recall: 0.7713822988706994 || F1: 0.6814640088165349
Negative
Precision: 0.6894398858366036 || Recall: 0.5075064559898455 || F1: 0.584646413714862
[[11601 11245]
 [ 5290 17557]]
Positive
Precision: 0.6095757239080619 || Recall: 0.7684597540158445 || F1: 0.6798582741195377
Negative
Precision: 0.6868154638564916 || Recall: 0.5077912982579007 || F1: 0.5838890706394544
[[11781 11065]
 [ 5314 17533]]
Positive
Precision: 0.6130848311070705 || Recall: 0.767409287871493 || F1: 0.6816211487

## 6.5 Frequencies normalized

In [55]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(xg, X_train_one_hot_frequency_scaled, y_train)

[[16109  6738]
 [12700 10147]]
Positive
Precision: 0.6009475866153391 || Recall: 0.4441283319473016 || F1: 0.5107721735628712
Negative
Precision: 0.5591655385469818 || Recall: 0.7050816299733007 || F1: 0.6237029580300449
[[16117  6730]
 [12793 10053]]
Positive
Precision: 0.5989989870702497 || Recall: 0.44003326621728095 || F1: 0.5073557243432839
Negative
Precision: 0.5574887582151504 || Recall: 0.7054317853547512 || F1: 0.6227949842533377
[[15857  6990]
 [12675 10171]]
Positive
Precision: 0.5926810791911893 || Recall: 0.44519828416352974 || F1: 0.5084610193216187
Negative
Precision: 0.5557619514930604 || Recall: 0.6940517354576093 || F1: 0.6172560773857023
[[16110  6736]
 [12988  9859]]
Positive
Precision: 0.59409460680928 || Recall: 0.43152273821508297 || F1: 0.4999239389483293
Negative
Precision: 0.5536462987146883 || Recall: 0.7051562636785433 || F1: 0.6202833821038041
[[15740  7106]
 [12397 10450]]
Positive
Precision: 0.5952380952380952 || Recall: 0.45739046701974 || F1: 0.51728832

## 6.6 Top 10 patterns

In [56]:
# fit xgboost
xg = xgb.XGBClassifier(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(xg, X_train_top_10, y_train)

[[10628 12219]
 [ 5407 17440]]
Positive
Precision: 0.5880171280218484 || Recall: 0.7633387315621307 || F1: 0.664305031805889
Negative
Precision: 0.6628001247271593 || Recall: 0.46518142425701403 || F1: 0.5466796975464225
[[10549 12298]
 [ 5326 17520]]
Positive
Precision: 0.5875645583204776 || Recall: 0.7668738510023636 || F1: 0.6653501443111044
Negative
Precision: 0.664503937007874 || Recall: 0.46172363986519016 || F1: 0.5448582201332577
[[10710 12137]
 [ 5379 17467]]
Positive
Precision: 0.5900216187001757 || Recall: 0.7645539700604045 || F1: 0.6660438512869399
Negative
Precision: 0.6656721983964199 || Recall: 0.4687705169168819 || F1: 0.5501335524964043
[[10697 12149]
 [ 5427 17420]]
Positive
Precision: 0.5891305083026143 || Recall: 0.7624633431085044 || F1: 0.6646825396825397
Negative
Precision: 0.6634209873480525 || Recall: 0.46822200822901167 || F1: 0.548986399794714
[[10741 12105]
 [ 5325 17522]]
Positive
Precision: 0.5914199885239815 || Recall: 0.7669278242219986 || F1: 0.6678354

## 6.7 Grid Search

In [23]:
# initialize parameters
learning_rate = [0.05, 0.15, 0.25]
max_depth = [3,8,12]
min_child_weight = [1,3,5]
gamma = [0, 0.2, 0.4]
colsample_bytree = [0.3, 0.5, 0.7]

parameters = []

# loop through all parameters
for rate in learning_rate:
    for depth in max_depth:
        for weight in min_child_weight:
            for gam in gamma:
                for bytree in colsample_bytree:
                    parameter_combination = [rate, depth, weight, gam, bytree]
                    
                    parameters.append(parameter_combination)

In [27]:
def get_cross_validated_two_class_confusion_matrix(model, X_train, y_train, folds=5):
    """Get cross validated confusion matrix for two class problem"""
    # k-fold
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=88)
    kf.get_n_splits(X_train.values)
    
    overall_precision = []
    overall_recall = []
    overall_f1 = []
    
    for train_index, test_index in kf.split(X_train.values, y_train):
        # print("TRAIN:", train_index, "TEST:", test_index) # for debugging
        X_tr, X_te = X_train.values[train_index], X_train.values[test_index]
        y_tr, y_te = y_train[train_index], y_train[test_index]
        
        # fit model
        model.fit(X_tr, y_tr)
        
        # predicted score
        y_predict = model.predict(X_te)
        f1 = f1_score(y_te, y_predict)
        precision = precision_score(y_te, y_predict)
        recall = recall_score(y_te, y_predict)
        
        print(confusion_matrix(y_te, y_predict))
        print('Precision: {} || Recall: {} || F1: {}'.format(precision, recall, f1))
        
        overall_precision.append(precision)
        overall_recall.append(recall)
        overall_f1.append(f1)
    
    print('---------------------------')
    print('Overall Precision: {} (+/- {}) || Overall Recall: {} (+/- {}) || Overall F1: {} (+/- {})'.format(
                                                                            np.mean(overall_precision), np.std(overall_precision, axis=0),
                                                                           np.mean(overall_recall), np.std(overall_recall, axis=0),
                                                                            np.mean(overall_f1), np.std(overall_f1, axis=0)))
    
    return np.mean(overall_f1)

In [30]:
# loop through parameter combination and train model
max_f1 = 0
# best parameter combination
best_parameters = None

for parameter in parameters:
    print(parameter)
    
    xg = xgb.XGBClassifier(random_state=88, learning_rate=parameter[0], 
                          max_depth=parameter[1], min_child_weight=parameter[2], gamma=parameter[3],
                          colsample_bytree=parameter[4])
    f1 = get_cross_validated_two_class_confusion_matrix(xg, X_train_one_hot, y_train)
    
    if f1 > max_f1:
        best_parameters=parameter
        max_f1=f1

[0.05, 3, 1, 0, 0.3]
[[11769 11078]
 [ 5877 16970]]
Precision: 0.605034227039361 || Recall: 0.7427671029019127 || F1: 0.6668631496217703
[[11883 10964]
 [ 5901 16945]]
Precision: 0.6071518148267584 || Recall: 0.7417053313490326 || F1: 0.6677174662594818
[[11733 11114]
 [ 5742 17104]]
Precision: 0.6061379261464314 || Recall: 0.7486649741749103 || F1: 0.6699044336518878
[[11780 11066]
 [ 5892 16955]]
Precision: 0.6050819028585703 || Recall: 0.742110561561693 || F1: 0.6666273492175827
[[11835 11011]
 [ 5758 17089]]
Precision: 0.6081494661921708 || Recall: 0.7479756642009892 || F1: 0.6708540247708403
---------------------------
Overall Precision: 0.6063110674126584 (+/- 0.0012047996534534592) || Overall Recall: 0.7446447268377075 (+/- 0.003028037103460308) || Overall F1: 0.6683932847043127 (+/- 0.0016885132455071055)
[0.05, 3, 1, 0, 0.5]
[[11583 11264]
 [ 5755 17092]]
Precision: 0.6027648469459727 || Recall: 0.7481069724690331 || F1: 0.6676171318086831
[[10996 11851]
 [ 5040 17806]]
Precis

[[11583 11264]
 [ 5755 17092]]
Precision: 0.6027648469459727 || Recall: 0.7481069724690331 || F1: 0.6676171318086831
[[10996 11851]
 [ 5040 17806]]
Precision: 0.6003978824560812 || Recall: 0.7793924538212379 || F1: 0.6782850503780736
[[11612 11235]
 [ 5650 17196]]
Precision: 0.6048327529809011 || Recall: 0.7526919373194433 || F1: 0.6707100649413967
[[11650 11196]
 [ 5805 17042]]
Precision: 0.6035129966711523 || Recall: 0.7459185013349674 || F1: 0.6672017226191641
[[10956 11890]
 [ 4828 18019]]
Precision: 0.6024607977531846 || Recall: 0.788681227294612 || F1: 0.68310713473349
---------------------------
Overall Precision: 0.6027938553614584 (+/- 0.0014509805476144273) || Overall Recall: 0.7629582184478587 (+/- 0.01759582490851951) || Overall F1: 0.6733842208961616 (+/- 0.0062799187042076625)
[0.05, 3, 3, 0, 0.7]
[[11596 11251]
 [ 5781 17066]]
Precision: 0.6026768372355829 || Recall: 0.746968967479319 || F1: 0.6671096864983191
[[10951 11896]
 [ 5016 17830]]
Precision: 0.5998116127295969 

[[11594 11253]
 [ 5785 17062]]
Precision: 0.6025781387956913 || Recall: 0.7467938897885937 || F1: 0.6669793987725265
[[10951 11896]
 [ 5016 17830]]
Precision: 0.5998116127295969 || Recall: 0.7804429659458986 || F1: 0.6783078444799513
[[10928 11919]
 [ 4893 17953]]
Precision: 0.6009975897161222 || Recall: 0.7858268405847851 || F1: 0.6810956409575478
[[11728 11118]
 [ 5873 16974]]
Precision: 0.604228961982059 || Recall: 0.742942180592638 || F1: 0.6664441783309449
[[11702 11144]
 [ 5673 17174]]
Precision: 0.6064693834310333 || Recall: 0.751696065128901 || F1: 0.6713182839831917
---------------------------
Overall Precision: 0.6028171373309006 (+/- 0.0023551306703821196) || Overall Recall: 0.7615403884081633 (+/- 0.01792987703923535) || Overall F1: 0.6728290693048324 (+/- 0.005926569575766732)
[0.05, 3, 5, 0.2, 0.3]
[[11770 11077]
 [ 5878 16969]]
Precision: 0.6050417171789203 || Recall: 0.7427233334792314 || F1: 0.6668500579647495
[[11883 10964]
 [ 5901 16945]]
Precision: 0.607151814826758

[[11746 11101]
 [ 5544 17303]]
Precision: 0.6091747641177299 || Recall: 0.7573423206547906 || F1: 0.6752258492517219
[[11824 11023]
 [ 5634 17212]]
Precision: 0.6095980166460068 || Recall: 0.7533922787358838 || F1: 0.6739100644075098
[[11781 11066]
 [ 5514 17332]]
Precision: 0.6103246707514614 || Recall: 0.7586448393591876 || F1: 0.676449925844977
[[11767 11079]
 [ 5527 17320]]
Precision: 0.6098806295996337 || Recall: 0.7580864008403729 || F1: 0.6759551965031417
[[11814 11032]
 [ 5460 17387]]
Precision: 0.6118090010204441 || Recall: 0.761018952160021 || F1: 0.6783053095618928
---------------------------
Overall Precision: 0.6101574164270552 (+/- 0.0009067359375303993) || Overall Recall: 0.7576969583500512 (+/- 0.0024795895309407826) || Overall F1: 0.6759692691138487 (+/- 0.0014478474381275292)
[0.05, 8, 1, 0.2, 0.5]
[[11324 11523]
 [ 5129 17718]]
Precision: 0.6059300297527445 || Recall: 0.7755066310675363 || F1: 0.6803102442021196
[[11368 11479]
 [ 5190 17656]]
Precision: 0.60600652136

[[11328 11519]
 [ 5115 17732]]
Precision: 0.6062014973847047 || Recall: 0.7761194029850746 || F1: 0.6807171100618066
[[11376 11471]
 [ 5189 17657]]
Precision: 0.6061864872287833 || Recall: 0.7728705243806355 || F1: 0.6794551121714705
[[11409 11438]
 [ 5102 17744]]
Precision: 0.6080460557878145 || Recall: 0.7766786308325309 || F1: 0.6820942569385715
[[11325 11521]
 [ 5072 17775]]
Precision: 0.6067381212452212 || Recall: 0.7780014881603712 || F1: 0.6817789540302629
[[11424 11422]
 [ 5040 17807]]
Precision: 0.6092237161722946 || Recall: 0.7794021096861733 || F1: 0.6838850910208156
---------------------------
Overall Precision: 0.6072791755637637 (+/- 0.0011846890973501493) || Overall Recall: 0.7766144312089571 (+/- 0.002189113571839554) || Overall F1: 0.6815861048445854 (+/- 0.0014749256593403566)
[0.05, 8, 3, 0.2, 0.7]
[[11287 11560]
 [ 5096 17751]]
Precision: 0.6056088158029409 || Recall: 0.7769510220160196 || F1: 0.6806626020936386
[[11235 11612]
 [ 5095 17751]]
Precision: 0.6045363212

[[11238 11609]
 [ 5031 17816]]
Precision: 0.6054715378079865 || Recall: 0.7797960344903051 || F1: 0.6816651362105908
[[11237 11610]
 [ 5109 17737]]
Precision: 0.6043888642791426 || Recall: 0.7763722314628382 || F1: 0.6796696875059874
[[11263 11584]
 [ 4987 17859]]
Precision: 0.6065618313351221 || Recall: 0.781712334763197 || F1: 0.6830882212319991
[[11211 11635]
 [ 5015 17832]]
Precision: 0.6051515254352327 || Recall: 0.7804963452532061 || F1: 0.6817295561417593
[[11344 11502]
 [ 4985 17862]]
Precision: 0.6082958724969351 || Recall: 0.7818094279336456 || F1: 0.6842236310356057
---------------------------
Overall Precision: 0.6059739262708839 (+/- 0.001354859805369198) || Overall Recall: 0.7800372747806386 (+/- 0.001982313690650313) || Overall F1: 0.6820752464251885 (+/- 0.0015307566931052645)
[0.05, 8, 5, 0.4, 0.3]
[[11765 11082]
 [ 5569 17278]]
Precision: 0.6092383638928067 || Recall: 0.7562480850877576 || F1: 0.6748296131388286
[[11826 11021]
 [ 5631 17215]]
Precision: 0.609682674599

[[11849 10998]
 [ 5691 17156]]
Precision: 0.6093627903672657 || Recall: 0.7509082155206372 || F1: 0.67277112213486
[[11874 10973]
 [ 5680 17166]]
Precision: 0.610043000817371 || Recall: 0.7513787971636172 || F1: 0.6733745219182112
[[11904 10943]
 [ 5647 17199]]
Precision: 0.611150593419089 || Recall: 0.7528232513350258 || F1: 0.6746293245469522
[[11802 11044]
 [ 5576 17271]]
Precision: 0.6099593854847254 || Recall: 0.7559416991289885 || F1: 0.6751495250381142
[[11850 10996]
 [ 5490 17357]]
Precision: 0.6121750784749409 || Recall: 0.7597058694795815 || F1: 0.6780078125
---------------------------
Overall Precision: 0.6105381697126784 (+/- 0.0010016219960818883) || Overall Recall: 0.75415156652557 (+/- 0.00328715235314628) || Overall F1: 0.6747864612276275 (+/- 0.0018214705904420355)
[0.05, 12, 1, 0.4, 0.5]
[[11818 11029]
 [ 5592 17255]]
Precision: 0.6100622259934946 || Recall: 0.7552413883660875 || F1: 0.6749330151962606
[[11805 11042]
 [ 5623 17223]]
Precision: 0.6093401733592783 || Re

[[11831 11016]
 [ 5605 17242]]
Precision: 0.6101634935239578 || Recall: 0.7546723858712303 || F1: 0.6747676352607377
[[11797 11050]
 [ 5619 17227]]
Precision: 0.609223043462885 || Recall: 0.7540488488137967 || F1: 0.6739432349431762
[[11819 11028]
 [ 5541 17305]]
Precision: 0.6107718914340169 || Recall: 0.7574630132189443 || F1: 0.6762539322769104
[[11767 11079]
 [ 5530 17317]]
Precision: 0.6098394140019721 || Recall: 0.757955092572329 || F1: 0.6758776808539702
[[11870 10976]
 [ 5451 17396]]
Precision: 0.613139715212181 || Recall: 0.7614128769641528 || F1: 0.6792791737441185
---------------------------
Overall Precision: 0.6106275115270026 (+/- 0.001352106051506952) || Overall Recall: 0.7571104434880905 (+/- 0.0026331030439143395) || Overall F1: 0.6760243314157826 (+/- 0.0018209046806154947)
[0.05, 12, 3, 0.4, 0.7]
[[11386 11461]
 [ 5212 17635]]
Precision: 0.6060970580148474 || Recall: 0.771873768984987 || F1: 0.679013534066188
[[11363 11484]
 [ 5210 17636]]
Precision: 0.60563186813186

[[11369 11478]
 [ 5186 17661]]
Precision: 0.6060949243282199 || Recall: 0.7730117739747012 || F1: 0.6794521601969761
[[11339 11508]
 [ 5170 17676]]
Precision: 0.6056743421052632 || Recall: 0.7737021798126587 || F1: 0.6794541610609264
[[11365 11482]
 [ 5089 17757]]
Precision: 0.6073053113991587 || Recall: 0.7772476582333888 || F1: 0.6818469808966114
[[11341 11505]
 [ 5111 17736]]
Precision: 0.6065456037755207 || Recall: 0.7762944806757999 || F1: 0.6810013822761481
[[11445 11401]
 [ 5056 17791]]
Precision: 0.6094477939161415 || Recall: 0.7787017989232722 || F1: 0.6837564134591364
---------------------------
Overall Precision: 0.6070135951048609 (+/- 0.0013316901085704486) || Overall Recall: 0.7757915783239641 (+/- 0.0021417451656990175) || Overall F1: 0.6811022195779597 (+/- 0.001615448889839014)
[0.15, 3, 1, 0, 0.3]
[[11258 11589]
 [ 5095 17752]]
Precision: 0.6050236869909001 || Recall: 0.7769947914387009 || F1: 0.6803096497279068
[[11297 11550]
 [ 5153 17693]]
Precision: 0.605033683274

[[11258 11589]
 [ 5097 17750]]
Precision: 0.6049967619891612 || Recall: 0.7769072525933383 || F1: 0.680259073314682
[[11283 11564]
 [ 5142 17704]]
Precision: 0.6048927155938226 || Recall: 0.7749277772914296 || F1: 0.6794335495260391
[[11294 11553]
 [ 5044 17802]]
Precision: 0.6064384261624937 || Recall: 0.7792173684671277 || F1: 0.6820558993122736
[[11257 11589]
 [ 5084 17763]]
Precision: 0.6051717089125103 || Recall: 0.7774762550881954 || F1: 0.680587750723194
[[11390 11456]
 [ 5099 17748]]
Precision: 0.6077249691823038 || Recall: 0.7768197137479756 || F1: 0.6819465524197422
---------------------------
Overall Precision: 0.6058449163680584 (+/- 0.0010923472802681293) || Overall Recall: 0.7770696734376134 (+/- 0.0013750986914782906) || Overall F1: 0.6808565650591862 (+/- 0.0010080382766903438)
[0.15, 3, 3, 0, 0.5]
[[11247 11600]
 [ 5120 17727]]
Precision: 0.6044600538752686 || Recall: 0.7759005558716681 || F1: 0.6795338674435544
[[11319 11528]
 [ 5185 17661]]
Precision: 0.6050566994415

[[11264 11583]
 [ 5135 17712]]
Precision: 0.6046082949308755 || Recall: 0.7752440145314483 || F1: 0.6793755513789267
[[11285 11562]
 [ 5145 17701]]
Precision: 0.6048935515839114 || Recall: 0.774796463275847 || F1: 0.6793835997620373
[[11342 11505]
 [ 5098 17748]]
Precision: 0.6067070044098041 || Recall: 0.776853716186641 || F1: 0.6813182594675521
[[11249 11597]
 [ 5058 17789]]
Precision: 0.60535629211189 || Recall: 0.7786142600779096 || F1: 0.6811402753048839
[[11333 11513]
 [ 4980 17867]]
Precision: 0.6081347855684139 || Recall: 0.7820282750470521 || F1: 0.6842054875830509
---------------------------
Overall Precision: 0.6059399857209791 (+/- 0.0013125994844022599) || Overall Recall: 0.7775073458237796 (+/- 0.002629176473267361) || Overall F1: 0.6810846346992901 (+/- 0.0017670248886256972)
[0.15, 3, 5, 0, 0.7]
[[11207 11640]
 [ 5068 17779]]
Precision: 0.6043373330160781 || Recall: 0.7781765658510964 || F1: 0.6803275551984082
[[11215 11632]
 [ 5092 17754]]
Precision: 0.6041652487579119

[[11432 11415]
 [ 5239 17608]]
Precision: 0.6066912448747545 || Recall: 0.7706919945725916 || F1: 0.6789280894544053
[[11480 11367]
 [ 5294 17552]]
Precision: 0.6069366160655625 || Recall: 0.7682745338352447 || F1: 0.6781416014681736
[[11450 11397]
 [ 5178 17668]]
Precision: 0.6078788921383107 || Recall: 0.7733520091044385 || F1: 0.680703511779777
[[11387 11459]
 [ 5150 17697]]
Precision: 0.6069762656057073 || Recall: 0.7745874731912286 || F1: 0.680614579928081
[[11494 11352]
 [ 5096 17751]]
Precision: 0.609937119884548 || Recall: 0.7769510220160196 || F1: 0.6833878729547642
---------------------------
Overall Precision: 0.6076840277137766 (+/- 0.001196625723683242) || Overall Recall: 0.7727714065439046 (+/- 0.0030215956446849154) || Overall F1: 0.6803551311170402 (+/- 0.0018067853906487267)
[0.15, 8, 1, 0.2, 0.3]
[[11775 11072]
 [ 5576 17271]]
Precision: 0.6093568076773807 || Recall: 0.7559416991289885 || F1: 0.6747802305137722
[[11511 11336]
 [ 5328 17518]]
Precision: 0.6071255285229

[[11780 11067]
 [ 5597 17250]]
Precision: 0.6091747007098209 || Recall: 0.7550225412526809 || F1: 0.6743022437651474
[[11525 11322]
 [ 5338 17508]]
Precision: 0.6072840790842872 || Recall: 0.7663485949400333 || F1: 0.6776066258998373
[[11813 11034]
 [ 5544 17302]]
Precision: 0.6106013551665725 || Recall: 0.7573316992033616 || F1: 0.6760970653745457
[[11480 11366]
 [ 5204 17643]]
Precision: 0.6081905615498638 || Recall: 0.7722239243664376 || F1: 0.6804612773835236
[[11552 11294]
 [ 5126 17721]]
Precision: 0.6107530587627089 || Recall: 0.7756379393355801 || F1: 0.6833905364235857
---------------------------
Overall Precision: 0.6092007510546507 (+/- 0.0013465618438715562) || Overall Recall: 0.7653129398196187 (+/- 0.008062659714764626) || Overall F1: 0.6783715497693279 (+/- 0.003221019331858593)
[0.15, 8, 3, 0.2, 0.5]
[[11440 11407]
 [ 5239 17608]]
Precision: 0.6068585214544201 || Recall: 0.7706919945725916 || F1: 0.6790328178627898
[[11470 11377]
 [ 5290 17556]]
Precision: 0.60678118411

[[11451 11396]
 [ 5232 17615]]
Precision: 0.6071834821274689 || Recall: 0.7709983805313608 || F1: 0.6793551621736279
[[11453 11394]
 [ 5274 17572]]
Precision: 0.6066422702478769 || Recall: 0.7691499606057953 || F1: 0.6782984636763684
[[11456 11391]
 [ 5171 17675]]
Precision: 0.6080988096057249 || Recall: 0.7736584084741311 || F1: 0.6809600862998921
[[11428 11418]
 [ 5174 17673]]
Precision: 0.6075074765391358 || Recall: 0.773537007046877 || F1: 0.6805421849127806
[[11530 11316]
 [ 5100 17747]]
Precision: 0.6106389567491312 || Recall: 0.7767759443252944 || F1: 0.6837603544596418
---------------------------
Overall Precision: 0.6080141990538676 (+/- 0.0013946726405704999) || Overall Recall: 0.7728239401966916 (+/- 0.002594375932565856) || Overall F1: 0.6805832503044622 (+/- 0.0018420712765643715)
[0.15, 8, 5, 0.2, 0.7]
[[11419 11428]
 [ 5230 17617]]
Precision: 0.6065415734205543 || Recall: 0.7710859193767234 || F1: 0.6789871271101519
[[11480 11367]
 [ 5293 17553]]
Precision: 0.60695020746

[[11417 11430]
 [ 5280 17567]]
Precision: 0.6058212918577784 || Recall: 0.7688974482426577 || F1: 0.6776869068744696
[[11457 11390]
 [ 5341 17505]]
Precision: 0.6058141546980447 || Recall: 0.7662172809244506 || F1: 0.6766394155505305
[[11463 11384]
 [ 5199 17647]]
Precision: 0.6078674520340326 || Recall: 0.7724328109953602 || F1: 0.6803400350829848
[[11372 11474]
 [ 5215 17632]]
Precision: 0.6057857486428915 || Recall: 0.7717424607169432 || F1: 0.6787673474101592
[[11499 11347]
 [ 5157 17690]]
Precision: 0.6092227158453009 || Recall: 0.7742810872324594 || F1: 0.6819057898388714
---------------------------
Overall Precision: 0.6069022726156097 (+/- 0.0014082026753950325) || Overall Recall: 0.7707142176223741 (+/- 0.0028374811542195844) || Overall F1: 0.6790678989514032 (+/- 0.0018744042787116084)
[0.15, 12, 1, 0.4, 0.3]
[[11491 11356]
 [ 5306 17541]]
Precision: 0.6070180295532408 || Recall: 0.7677594432529435 || F1: 0.677991651205937
[[11540 11307]
 [ 5347 17499]]
Precision: 0.607477608

[[11514 11333]
 [ 5319 17528]]
Precision: 0.6073247635217075 || Recall: 0.7671904407580864 || F1: 0.677960857120755
[[11870 10977]
 [ 5684 17162]]
Precision: 0.6099008493549878 || Recall: 0.7512037118095072 || F1: 0.6732176130234383
[[11794 11053]
 [ 5522 17324]]
Precision: 0.6104944144906086 || Recall: 0.7582946686509674 || F1: 0.6764148917478477
[[11487 11359]
 [ 5220 17627]]
Precision: 0.608121161940247 || Recall: 0.7715236136035366 || F1: 0.6801458530279937
[[11587 11259]
 [ 5161 17686]]
Precision: 0.6110209017101399 || Recall: 0.7741060095417341 || F1: 0.6829626197096076
---------------------------
Overall Precision: 0.6093724182035382 (+/- 0.0014152134082185993) || Overall Recall: 0.7644636888727663 (+/- 0.008533883795161089) || Overall F1: 0.6781403669259285 (+/- 0.0033020929956645998)
[0.15, 12, 3, 0.4, 0.5]
[[11470 11377]
 [ 5285 17562]]
Precision: 0.6068627112201528 || Recall: 0.7686786011292511 || F1: 0.678252809639671
[[11473 11374]
 [ 5341 17505]]
Precision: 0.606149797430

[[11479 11368]
 [ 5278 17569]]
Precision: 0.6071465597677713 || Recall: 0.7689849870880203 || F1: 0.6785493588753282
[[11480 11367]
 [ 5330 17516]]
Precision: 0.6064466987501298 || Recall: 0.7666987656482536 || F1: 0.6772216744959307
[[11757 11090]
 [ 5536 17310]]
Precision: 0.6095070422535211 || Recall: 0.7576818699115819 || F1: 0.6755649221402646
[[11483 11363]
 [ 5242 17605]]
Precision: 0.6077395747031207 || Recall: 0.7705606863045477 || F1: 0.6795329537778636
[[11588 11258]
 [ 5183 17664]]
Precision: 0.610746144803264 || Recall: 0.7731430822427452 || F1: 0.6824161177538682
---------------------------
Overall Precision: 0.6083172040555614 (+/- 0.0015822971849913847) || Overall Recall: 0.7674138782390296 (+/- 0.0052993949585924335) || Overall F1: 0.6786570054086511 (+/- 0.0023035169942187306)
[0.15, 12, 5, 0.4, 0.7]
[[11432 11415]
 [ 5296 17551]]
Precision: 0.6059172823310087 || Recall: 0.7681971374797566 || F1: 0.6774747650203616
[[11437 11410]
 [ 5261 17585]]
Precision: 0.606483876

[[11251 11596]
 [ 5085 17762]]
Precision: 0.605013965528987 || Recall: 0.777432485665514 || F1: 0.6804712192318743
[[11304 11543]
 [ 5160 17686]]
Precision: 0.6050839919258271 || Recall: 0.774139893197934 || F1: 0.6792510801728276
[[11349 11498]
 [ 5061 17785]]
Precision: 0.6073489738073284 || Recall: 0.7784732557121596 || F1: 0.6823457192733411
[[11316 11530]
 [ 5074 17773]]
Precision: 0.6065249291881377 || Recall: 0.7779139493150086 || F1: 0.6816107382550336
[[11385 11461]
 [ 5079 17768]]
Precision: 0.6078894248862431 || Recall: 0.777695102201602 || F1: 0.6823872801290422
---------------------------
Overall Precision: 0.6063722570673047 (+/- 0.0011647888680234154) || Overall Recall: 0.7771309372184436 (+/- 0.0015343066713482195) || Overall F1: 0.6812132074124237 (+/- 0.001201628095068722)
[0.25, 3, 3, 0, 0.3]
[[11363 11484]
 [ 5177 17670]]
Precision: 0.6060917884338342 || Recall: 0.7734056987788331 || F1: 0.6796023153400896
[[11351 11496]
 [ 5187 17659]]
Precision: 0.6056937060538501

[[11671 11176]
 [ 5509 17338]]
Precision: 0.6080521848916322 || Recall: 0.7588742504486365 || F1: 0.6751426179396819
[[11353 11494]
 [ 5206 17640]]
Precision: 0.6054781355117732 || Recall: 0.7721264116256675 || F1: 0.67872258560985
[[11379 11468]
 [ 5122 17724]]
Precision: 0.6071526445601535 || Recall: 0.7758032040619802 || F1: 0.6811945117029863
[[11360 11486]
 [ 5151 17696]]
Precision: 0.6064012062230142 || Recall: 0.7745437037685473 || F1: 0.6802360222183782
[[11449 11397]
 [ 5105 17742]]
Precision: 0.6088747040049418 || Recall: 0.7765570972118878 || F1: 0.6825683837956372
---------------------------
Overall Precision: 0.6071917750383029 (+/- 0.0011946251039862247) || Overall Recall: 0.7715809334233439 (+/- 0.006528784300367884) || Overall F1: 0.6795728242533067 (+/- 0.00254529811334857)
[0.25, 3, 5, 0, 0.5]
[[11336 11511]
 [ 5152 17695]]
Precision: 0.6058686571252483 || Recall: 0.774499934345866 || F1: 0.6798839644208787
[[11369 11478]
 [ 5198 17648]]
Precision: 0.6059191100734739 

[[11432 11415]
 [ 5252 17595]]
Precision: 0.6065149948293692 || Recall: 0.7701229920777345 || F1: 0.6785969107352913
[[11457 11390]
 [ 5287 17559]]
Precision: 0.6065494490310546 || Recall: 0.7685809332049374 || F1: 0.6780191138140746
[[11459 11388]
 [ 5227 17619]]
Precision: 0.6074051091115937 || Recall: 0.7712072135165894 || F1: 0.679574952268914
[[11465 11381]
 [ 5222 17625]]
Precision: 0.6076329035371992 || Recall: 0.7714360747581739 || F1: 0.6798063757159664
[[11488 11358]
 [ 5104 17743]]
Precision: 0.6097041338785609 || Recall: 0.7766008666345691 || F1: 0.6831061831061831
---------------------------
Overall Precision: 0.6075613180775555 (+/- 0.0011610216693136773) || Overall Recall: 0.7715896160384008 (+/- 0.002701318099942927) || Overall F1: 0.679820707128086 (+/- 0.0017662598519436988)
[0.25, 8, 1, 0, 0.7]
[[11458 11389]
 [ 5278 17569]]
Precision: 0.6067062642447683 || Recall: 0.7689849870880203 || F1: 0.6782742978476981
[[11453 11394]
 [ 5326 17520]]
Precision: 0.60593484125337

[[11484 11363]
 [ 5285 17562]]
Precision: 0.6071564390665515 || Recall: 0.7686786011292511 || F1: 0.6784362203507688
[[11482 11365]
 [ 5336 17510]]
Precision: 0.6064069264069264 || Recall: 0.7664361376170883 || F1: 0.6770944103942307
[[11455 11392]
 [ 5204 17642]]
Precision: 0.6076324309430323 || Recall: 0.7722139543027225 || F1: 0.6801079414032382
[[11435 11411]
 [ 5215 17632]]
Precision: 0.6070998175119651 || Recall: 0.7717424607169432 || F1: 0.679591443438042
[[11487 11359]
 [ 5112 17735]]
Precision: 0.6095758575651337 || Recall: 0.7762507112531186 || F1: 0.6828902023449683
---------------------------
Overall Precision: 0.6075742942987218 (+/- 0.001074440727995636) || Overall Recall: 0.7710643730038248 (+/- 0.0033404165841431368) || Overall F1: 0.6796240435862496 (+/- 0.0019345790771646673)
[0.25, 8, 3, 0.2, 0.3]
[[11472 11375]
 [ 5290 17557]]
Precision: 0.6068367205862021 || Recall: 0.7684597540158445 || F1: 0.6781513741091948
[[11517 11330]
 [ 5335 17511]]
Precision: 0.60715647862

[[11505 11342]
 [ 5283 17564]]
Precision: 0.6076247145921262 || Recall: 0.7687661399746137 || F1: 0.6787625838115665
[[11526 11321]
 [ 5353 17493]]
Precision: 0.6071007149302422 || Recall: 0.7656920248621203 || F1: 0.6772357723577236
[[11466 11381]
 [ 5216 17630]]
Precision: 0.6077005273861639 || Recall: 0.7716886982403922 || F1: 0.6799467767128835
[[11455 11391]
 [ 5203 17644]]
Precision: 0.6076803857413466 || Recall: 0.772267693789119 || F1: 0.6801588219420994
[[11539 11307]
 [ 5161 17686]]
Precision: 0.6100093125926948 || Recall: 0.7741060095417341 || F1: 0.6823302469135802
---------------------------
Overall Precision: 0.6080231310485147 (+/- 0.0010174528748752379) || Overall Recall: 0.7705041132815958 (+/- 0.0029551264126109196) || Overall F1: 0.6796868403475707 (+/- 0.0016816608604595573)
[0.25, 8, 5, 0.2, 0.5]
[[11441 11406]
 [ 5269 17578]]
Precision: 0.6064725365719017 || Recall: 0.7693789118921521 || F1: 0.6782813374235496
[[11479 11368]
 [ 5287 17559]]
Precision: 0.6070107512

[[11456 11391]
 [ 5339 17508]]
Precision: 0.6058341119069864 || Recall: 0.7663150523044601 || F1: 0.6766899856993778
[[11450 11397]
 [ 5335 17511]]
Precision: 0.6057492735574928 || Recall: 0.7664799089556159 || F1: 0.6767013177725394
[[11442 11405]
 [ 5206 17640]]
Precision: 0.6073334480977793 || Recall: 0.7721264116256675 || F1: 0.6798866855524079
[[11460 11386]
 [ 5308 17539]]
Precision: 0.6063612791702679 || Recall: 0.7676719044075808 || F1: 0.6775477091864328
[[11505 11341]
 [ 5234 17613]]
Precision: 0.6083097326794226 || Recall: 0.7709108416859982 || F1: 0.6800254821335496
---------------------------
Overall Precision: 0.6067175690823898 (+/- 0.0009756359143759151) || Overall Recall: 0.7687008237958646 (+/- 0.002379154240431938) || Overall F1: 0.6781702360688615 (+/- 0.0014916124581861464)
[0.25, 12, 1, 0.2, 0.7]
[[11411 11436]
 [ 5337 17510]]
Precision: 0.6049195052857044 || Recall: 0.7664025911498228 || F1: 0.6761531481088178
[[11404 11443]
 [ 5339 17507]]
Precision: 0.604732297

[[11421 11426]
 [ 5295 17552]]
Precision: 0.6057008765270205 || Recall: 0.768240906902438 || F1: 0.6773564881813797
[[11412 11435]
 [ 5351 17495]]
Precision: 0.6047355686138957 || Recall: 0.7657795675391753 || F1: 0.6757957354758962
[[11445 11402]
 [ 5274 17572]]
Precision: 0.6064747704838821 || Recall: 0.7691499606057953 || F1: 0.6781937475878039
[[11363 11483]
 [ 5239 17608]]
Precision: 0.6052731085215358 || Recall: 0.7706919945725916 || F1: 0.6780392005853133
[[11466 11380]
 [ 5165 17682]]
Precision: 0.6084233707246576 || Recall: 0.7739309318510089 || F1: 0.6812691440790614
---------------------------
Overall Precision: 0.6061215389741983 (+/- 0.0012838151637960963) || Overall Recall: 0.7695586722942018 (+/- 0.0027050418348047896) || Overall F1: 0.678130863181891 (+/- 0.001784020264144109)
[0.25, 12, 3, 0.4, 0.3]
[[11484 11363]
 [ 5319 17528]]
Precision: 0.6066941261984701 || Recall: 0.7671904407580864 || F1: 0.6775677451776257
[[11499 11348]
 [ 5339 17507]]
Precision: 0.60672327153

[[11471 11376]
 [ 5301 17546]]
Precision: 0.6066662056565936 || Recall: 0.76797829036635 || F1: 0.6778574050107207
[[11520 11327]
 [ 5368 17478]]
Precision: 0.6067696580454782 || Recall: 0.7650354547842073 || F1: 0.676772956961143
[[11475 11372]
 [ 5245 17601]]
Precision: 0.6074966347979153 || Recall: 0.7704193294230938 || F1: 0.6793261159034331
[[11476 11370]
 [ 5232 17615]]
Precision: 0.6077281352423668 || Recall: 0.7709983805313608 || F1: 0.6796959407315945
[[11553 11293]
 [ 5157 17690]]
Precision: 0.6103577959493496 || Recall: 0.7742810872324594 || F1: 0.6826162454177117
---------------------------
Overall Precision: 0.6078036859383407 (+/- 0.0013406372614108279) || Overall Recall: 0.7697425084674941 (+/- 0.0030952485631458256) || Overall F1: 0.6792537328049206 (+/- 0.0019804675372481254)
[0.25, 12, 5, 0.4, 0.5]
[[11474 11373]
 [ 5324 17523]]
Precision: 0.6064161129568106 || Recall: 0.7669715936446798 || F1: 0.6773090079817561
[[11490 11357]
 [ 5364 17482]]
Precision: 0.60619300253

In [31]:
# get max f1 
max_f1

0.6820951438460228

In [32]:
best_parameters

[0.05, 8, 3, 0.2, 0.7]

In [21]:
# test model on testing set
xg = xgb.XGBClassifier(random_state=88)
xg.fit(X_train_one_hot, y_train)
y_predict = xg.predict(X_test_one_hot)

In [22]:
confusion_matrix(y_test, y_predict)

array([[172772, 169336],
       [  1386,   4627]])

In [23]:
f1_score(y_test,y_predict)

0.05141796684002311

In [24]:
recall_score(y_test,y_predict)

0.7694994179278231

# 7. Support Vector Machines

## 7.1 Only one hot encoded columns

In [53]:
# fit SVM
svm = SVC(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(svm, X_train_one_hot, y_train)

[[11322 11525]
 [ 5188 17659]]
Precision: 0.6050918311403509 || Recall: 0.7729242351293386 || F1: 0.6787876458265265
[[11239 11608]
 [ 5148 17698]]
Precision: 0.6039036374803795 || Recall: 0.7746651492602644 || F1: 0.6787083908574936
[[11292 11555]
 [ 5029 17817]]
Precision: 0.6065981206591311 || Recall: 0.7798739385450407 || F1: 0.6824083649316328
[[11263 11583]
 [ 5100 17747]]
Precision: 0.6050801227412206 || Recall: 0.7767759443252944 || F1: 0.6802614178661096
[[11346 11500]
 [ 5048 17799]]
Precision: 0.6074951363527765 || Recall: 0.7790519543047227 || F1: 0.6826602232194223
---------------------------
Overall Precision: 0.6056337696747717 (+/- 0.0012639808893503796) || Overall Recall: 0.7766582443129322 (+/- 0.002607667259611701) || Overall F1: 0.6805652085402369 (+/- 0.0017021223146974187)


## 7.2 Only one hot encoded columns with frequencies

In [None]:
# fit SVM
svm = SVC(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(svm, X_train_one_hot_more, y_train)

[[21481  1366]
 [19945  2902]]
Precision: 0.6799437675726335 || Recall: 0.12701886462117565 || F1: 0.21405126313848424
[[21490  1357]
 [19946  2900]]
Precision: 0.6812309137890533 || Recall: 0.1269368817298433 || F1: 0.21399845035604914
[[21436  1411]
 [19869  2977]]
Precision: 0.6784412032816773 || Recall: 0.13030727479646328 || F1: 0.21862377909965486


## 7.3 Only with frequencies

In [None]:
# fit SVM
svm = SVC(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(svm, X_train_one_hot_frequency, y_train)

## 7.4 Scaled one hot encoded pids and frequencies

In [None]:
# fit SVM
svm = SVC(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(svm, X_train_one_hot_more_scaled, y_train)

## 7.5 Frequencies normalized

In [None]:
# fit SVM
svm = SVC(random_state=88)
util_ml.get_cross_validated_two_class_confusion_matrix(svm, X_train_one_hot_frequency_scaled, y_train)