In [8]:
import pickle

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from xgboost import XGBClassifier

# Loading the data

In [9]:
# Read the pickled training split from disk.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open("data/express/train_data", mode="rb") as train_file:
    train_data = pickle.load(train_file)

# The loaded object is indexed positionally: element 0 -> features, element 1 -> targets.
train_features, train_targets = train_data[0], train_data[1]

In [10]:
# Read the pickled validation split from disk.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open("data/express/val_data", mode="rb") as val_file:
    val_data = pickle.load(val_file)

# The loaded object is indexed positionally: element 0 -> features, element 1 -> targets.
val_features, val_targets = val_data[0], val_data[1]

In [11]:
# Read the pickled test split from disk.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
# NOTE(review): this split is read from "data/randomized" whereas the train and
# validation splits are read from "data/express" -- confirm this is intentional.
with open("data/randomized/test_data", mode="rb") as test_file:
    test_data = pickle.load(test_file)

# The loaded object is indexed positionally: element 0 -> features, element 1 -> targets.
test_features, test_targets = test_data[0], test_data[1]

In [12]:
def reshape_to_train(unshaped: list) -> np.ndarray:
    """Flatten every sample and stack the results into a 2-D feature matrix.

    Args:
        unshaped: Sequence of samples. Each sample is anything ``np.array``
            accepts (nested lists, arrays, ...); all samples must contain the
            same number of elements.

    Returns:
        Array of shape ``(n_samples, n_values_per_sample)`` where row ``k`` is
        sample ``k`` flattened in C order. An empty input yields an empty
        array of shape ``(0, 0)``.

    Raises:
        ValueError: If the samples do not all flatten to the same length.
    """
    # Without samples there is no feature width to infer; return an explicit
    # empty matrix instead of failing on a missing second dimension.
    if len(unshaped) == 0:
        return np.empty((0, 0))

    # Flatten each sample straight to 1-D so that stacking gives the 2-D matrix
    # directly (no intermediate column vectors and second reshape needed).
    shaped = np.array([np.array(sample).reshape(-1) for sample in unshaped])

    if shaped.ndim != 2:
        raise ValueError("all samples must flatten to the same number of values")

    return shaped

In [13]:
# Convert each split's raw feature list into a 2-D matrix (one row per sample).
shaped_train_features, shaped_val_features, shaped_test_features = (
    reshape_to_train(unshaped=split_features)
    for split_features in (train_features, val_features, test_features)
)

# Optimizing the Model over the validation data

In [14]:
# Candidate values for each tuned hyper-parameter.
estimatores = [2 ** exponent for exponent in range(5, 11)]  # 32, 64, ..., 1024 trees
learning_rates = [tenths / 10 for tenths in range(2, 8)]    # 0.2, 0.3, ..., 0.7
grows = ["depthwise", "lossguide"]

# Full Cartesian grid, keyed by a running integer id. The iteration order is
# estimators (outermost) -> learning rate -> grow policy (innermost).
hps = {
    combo_id: {"estimators": n_trees, "lr": rate, "gp": policy}
    for combo_id, (n_trees, rate, policy) in enumerate(
        (n_trees, rate, policy)
        for n_trees in estimatores
        for rate in learning_rates
        for policy in grows
    )
}

In [19]:
# Grid search on the validation split: fit one XGBoost classifier per
# hyper-parameter combination in `hps` and record its accuracy and macro F1
# (both as percentages rounded to two decimals).
#
# The results frame is pre-allocated (NaN-filled, float dtype) and indexed by
# the keys of `hps`, so partial results stay inspectable if the long-running
# loop is interrupted and the metrics sort numerically afterwards.
df = pd.DataFrame(columns=["accuracy", "F1-score"], index=list(hps), dtype=float)

for index, hp in hps.items():
    print(index)
    candidate_model = XGBClassifier(
        objective="multi:softprob",
        learning_rate=hp["lr"],
        n_estimators=hp["estimators"],
        grow_policy=hp["gp"],
    )

    candidate_model.fit(X=shaped_train_features, y=train_targets)

    val_predictions = candidate_model.predict(shaped_val_features)

    acc = accuracy_score(y_true=val_targets, y_pred=val_predictions)
    acc = round(acc*100, 2)

    f1score = f1_score(y_true=val_targets, y_pred=val_predictions, average="macro")
    f1score = round(f1score*100, 2)
    print(acc)
    print(f1score)

    # Write through a single .loc call. Chained indexing such as
    # df["accuracy"].loc[index] = ... assigns to a temporary object, which
    # raises SettingWithCopy warnings and does not update `df` at all once
    # pandas Copy-on-Write is enabled.
    df.loc[index, "accuracy"] = acc
    df.loc[index, "F1-score"] = f1score

print(df)

print(hps)

0
97.93
89.1
1
97.74
88.95
2
97.93
89.12
3
97.76
88.98
4
97.59
88.8
5
98.03
89.19
6
97.9
89.11
7
97.74
88.93
8
97.93
89.12
9
97.93
89.12
10
97.93
89.08
11
97.63
88.87
12
97.86
89.04
13
97.64
88.83
14
97.91
89.1
15
97.86
89.05
16
97.73
88.91
17
98.04
89.21
18
97.86
89.04
19
97.83
89.02
20
97.8
88.98
21
97.84
89.04
22
97.8
88.99
23
97.69
88.88
24
97.96
89.12
25
97.81
88.99
26
97.9
89.05
27
97.77
88.98
28
97.8
89.0
29
97.86
89.04
30
98.19
89.33
31
97.82
89.01
32
97.82
89.02
33
97.74
88.92
34
97.79
89.02
35
97.68
88.89
36
97.52
88.77
37
97.66
88.81
38
97.86
89.04
39
98.1
89.24
40
97.83
89.02
41
97.66
88.87
42
98.01
89.17
43
97.81
89.0
44
97.72
88.92
45
97.87
89.04
46
98.14
89.31
47
97.77
89.02
48
98.03
89.21
49
97.86
89.04
50
97.67
88.89
51
97.91
89.1
52
97.81
89.01
53
97.7
88.9
54
97.61
88.83
55
97.6
88.82
56
97.74
88.96
57
97.89
89.04
58
98.07
89.25
59
97.7
88.89
60
97.69
88.89
61
98.0
89.16
62
98.08
89.26
63
97.91
89.12
64
97.96
89.13
65
97.87
89.03
66
97.84
89.04
67
97.8
88.99
68
97.99

In [20]:
# Rank the evaluated configurations, best macro F1 first (bare expression so the
# notebook renders the sorted frame as the cell output).
df.sort_values(by=["F1-score"], ascending=[False])

Unnamed: 0,accuracy,F1-score
30,98.19,89.33
46,98.14,89.31
62,98.08,89.26
58,98.07,89.25
39,98.1,89.24
...,...,...
13,97.64,88.83
55,97.6,88.82
37,97.66,88.81
4,97.59,88.8


In [21]:
# Show the hyper-parameters stored under grid ids 30, 46 and 62, one per line.
print(hps[30], hps[46], hps[62], sep="\n")

{'estimators': 128, 'lr': 0.5, 'gp': 'depthwise'}
{'estimators': 256, 'lr': 0.7, 'gp': 'depthwise'}
{'estimators': 1024, 'lr': 0.3, 'gp': 'depthwise'}


# Training the model

In [23]:
# Final classifier, configured with learning rate 0.5, 128 estimators and the
# "depthwise" grow policy.
xgb_model = XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.5,
    n_estimators=128,
    grow_policy="depthwise",
)

# Fit on the training split; kept as the last expression so the notebook shows
# the fitted estimator's repr.
xgb_model.fit(X=shaped_train_features, y=train_targets)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.5, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=128,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)

# Testing the model

In [24]:
# Run the fitted model on the reshaped test features; `predictions` is what the
# metrics cell compares against `test_targets`.
predictions = xgb_model.predict(shaped_test_features)

# Metrics

In [25]:
# Headline test-set metrics, reported as percentages with two decimals.
acc = accuracy_score(test_targets, predictions)
accuracy = round(100 * acc, 2)
print(f"The accuracy is {accuracy}%")

f1 = round(100 * f1_score(test_targets, predictions, average="macro"), 2)
print(f"The F1-Score is {f1}%")

# Per-class precision / recall / F1 with four decimals.
print(classification_report(test_targets, predictions, digits=4))

# Confusion-matrix heatmap; left as the cell's last expression so it renders.
conf_matrix = confusion_matrix(test_targets, predictions)
px.imshow(conf_matrix, color_continuous_scale="turbo")

The accuracy is 99.47%
The F1-Score is 99.43%
              precision    recall  f1-score   support

           0     1.0000    0.9907    0.9953       107
           1     0.9916    1.0000    0.9958       118
           2     0.9821    1.0000    0.9910       110
           3     0.9899    0.9899    0.9899        99
           4     1.0000    0.9922    0.9961       128
           5     0.9894    0.9789    0.9841        95
           6     1.0000    1.0000    1.0000       145
           7     1.0000    1.0000    1.0000       111
           8     0.9914    1.0000    0.9957       115
           9     1.0000    0.9897    0.9948        97

    accuracy                         0.9947      1125
   macro avg     0.9944    0.9941    0.9943      1125
weighted avg     0.9947    0.9947    0.9947      1125



# Saving the model

In [9]:
# Persist the fitted classifier (`xgb_model`) with pickle. The name `rf_model`
# is never defined in this notebook, so dumping it fails with NameError on a
# fresh kernel (Restart & Run All).
# NOTE(review): the file is named "express_decision_tree" although the object
# stored is an XGBClassifier -- path kept unchanged for downstream consumers;
# confirm whether it should be renamed.
with open("models/express_decision_tree", "wb") as f:
    pickle.dump(xgb_model, f)