In [20]:
import pickle

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from xgboost import XGBClassifier

# Loading the data

In [21]:
# Deserialize the training split. The pickled object is indexed positionally:
# element 0 is bound as the features, element 1 as the targets.
with open("data/randomized/train_data", "rb") as train_file:
    train_data = pickle.load(train_file)

train_features, train_targets = train_data[0], train_data[1]

In [22]:
# Deserialize the validation split. The pickled object is indexed positionally:
# element 0 is bound as the features, element 1 as the targets.
with open("data/randomized/val_data", "rb") as val_file:
    val_data = pickle.load(val_file)

val_features, val_targets = val_data[0], val_data[1]

In [23]:
# Deserialize the test split. The pickled object is indexed positionally:
# element 0 is bound as the features, element 1 as the targets.
with open("data/randomized/test_data", "rb") as test_file:
    test_data = pickle.load(test_file)

test_features, test_targets = test_data[0], test_data[1]

In [24]:
def reshape_to_train(unshaped: list) -> np.ndarray:
    """Flatten every sample and stack the results into one 2-D feature matrix.

    Args:
        unshaped: Sequence of samples. Each sample is anything NumPy can turn
            into an array (nested lists, arrays, ...); its dimensions are
            collapsed into a single axis.

    Returns:
        Array of shape ``(n_samples, n_values_per_sample)``. An empty input
        yields an empty ``(0, 0)`` array instead of raising ``IndexError``.

    Raises:
        ValueError: If the samples do not all flatten to the same length, since
            they could not form a rectangular matrix.
    """
    # Collapse each sample to 1-D up front; stacking then directly produces the
    # (n_samples, n_values) layout without an intermediate (n, k, 1) array.
    flattened = [np.asarray(sample).reshape(-1) for sample in unshaped]

    if not flattened:
        return np.empty((0, 0))

    widths = {row.size for row in flattened}
    if len(widths) > 1:
        raise ValueError(
            f"All samples must flatten to the same length, got lengths {sorted(widths)}"
        )

    return np.stack(flattened)

In [25]:
# Convert all three splits to 2-D feature matrices with the same helper,
# in train / validation / test order.
shaped_train_features, shaped_val_features, shaped_test_features = (
    reshape_to_train(unshaped=split_features)
    for split_features in (train_features, val_features, test_features)
)

# Optimizing the Model over the validation data

In [33]:
# Candidate values for each tuned hyperparameter.
estimatores = [2 ** exponent for exponent in range(5, 11)]   # 32, 64, ..., 1024
learning_rates = [tenths / 10 for tenths in range(2, 8)]     # 0.2, 0.3, ..., 0.7
grows = ["depthwise", "lossguide"]

# Full Cartesian grid, numbered consecutively from 0. The number of estimators
# varies slowest and the grow policy varies fastest.
hps = dict(
    enumerate(
        {"estimators": estimator, "lr": lr, "gp": gp}
        for estimator in estimatores
        for lr in learning_rates
        for gp in grows
    )
)


In [34]:
# Grid search: fit one XGBoost classifier per hyperparameter combination in
# `hps` and record its accuracy and macro F1 (both in percent, two decimals)
# on the validation split. Rows of `df` share their index with the keys of `hps`.
df = pd.DataFrame(columns=["accuracy", "F1-score"], index=list(hps))

for index, hp in hps.items():
    lr_model = XGBClassifier(
        objective="multi:softprob",
        learning_rate=hp["lr"],
        n_estimators=hp["estimators"],
        grow_policy=hp["gp"],
    )

    lr_model.fit(X=shaped_train_features, y=train_targets)

    predictions = lr_model.predict(shaped_val_features)

    # Score the predictions computed just above. The previous code passed an
    # undefined name (`new`), which raises NameError on a fresh kernel or
    # silently scores stale values left over in the kernel.
    acc = accuracy_score(y_true=val_targets, y_pred=predictions)
    acc = round(acc*100, 2)

    f1score = f1_score(y_true=val_targets, y_pred=predictions, average="macro")
    f1score = round(f1score*100, 2)

    # One .loc[row, column] assignment writes into `df` itself; the chained
    # form df[column].loc[row] = ... may write to a temporary copy instead.
    df.loc[index, "accuracy"] = acc
    df.loc[index, "F1-score"] = f1score

print(df)

print(hps)

   accuracy F1-score
0     96.44    96.16
1     96.89    88.94
2     97.78    97.61
3      96.0    87.15
4     98.22    97.72
..      ...      ...
67    97.78    89.19
68     96.0    87.34
69    97.33    88.42
70    98.22    89.39
71    98.67    98.47

[72 rows x 2 columns]
{0: {'estimators': 32, 'lr': 0.2, 'gp': 'depthwise'}, 1: {'estimators': 32, 'lr': 0.2, 'gp': 'lossguide'}, 2: {'estimators': 32, 'lr': 0.3, 'gp': 'depthwise'}, 3: {'estimators': 32, 'lr': 0.3, 'gp': 'lossguide'}, 4: {'estimators': 32, 'lr': 0.4, 'gp': 'depthwise'}, 5: {'estimators': 32, 'lr': 0.4, 'gp': 'lossguide'}, 6: {'estimators': 32, 'lr': 0.5, 'gp': 'depthwise'}, 7: {'estimators': 32, 'lr': 0.5, 'gp': 'lossguide'}, 8: {'estimators': 32, 'lr': 0.6, 'gp': 'depthwise'}, 9: {'estimators': 32, 'lr': 0.6, 'gp': 'lossguide'}, 10: {'estimators': 32, 'lr': 0.7, 'gp': 'depthwise'}, 11: {'estimators': 32, 'lr': 0.7, 'gp': 'lossguide'}, 12: {'estimators': 64, 'lr': 0.2, 'gp': 'depthwise'}, 13: {'estimators': 64, 'lr': 0.2

In [36]:
# Rank the hyperparameter combinations from best to worst validation macro F1.
df.sort_values("F1-score", ascending=False)

Unnamed: 0,accuracy,F1-score
26,99.56,99.46
24,98.67,98.59
5,98.67,98.54
64,98.67,98.5
42,98.67,98.5
...,...,...
3,96.0,87.15
18,96.0,87.05
61,96.0,87.01
52,95.56,86.87


In [37]:
# Print the hyperparameters of the three runs with the highest validation
# macro F1. The row labels are taken from `df` rather than hard-coded, so the
# printout stays correct when the grid search is re-run with different results.
top_runs = df.sort_values(by="F1-score", ascending=False).head(3)

for run_index in top_runs.index:
    print(hps[run_index])

{'estimators': 128, 'lr': 0.3, 'gp': 'depthwise'}
{'estimators': 128, 'lr': 0.2, 'gp': 'depthwise'}
{'estimators': 32, 'lr': 0.4, 'gp': 'lossguide'}


# Training a baseline model

In [38]:
# Hyperparameters of the baseline model: the combination printed for grid
# index 26, which tops the validation F1 ranking recorded above.
baseline_params = {
    "objective": "multi:softprob",
    "learning_rate": 0.3,
    "n_estimators": 128,
    "grow_policy": "depthwise",
}
xgb_model = XGBClassifier(**baseline_params)

# Fit on the training split; as the cell's last expression, the fitted
# estimator is also displayed.
xgb_model.fit(X=shaped_train_features, y=train_targets)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.3, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=128,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)

# Testing the baseline model

In [39]:
# Predict with the fitted baseline model on the reshaped test features; the
# result is compared against `test_targets` by the metrics that follow.
predictions = xgb_model.predict(shaped_test_features)

# Metrics

In [40]:
# Every metric below compares the same ground truth with the same predictions,
# so the shared keyword arguments are bundled once.
scored = {"y_true": test_targets, "y_pred": predictions}

# Headline numbers, reported as percentages rounded to two decimals.
accuracy = round(accuracy_score(**scored) * 100, 2)
print(f"The accuracy is {accuracy}%")

f1 = round(f1_score(average="macro", **scored) * 100, 2)
print(f"The F1-Score is {f1}%")

# Per-class precision / recall / F1 breakdown.
print(classification_report(digits=4, **scored))

# Confusion matrix rendered as a heatmap (last expression, so it is displayed).
conf_matrix = confusion_matrix(**scored)
px.imshow(conf_matrix, color_continuous_scale="turbo")

The accuracy is 99.29%
The F1-Score is 99.25%
              precision    recall  f1-score   support

           0     1.0000    0.9813    0.9906       107
           1     0.9916    1.0000    0.9958       118
           2     0.9821    1.0000    0.9910       110
           3     0.9800    0.9899    0.9849        99
           4     1.0000    0.9922    0.9961       128
           5     0.9894    0.9789    0.9841        95
           6     0.9932    1.0000    0.9966       145
           7     1.0000    0.9910    0.9955       111
           8     0.9914    1.0000    0.9957       115
           9     1.0000    0.9897    0.9948        97

    accuracy                         0.9929      1125
   macro avg     0.9928    0.9923    0.9925      1125
weighted avg     0.9930    0.9929    0.9929      1125



# Saving the model

In [None]:
# Persist the fitted classifier as a pickle so it can be reloaded without retraining.
with open("models/xgboost_model", "wb") as model_file:
    pickle.dump(xgb_model, model_file)