In [1]:
import pickle

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from xgboost import XGBClassifier

# Loading the data

In [2]:
# Training split: unpickle the stored object, then take element 0 as the
# features and element 1 as the targets.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("data/express/train_data", mode="rb") as train_file:
    train_data = pickle.load(train_file)

train_features, train_targets = train_data[0], train_data[1]

In [3]:
# Validation split: unpickle the stored object, then take element 0 as the
# features and element 1 as the targets.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("data/express/val_data", mode="rb") as val_file:
    val_data = pickle.load(val_file)

val_features, val_targets = val_data[0], val_data[1]

In [4]:
# Test split (read from the "randomized" directory, unlike the other two
# splits): element 0 is used as the features and element 1 as the targets.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("data/randomized/test_data", mode="rb") as test_file:
    test_data = pickle.load(test_file)

test_features, test_targets = test_data[0], test_data[1]

In [5]:
def reshape_to_train(unshaped: list) -> np.ndarray:
    """Flatten every sample and stack the results into a 2-D feature matrix.

    Args:
        unshaped: Sequence of array-like samples. Every sample must contain
            the same number of elements once flattened.

    Returns:
        Array of shape ``(n_samples, n_elements_per_sample)``. An empty input
        yields an empty array of shape ``(0, 0)`` instead of raising.

    Raises:
        ValueError: If the samples do not all flatten to the same length.
    """
    flattened = [np.asarray(sample).reshape(-1) for sample in unshaped]

    # With no samples there is no second dimension to infer, so return an
    # explicit empty 2-D array rather than failing on a missing axis.
    if not flattened:
        return np.empty((0, 0))

    return np.stack(flattened)

In [6]:
# Flatten each split into a 2-D matrix using the same helper.
shaped_train_features, shaped_val_features, shaped_test_features = (
    reshape_to_train(unshaped=split)
    for split in (train_features, val_features, test_features)
)

# Optimizing the Model over the validation data

In [7]:
# Search space: tree counts 32..1024 (powers of two), learning rates 0.2..0.7
# and both grow policies.
estimatores = [2 ** exponent for exponent in range(5, 11)]
learning_rates = [step / 10 for step in range(2, 8)]
grows = ["depthwise", "lossguide"]

# Map a running integer id to every combination; grow policy varies fastest,
# then learning rate, then the number of estimators.
hps = {
    combo_id: {"estimators": n_trees, "lr": rate, "gp": policy}
    for combo_id, (n_trees, rate, policy) in enumerate(
        (n, r, g) for n in estimatores for r in learning_rates for g in grows
    )
}

In [8]:
# One row per hyperparameter combination. A float dtype keeps the metrics
# numeric for sorting and plotting instead of the default object dtype.
df = pd.DataFrame(columns=["accuracy", "F1-score"], index=range(len(hps)), dtype=float)

for index, hp in hps.items():
    print(index)
    lr_model = XGBClassifier(
        objective="multi:softprob",
        learning_rate=hp["lr"],
        n_estimators=hp["estimators"],
        grow_policy=hp["gp"],
    )

    lr_model.fit(X=shaped_train_features, y=train_targets)

    predictions = lr_model.predict(shaped_val_features)

    # NOTE(review): each prediction is replaced with a uniformly random class
    # in 0..9 with probability 0.02 before scoring. The RNG is unseeded, so the
    # reported validation metrics are not reproducible and differ from the
    # model's true validation score. Confirm that this perturbation is intended.
    new = list()

    for pred in predictions:
        if np.random.choice([True, False], p=[0.02, 0.98]):
            pred = np.random.choice(range(10))

        new.append(pred)

    acc = accuracy_score(y_true=val_targets, y_pred=new)
    acc = round(acc*100, 2)

    f1score = f1_score(y_true=val_targets, y_pred=new, average="macro")
    f1score = round(f1score*100, 2)
    print(acc)
    print(f1score)

    # Assign through a single .loc call. The chained form
    # df["col"].loc[index] = value writes to an intermediate object and is
    # silently ignored when pandas Copy-on-Write is active.
    df.loc[index, "accuracy"] = acc
    df.loc[index, "F1-score"] = f1score

# Plot both metrics with the combinations ordered from best to worst accuracy.
fig = px.line(df.sort_values(by="accuracy", ascending=False).reset_index(drop=True), markers=True)
fig.update_layout(yaxis_title="Percentual [%]", legend_title="Métrica")

fig.show()

# print(df.sort_values(by="accuracy", ascending=False).reset_index())

# print(hps)

0
98.11
98.11
1
98.29
98.29
2
97.84
97.84
3
97.7
97.7
4
98.16
98.15
5
98.13
98.13
6
98.08
98.08
7
98.14
98.14
8
98.02
98.02
9
98.37
98.36
10
98.27
98.26
11
98.03
98.03
12
98.21
98.21
13
98.02
98.02
14
97.91
97.91
15
97.86
97.85
16
98.46
98.46
17
97.81
97.81
18
98.01
98.01
19
97.84
97.84
20
97.89
97.89
21
98.01
98.01
22
97.84
97.85
23
98.2
98.2
24
98.16
98.15
25
98.01
98.01
26
97.76
97.75
27
98.22
98.22
28
97.94
97.95
29
97.84
97.84
30
97.96
97.95
31
98.14
98.14
32
98.1
98.1
33
97.98
97.98
34
98.02
98.02
35
98.09
98.09
36
98.14
98.15
37
98.16
98.15
38
98.26
98.25
39
98.18
98.18
40
97.93
97.93
41
97.96
97.96
42
98.09
98.09
43
98.09
98.09
44
98.26
98.26
45
98.09
98.08
46
98.1
98.1
47
98.21
98.21
48
97.64
97.64
49
98.03
98.03
50
97.96
97.95
51
97.91
97.91
52
98.1
98.1
53
98.24
98.25
54
97.86
97.85
55
98.06
98.05
56
97.86
97.86
57
98.1
98.1
58
98.29
98.29
59
98.18
98.18
60
97.86
97.85
61
98.0
98.0
62
98.19
98.19
63
98.18
98.18
64
98.1
98.1
65
97.93
97.93
66
98.21
98.21
67
97.86
97.85
68
98.

In [9]:
# Rank the grid-search results from highest to lowest accuracy; reset_index
# moves the original grid id into an "index" column.
df.sort_values("accuracy", ascending=False).reset_index()

Unnamed: 0,index,accuracy,F1-score
0,16,98.46,98.46
1,9,98.37,98.36
2,1,98.29,98.29
3,58,98.29,98.29
4,10,98.27,98.26
...,...,...,...
67,29,97.84,97.84
68,17,97.81,97.81
69,26,97.76,97.75
70,3,97.7,97.7


In [None]:
# Rank the grid-search results from highest to lowest macro F1-score, keeping
# the grid id as the row index.
df.sort_values("F1-score", ascending=False)

In [11]:
# Show the hyperparameter combination stored under grid id 3.
print(hps[3])

{'estimators': 32, 'lr': 0.3, 'gp': 'lossguide'}


In [None]:
# Show the hyperparameter combinations stored under grid ids 30, 46 and 62,
# one per line.
print(hps[30], hps[46], hps[62], sep="\n")

# Training the model

In [None]:
# Final model with a fixed configuration (learning rate 0.5, 128 trees,
# depth-wise growth), fitted on the training split.
xgb_model = XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.5,
    n_estimators=128,
    grow_policy="depthwise",
)

xgb_model.fit(X=shaped_train_features, y=train_targets)

# Testing the model

In [None]:
# Predict a label for every row of the reshaped test features with xgb_model.
predictions = xgb_model.predict(shaped_test_features)

# Metrics

In [None]:
# Accuracy on the test split, as a percentage rounded to two decimals.
acc = accuracy_score(y_true=test_targets, y_pred=predictions)
accuracy = round(100 * acc, 2)
print(f"The accuracy is {accuracy}%")

# Macro-averaged F1 on the same predictions, also as a rounded percentage.
f1 = round(100 * f1_score(y_true=test_targets, y_pred=predictions, average="macro"), 2)
print(f"The F1-Score is {f1}%")

# Per-class breakdown.
report = classification_report(y_true=test_targets, y_pred=predictions, digits=4)
print(report)

# Confusion matrix rendered as a heatmap; the figure is the cell's output.
conf_matrix = confusion_matrix(y_true=test_targets, y_pred=predictions)
px.imshow(conf_matrix, color_continuous_scale="turbo")

# Training on multiple random states

In [7]:
# Per-seed test metrics (percentages rounded to two decimals).
accs = list()
f1s  = list()

# Refit the same configuration under 30 different seeds. With subsample=0.5
# each boosting round draws a random half of the training rows, so the seed
# changes the fitted model.
for i in range(30):
    print(f"---- Model {i} ----")
    xgb_model = XGBClassifier(
        objective="multi:softprob",
        learning_rate=0.5,
        n_estimators=128,
        grow_policy="depthwise",
        random_state=i,
        subsample=0.5,
    )

    xgb_model.fit(shaped_train_features, train_targets)

    predictions = xgb_model.predict(shaped_test_features)
    acc = accuracy_score(y_true=test_targets, y_pred=predictions)
    accuracy = round(acc*100, 2)

    f1 = f1_score(y_true=test_targets, y_pred=predictions, average="macro")
    f1 = round(f1*100, 2)

    # No confusion matrix is computed here: it was never read inside the loop
    # and only overwrote the one produced for the single final model.
    accs.append(accuracy)
    f1s.append(f1)

---- Model 0 ----
---- Model 1 ----
---- Model 2 ----
---- Model 3 ----
---- Model 4 ----
---- Model 5 ----
---- Model 6 ----
---- Model 7 ----
---- Model 8 ----
---- Model 9 ----
---- Model 10 ----
---- Model 11 ----
---- Model 12 ----
---- Model 13 ----
---- Model 14 ----
---- Model 15 ----
---- Model 16 ----
---- Model 17 ----
---- Model 18 ----
---- Model 19 ----
---- Model 20 ----
---- Model 21 ----
---- Model 22 ----
---- Model 23 ----
---- Model 24 ----
---- Model 25 ----
---- Model 26 ----
---- Model 27 ----
---- Model 28 ----
---- Model 29 ----


In [12]:
# Two-column frame with one row per seeded run: accuracy and macro F1.
df = pd.DataFrame([accs, f1s], index=["Taxa de acerto", "F1-score"]).T

# One box per metric to show the spread across seeds.
fig = go.Figure(
    data=[
        go.Box(y=df[metric], name=metric)
        for metric in ("Taxa de acerto", "F1-score")
    ]
)
fig.update_layout(yaxis_title="%")
fig.show()

In [9]:
# Mean accuracy over the seeded runs, and how far the best run sits above
# that mean.
acc_mean = np.asarray(accs).mean()
interval = max(accs) - acc_mean

print(acc_mean, interval, sep="\n")

99.40099999999997
0.06900000000003104


In [11]:
# Mean macro F1 over the seeded runs, and how far the best run sits above
# that mean.
f1s_mean = np.asarray(f1s).mean()
f1_interval = max(f1s) - f1s_mean

print(f1s_mean, f1_interval, sep="\n")

99.35533333333329
0.07466666666671529


# Saving the model

In [None]:
# with open("models/express_decision_tree", "wb") as f:
#     pickle.dump(rf_model, f)