In [None]:
import pickle

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from xgboost import XGBClassifier

# Loading the data

In [None]:
# Load the pickled training split. The unpickled object is indexed as
# [0] -> features and [1] -> targets.
# pickle.load can execute arbitrary code, so only point this at trusted files.
with open("data/express/train_data", mode="rb") as f:
    train_data = pickle.load(f)

train_features, train_targets = train_data[0], train_data[1]

In [3]:
# Load the pickled validation split. The unpickled object is indexed as
# [0] -> features and [1] -> targets.
# pickle.load can execute arbitrary code, so only point this at trusted files.
with open("data/express/val_data", mode="rb") as f:
    val_data = pickle.load(f)

val_features, val_targets = val_data[0], val_data[1]

In [4]:
# Load the pickled test split. The unpickled object is indexed as
# [0] -> features and [1] -> targets.
# NOTE(review): this split is read from data/randomized while the train and
# validation splits are read from data/express — confirm this is intended.
with open("data/randomized/test_data", mode="rb") as f:
    test_data = pickle.load(f)

test_features, test_targets = test_data[0], test_data[1]

In [5]:
def reshape_to_train(unshaped: list) -> np.ndarray:
    """Flatten every sample and stack the results into a 2-D feature matrix.

    Parameters
    ----------
    unshaped : list
        Samples to convert. Each sample may be anything ``np.asarray`` accepts
        (scalar, nested list, array); all samples must hold the same number of
        values so they can be stacked.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_samples, n_values_per_sample)``. An empty input
        returns an empty array of shape ``(0, 0)`` instead of failing.
    """
    # Flattening each sample directly gives the same rows as the former
    # reshape(-1, 1) followed by a second reshape that dropped the last axis.
    flattened = [np.asarray(sample).reshape(-1) for sample in unshaped]

    if not flattened:
        # No samples: there is no second dimension to infer, so return an
        # explicitly empty 2-D matrix.
        return np.empty((0, 0))

    return np.array(flattened)

In [6]:
# Convert the three splits to 2-D feature matrices, in train / validation / test order.
shaped_train_features, shaped_val_features, shaped_test_features = (
    reshape_to_train(unshaped=split_features)
    for split_features in (train_features, val_features, test_features)
)

# Optimizing the model over the validation data

In [None]:
# Hyperparameter grid searched on the validation split.
estimatores = [2 ** exponent for exponent in range(5, 11)]  # 32, 64, ..., 1024 trees
learning_rates = [tenths / 10 for tenths in range(2, 8)]    # 0.2, 0.3, ..., 0.7
grows = ["depthwise", "lossguide"]

# Map a running integer id to each combination. The iteration order
# (estimators outermost, grow policy innermost) fixes the ids, so e.g. hps[30]
# always denotes the same combination. enumerate replaces the manual counter
# and keeps loop variables out of the notebook namespace.
hps = {
    index: {"estimators": n_estimators, "lr": rate, "gp": policy}
    for index, (n_estimators, rate, policy) in enumerate(
        (n_estimators, rate, policy)
        for n_estimators in estimatores
        for rate in learning_rates
        for policy in grows
    )
}

In [None]:
# Fit one classifier per hyperparameter combination on the training split and
# score it on the validation split.
#
# Scores are collected in a list and turned into a DataFrame once at the end.
# The former `df["col"].loc[index] = value` was chained assignment: pandas
# warns about it, it does not write through under Copy-on-Write, and the
# pre-allocated empty frame left the score columns with object dtype.
scores = []

for index, hp in hps.items():
    print(index)
    lr_model = XGBClassifier(
        objective="multi:softprob",
        learning_rate=hp["lr"],
        n_estimators=hp["estimators"],
        grow_policy=hp["gp"],
    )

    lr_model.fit(X=shaped_train_features, y=train_targets)

    predictions = lr_model.predict(shaped_val_features)

    # Both metrics are stored as percentages rounded to two decimals.
    acc = accuracy_score(y_true=val_targets, y_pred=predictions)
    acc = round(acc*100, 2)

    f1score = f1_score(y_true=val_targets, y_pred=predictions, average="macro")
    f1score = round(f1score*100, 2)
    print(acc)
    print(f1score)
    scores.append({"accuracy": acc, "F1-score": f1score})

# Row labels are the keys of `hps`, so a row maps back to its settings via hps[label].
df = pd.DataFrame(scores, index=list(hps), columns=["accuracy", "F1-score"])

print(df)

print(hps)

In [None]:
# Rank the evaluated combinations from highest to lowest validation F1-score.
df.sort_values(by=["F1-score"], ascending=[False])

In [None]:
# Show the settings behind grid entries 30, 46 and 62, one per line.
print(hps[30], hps[46], hps[62], sep="\n")

# Training the model

In [None]:
# Final classifier with fixed hyperparameters (128 trees, learning rate 0.5,
# depthwise growth), fitted on the reshaped training split.
xgb_model = XGBClassifier(
    n_estimators=128,
    learning_rate=0.5,
    grow_policy="depthwise",
    objective="multi:softprob",
)

xgb_model.fit(X=shaped_train_features, y=train_targets)

# Testing the model

In [None]:
# Predict on the reshaped test features with the model fitted in the training
# cell; the result feeds the metrics computed in the next cell.
predictions = xgb_model.predict(shaped_test_features)

# Metrics

In [None]:
# Accuracy on the test split, reported as a percentage with two decimals.
acc = accuracy_score(y_pred=predictions, y_true=test_targets)
accuracy = round(100 * acc, 2)
print(f"The accuracy is {accuracy}%")

# Macro-averaged F1 on the test split, same percentage format.
f1 = round(100 * f1_score(y_pred=predictions, y_true=test_targets, average="macro"), 2)
print(f"The F1-Score is {f1}%")

# Per-class report printed with four decimal places.
print(classification_report(y_pred=predictions, y_true=test_targets, digits=4))

# Confusion matrix shown as a heatmap; it is the last expression of the cell,
# so the figure is displayed as the cell output.
conf_matrix = confusion_matrix(y_pred=predictions, y_true=test_targets)
px.imshow(img=conf_matrix, color_continuous_scale="turbo")

# Training on multiple random states

In [7]:
# Train and evaluate the same configuration under 30 different random seeds to
# see how much the test metrics vary from run to run. Unlike the single-model
# run above, subsample=0.5 is set here.
#
# Collected per seed (as percentages rounded to two decimals):
#   accs -> test accuracy
#   f1s  -> macro-averaged test F1
#
# The per-iteration confusion matrix was removed: it was never read and it
# overwrote the `conf_matrix` plotted in the metrics cell.
accs = list()
f1s  = list()

for seed in range(30):
    print(f"---- Model {seed} ----")
    xgb_model = XGBClassifier(
        objective="multi:softprob",
        learning_rate=0.5,
        n_estimators=128,
        grow_policy="depthwise",
        random_state=seed,
        subsample=0.5,
    )

    xgb_model.fit(shaped_train_features, train_targets)

    predictions = xgb_model.predict(shaped_test_features)
    acc = accuracy_score(y_true=test_targets, y_pred=predictions)
    accuracy = round(acc*100, 2)

    f1 = f1_score(y_true=test_targets, y_pred=predictions, average="macro")
    f1 = round(f1*100, 2)

    accs.append(accuracy)
    f1s.append(f1)

---- Model 0 ----
---- Model 1 ----
---- Model 2 ----
---- Model 3 ----
---- Model 4 ----
---- Model 5 ----
---- Model 6 ----
---- Model 7 ----
---- Model 8 ----
---- Model 9 ----
---- Model 10 ----
---- Model 11 ----
---- Model 12 ----
---- Model 13 ----
---- Model 14 ----
---- Model 15 ----
---- Model 16 ----
---- Model 17 ----
---- Model 18 ----
---- Model 19 ----
---- Model 20 ----
---- Model 21 ----
---- Model 22 ----
---- Model 23 ----
---- Model 24 ----
---- Model 25 ----
---- Model 26 ----
---- Model 27 ----
---- Model 28 ----
---- Model 29 ----


In [12]:
# One column per metric, one row per seed from the multi-seed experiment.
df = pd.DataFrame({"Taxa de acerto": accs, "F1-score": f1s})

# One box per metric, drawn in column order; both share a percentage y-axis.
fig = go.Figure(data=[go.Box(y=df[metric], name=metric) for metric in df.columns])
fig.update_layout(yaxis_title="%")
fig.show()

In [9]:
# Mean test accuracy over all seeds, and how far the best run lies above that
# mean (max - mean). Both values are printed on their own line.
acc_mean = np.mean(accs)
interval = np.max(accs) - acc_mean

print(acc_mean, interval, sep="\n")

99.40099999999997
0.06900000000003104


In [11]:
# Mean macro-F1 over all seeds, and how far the best run lies above that mean
# (max - mean). Both values are printed on their own line.
f1s_mean = np.mean(f1s)
f1_interval = np.max(f1s) - f1s_mean

print(f1s_mean, f1_interval, sep="\n")

99.35533333333329
0.07466666666671529


# Saving the model

In [None]:
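# NOTE: saving is currently disabled. The snippet below references `rf_model`,
# which this notebook never defines (the trained estimator is `xgb_model`), and
# the target file name mentions a decision tree — update both before re-enabling.
# with open("models/express_decision_tree", "wb") as f:
#     pickle.dump(rf_model, f)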