In [1]:
import pickle

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from xgboost import XGBClassifier

# Loading the data

In [2]:
# Read the pickled training split from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("data/randomized/train_data", "rb") as train_file:
    train_data = pickle.load(train_file)

# Index 0 is bound to the features, index 1 to the targets.
train_features, train_targets = train_data[0], train_data[1]

In [3]:
# Read the pickled validation split from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("data/randomized/val_data", "rb") as val_file:
    val_data = pickle.load(val_file)

# Index 0 is bound to the features, index 1 to the targets.
val_features, val_targets = val_data[0], val_data[1]

In [4]:
# Read the pickled test split from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("data/randomized/test_data", "rb") as test_file:
    test_data = pickle.load(test_file)

# Index 0 is bound to the features, index 1 to the targets.
test_features, test_targets = test_data[0], test_data[1]

In [5]:
def reshape_to_train(unshaped: list) -> np.ndarray:
    """Flatten every sample and stack the results into one 2-D array.

    Args:
        unshaped: Sequence of samples. Each sample may be anything accepted by
            ``np.asarray``; all samples must contain the same number of
            elements.

    Returns:
        Array of shape ``(len(unshaped), elements_per_sample)``. An empty
        input yields an empty array of shape ``(0, 0)``.

    Raises:
        ValueError: If the samples do not all flatten to the same length.
    """
    flattened = [np.asarray(sample).reshape(-1) for sample in unshaped]

    # Without samples np.array() would give a 1-D result; keep the 2-D contract.
    if not flattened:
        return np.empty((0, 0))

    shaped = np.array(flattened)
    if shaped.ndim != 2:
        raise ValueError("all samples must contain the same number of elements")

    return shaped

In [6]:
# Apply the same flattening to the train, validation and test features.
shaped_train_features, shaped_val_features, shaped_test_features = (
    reshape_to_train(unshaped=split)
    for split in (train_features, val_features, test_features)
)

# Optimizing the model over the validation data

In [7]:
# Candidate values for each tuned hyper-parameter.
estimatores = [2 ** exponent for exponent in range(5, 11)]  # 32, 64, ..., 1024
learning_rates = [step / 10 for step in range(2, 8)]        # 0.2, 0.3, ..., 0.7
grows = ["depthwise", "lossguide"]

# Map a running integer id to every (estimators, lr, gp) combination, iterating
# estimators in the outer position and the grow policy in the inner one.
hps = {
    combo_id: {"estimators": n_trees, "lr": rate, "gp": policy}
    for combo_id, (n_trees, rate, policy) in enumerate(
        (n_trees, rate, policy)
        for n_trees in estimatores
        for rate in learning_rates
        for policy in grows
    )
}


In [11]:
# Grid search: fit one XGBoost classifier per hyper-parameter combination in
# `hps` and record its accuracy and macro F1 (both in percent) on the
# validation split. Row i of `df` corresponds to hps[i].
df = pd.DataFrame(columns=["accuracy", "F1-score"], index=range(len(hps)))

for index, hp in hps.items():
    lr_model = XGBClassifier(
        objective="multi:softprob",
        learning_rate=hp["lr"],
        n_estimators=hp["estimators"],
        grow_policy=hp["gp"],
    )

    lr_model.fit(X=shaped_train_features, y=train_targets)

    predictions = lr_model.predict(shaped_val_features)

    # NOTE(review): each prediction is replaced, with probability 0.02, by a
    # label drawn uniformly from range(10). The draw is unseeded, so the scores
    # below are not the model's true validation scores and differ between runs.
    # Confirm this perturbation is intentional before relying on the ranking.
    new = list()

    for pred in predictions:
        if np.random.choice([True, False], p=[0.02, 0.98]):
            pred = np.random.choice(range(10))

        new.append(pred)

    acc = accuracy_score(y_true=val_targets, y_pred=new)
    acc = round(acc*100, 2)

    f1score = f1_score(y_true=val_targets, y_pred=new, average="macro")
    f1score = round(f1score*100, 2)

    # A single .loc[row, column] assignment writes into `df` itself. The chained
    # form df[col].loc[row] = value may write to a temporary object and is a
    # no-op when pandas Copy-on-Write is active.
    df.loc[index, "accuracy"] = acc
    df.loc[index, "F1-score"] = f1score


# Plot both metrics with the configurations ordered from best to worst F1.
fig = px.line(df.sort_values(by="F1-score", ascending=False).reset_index(drop=True), markers=True)
fig.update_layout(yaxis_title="Percentual [%]", legend_title="Métrica")

fig.show()

# After reset_index() the "index" column holds the key into `hps`.
print(df.sort_values(by="accuracy", ascending=False).reset_index())

print(hps)

    index accuracy F1-score
0      60    99.11     99.0
1      49    99.11    99.05
2      69    99.11    99.05
3       3    99.11    98.86
4      14    99.11     99.1
..    ...      ...      ...
67     61    96.44    96.41
68     41    96.44    95.97
69     70    96.44    96.09
70     20     96.0    95.33
71      4    94.22    93.27

[72 rows x 3 columns]
{0: {'estimators': 32, 'lr': 0.2, 'gp': 'depthwise'}, 1: {'estimators': 32, 'lr': 0.2, 'gp': 'lossguide'}, 2: {'estimators': 32, 'lr': 0.3, 'gp': 'depthwise'}, 3: {'estimators': 32, 'lr': 0.3, 'gp': 'lossguide'}, 4: {'estimators': 32, 'lr': 0.4, 'gp': 'depthwise'}, 5: {'estimators': 32, 'lr': 0.4, 'gp': 'lossguide'}, 6: {'estimators': 32, 'lr': 0.5, 'gp': 'depthwise'}, 7: {'estimators': 32, 'lr': 0.5, 'gp': 'lossguide'}, 8: {'estimators': 32, 'lr': 0.6, 'gp': 'depthwise'}, 9: {'estimators': 32, 'lr': 0.6, 'gp': 'lossguide'}, 10: {'estimators': 32, 'lr': 0.7, 'gp': 'depthwise'}, 11: {'estimators': 32, 'lr': 0.7, 'gp': 'lossguide'}, 12

In [12]:

# Re-draw the grid-search curves: configurations on the x axis ordered from
# best to worst F1, one line per metric column of `df`.
fig = px.line(
    df.sort_values(by="F1-score", ascending=False).reset_index(drop=True),
    markers=True,
).update_layout(yaxis_title="Percentual [%]", legend_title="Métrica")

fig.show()

# Ranking by accuracy; after reset_index() the "index" column is the key into
# `hps`, which is printed right after for lookup.
print(df.sort_values(by="accuracy", ascending=False).reset_index())

print(hps)

    index accuracy F1-score
0      60    99.11     99.0
1      49    99.11    99.05
2      69    99.11    99.05
3       3    99.11    98.86
4      14    99.11     99.1
..    ...      ...      ...
67     61    96.44    96.41
68     41    96.44    95.97
69     70    96.44    96.09
70     20     96.0    95.33
71      4    94.22    93.27

[72 rows x 3 columns]
{0: {'estimators': 32, 'lr': 0.2, 'gp': 'depthwise'}, 1: {'estimators': 32, 'lr': 0.2, 'gp': 'lossguide'}, 2: {'estimators': 32, 'lr': 0.3, 'gp': 'depthwise'}, 3: {'estimators': 32, 'lr': 0.3, 'gp': 'lossguide'}, 4: {'estimators': 32, 'lr': 0.4, 'gp': 'depthwise'}, 5: {'estimators': 32, 'lr': 0.4, 'gp': 'lossguide'}, 6: {'estimators': 32, 'lr': 0.5, 'gp': 'depthwise'}, 7: {'estimators': 32, 'lr': 0.5, 'gp': 'lossguide'}, 8: {'estimators': 32, 'lr': 0.6, 'gp': 'depthwise'}, 9: {'estimators': 32, 'lr': 0.6, 'gp': 'lossguide'}, 10: {'estimators': 32, 'lr': 0.7, 'gp': 'depthwise'}, 11: {'estimators': 32, 'lr': 0.7, 'gp': 'lossguide'}, 12

In [None]:
# Show the grid-search table ordered from highest to lowest F1-score.
df.sort_values("F1-score", ascending=False)

In [None]:
# Print the hyper-parameter dictionaries stored under ids 26, 24 and 5,
# one per line.
print(hps[26], hps[24], hps[5], sep="\n")

# Training a baseline model

In [None]:
# Baseline classifier with a fixed, hard-coded configuration and a fixed seed.
xgb_model = XGBClassifier(
    objective="multi:softprob",
    learning_rate=0.3,
    n_estimators=128,
    grow_policy="depthwise",
    random_state=42,
    subsample=0.5,
)

# Fit on the training split; fit() is the cell's last expression, so its
# return value is displayed.
xgb_model.fit(shaped_train_features, train_targets)

# Testing the baseline model

# Metrics

In [None]:
# Evaluate the baseline model on the test split.
# The predictions are computed here so the metrics always describe `xgb_model`
# on the test features, not whatever an earlier cell last left in `predictions`.
predictions = xgb_model.predict(shaped_test_features)

acc = accuracy_score(y_true=test_targets, y_pred=predictions)
accuracy = round(acc*100, 2)

print(f"The accuracy is {accuracy}%")

# Macro average: unweighted mean of the per-class F1 scores.
f1 = f1_score(y_true=test_targets, y_pred=predictions, average="macro")
f1 = round(f1*100, 2)

print(f"The F1-Score is {f1}%")

print(classification_report(y_true=test_targets, y_pred=predictions, digits=4))

# The heat-map is the cell's last expression, so it is rendered as the output.
conf_matrix = confusion_matrix(y_true=test_targets, y_pred=predictions)
px.imshow(conf_matrix, color_continuous_scale="turbo")

# Training on multiple random states

In [7]:
# Train the same configuration with 30 different seeds (random_state = 0..29)
# and collect the test accuracy and macro F1 of every run, in percent rounded
# to two decimals.
accs, f1s = [], []

for i in range(30):
    print(f"---- Model {i} ----")

    xgb_model = XGBClassifier(
        objective="multi:softprob",
        learning_rate=0.3,
        n_estimators=128,
        grow_policy="depthwise",
        random_state=i,
        subsample=0.5,
    )
    xgb_model.fit(shaped_train_features, train_targets)

    predictions = xgb_model.predict(shaped_test_features)

    acc = accuracy_score(y_true=test_targets, y_pred=predictions)
    accuracy = round(acc*100, 2)

    f1 = round(f1_score(y_true=test_targets, y_pred=predictions, average="macro")*100, 2)

    # Recomputed on every iteration; only the last run's matrix stays bound.
    conf_matrix = confusion_matrix(y_true=test_targets, y_pred=predictions)

    accs.append(accuracy)
    f1s.append(f1)

# After the loop `xgb_model` and `predictions` belong to the last run
# (random_state=29).


---- Model 0 ----
---- Model 1 ----
---- Model 2 ----
---- Model 3 ----
---- Model 4 ----
---- Model 5 ----
---- Model 6 ----
---- Model 7 ----
---- Model 8 ----
---- Model 9 ----
---- Model 10 ----
---- Model 11 ----
---- Model 12 ----
---- Model 13 ----
---- Model 14 ----
---- Model 15 ----
---- Model 16 ----
---- Model 17 ----
---- Model 18 ----
---- Model 19 ----
---- Model 20 ----
---- Model 21 ----
---- Model 22 ----
---- Model 23 ----
---- Model 24 ----
---- Model 25 ----
---- Model 26 ----
---- Model 27 ----
---- Model 28 ----
---- Model 29 ----


In [8]:
# Report the mean accuracy and the mean F1 over all collected runs,
# one value per line.
print(f"Média acc: {np.mean(accs)}", f"Média f1: {np.mean(f1s)}", sep="\n")

Média acc: 99.42499999999998
Média f1: 99.38133333333332


In [33]:
# Display whatever `df` currently refers to.
# NOTE(review): the saved output of this cell lists columns such as
# total_bill / tip / sex / smoker, which none of the visible cells put into
# `df` -- the output looks stale. Re-run the cell or remove it.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [54]:
# Put the per-run metrics side by side, one column per metric.
# Note: this rebinds `df`, replacing whatever it referred to before.
df = pd.DataFrame({"Taxa de acerto": accs, "F1-score": f1s})

# One box per metric on a shared percentage axis.
fig = go.Figure(
    data=[
        go.Box(y=df["Taxa de acerto"], name="Taxa de acerto"),
        go.Box(y=df["F1-score"], name="F1-score"),
    ]
)
fig.update_layout(yaxis_title="%")
fig.show()

In [46]:
# Midpoint and half-width of the range spanned by two hard-coded accuracy
# values (presumably read off the runs above -- TODO confirm their origin).
upper_acc, lower_acc = 99.56, 99.29

acc_mean = (upper_acc + lower_acc) / 2
interval = upper_acc - acc_mean

print(acc_mean, interval, sep="\n")

99.42500000000001
0.1349999999999909


In [45]:
# Same midpoint / half-width computation with a second pair of hard-coded
# values; it overwrites `acc_mean` and `interval`.
# NOTE(review): the midpoint uses 99.51 as the upper value, but the half-width
# is measured from 99.55 -- one of the two literals is probably a typo. Confirm
# the intended value; with 99.51 the half-width would be 0.135, not 0.175.
# NOTE(review): the variable is still called `acc_mean`; presumably these
# numbers refer to a different metric (F1?) -- confirm and rename.
acc_mean = (99.51 + 99.24)/2
interval = 99.55 - acc_mean

print(acc_mean)
print(interval)

99.375
0.17499999999999716


# Saving the model

In [None]:
# Serialize the object currently bound to `xgb_model` with pickle.
# NOTE(review): `xgb_model` is reassigned by the multi-seed loop, so after a
# top-to-bottom run this stores the last model of that loop rather than the
# random_state=42 baseline -- confirm which one is meant to be saved.
with open("models/xgboost_model", "wb") as model_file:
    pickle.dump(xgb_model, model_file)