In [1]:
import pickle

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Loading the data

In [2]:
# Read the pickled training split. pickle can execute arbitrary code while
# loading, so this file must come from a trusted source.
with open("data/randomized/train_data", "rb") as f:
    train_data = pickle.load(f)

# Position 0 is bound to the features and position 1 to the targets.
train_features, train_targets = train_data[0], train_data[1]

In [3]:
# Read the pickled validation split. pickle can execute arbitrary code while
# loading, so this file must come from a trusted source.
with open("data/randomized/val_data", "rb") as f:
    val_data = pickle.load(f)

# Position 0 is bound to the features and position 1 to the targets.
val_features, val_targets = val_data[0], val_data[1]

In [4]:
# Read the pickled test split. pickle can execute arbitrary code while
# loading, so this file must come from a trusted source.
with open("data/randomized/test_data", "rb") as f:
    test_data = pickle.load(f)

# Position 0 is bound to the features and position 1 to the targets.
test_features, test_targets = test_data[0], test_data[1]

In [5]:
def reshape_to_train(unshaped: list) -> np.ndarray:
    """Flatten every sample and stack the results into a 2-D matrix.

    Args:
        unshaped: Iterable of array-like samples. Each sample may have any
            number of dimensions, but all samples must contain the same
            number of elements.

    Returns:
        An array of shape ``(n_samples, n_elements_per_sample)`` in which row
        ``k`` holds the elements of sample ``k`` in C (row-major) order. An
        empty input yields an empty array of shape ``(0, 0)``.

    Raises:
        ValueError: If the samples do not all flatten to the same length.
    """
    # Flattening directly replaces the original reshape to (-1, 1) followed by
    # a second reshape that dropped the trailing axis again.
    flattened = [np.asarray(sample).reshape(-1) for sample in unshaped]

    # np.stack rejects an empty list, so return an explicit empty matrix.
    if not flattened:
        return np.empty((0, 0))

    return np.stack(flattened)

In [6]:
# Run every split through the same reshaping helper, in train/val/test order.
shaped_train_features, shaped_val_features, shaped_test_features = (
    reshape_to_train(unshaped=split)
    for split in (train_features, val_features, test_features)
)

# Optimizing the model over the validation data

In [7]:
# Search space: forest sizes 1, 2, 4, ..., 128, both split criteria, and
# bootstrap sampling switched on and off.
estimatores = [2 ** exponent for exponent in range(8)]
criterions = ["gini", "entropy"]
boots = [True, False]

# Map a running integer id to each combination. The nesting order
# (estimators, then criterion, then bootstrap) fixes which id a combination gets.
hps = dict(
    enumerate(
        {"estimators": estimator, "criterion": criterion, "bootstrap": boot}
        for estimator in estimatores
        for criterion in criterions
        for boot in boots
    )
)

# The counter ends up equal to the number of combinations generated.
i = len(hps)

In [10]:
# Validation grid search: fit one random forest per hyper-parameter combination
# in `hps` and record its accuracy and macro F1 (both as percentages rounded to
# two decimals) in `df`, one row per combination id.
df = pd.DataFrame(columns=["accuracy", "F1-score"], index=list(hps))

for index, hp in hps.items():
    lr_model = RandomForestClassifier(
        bootstrap=hp["bootstrap"],
        n_estimators=hp["estimators"],
        criterion=hp["criterion"],
    )

    lr_model.fit(X=shaped_train_features, y=train_targets)

    predictions = lr_model.predict(shaped_val_features)

    # NOTE(review): each prediction is replaced, with probability 0.03, by a
    # label drawn uniformly from 0-9 before scoring. This perturbs the reported
    # metrics and is unseeded, so results differ between runs. Kept as-is to
    # preserve behavior; confirm whether it is intentional.
    new = list()

    for pred in predictions:
        if np.random.choice([True, False], p=[0.03, 0.97]):
            pred = np.random.choice(range(10))

        new.append(pred)

    acc = accuracy_score(y_true=val_targets, y_pred=new)
    acc = round(acc*100, 2)

    f1score = f1_score(y_true=val_targets, y_pred=new, average="macro")
    f1score = round(f1score*100, 2)

    # Write through a single .loc[row, column] call. The chained form
    # df[col].loc[row] = value assigns into a temporary object under pandas
    # Copy-on-Write and leaves `df` unchanged.
    df.loc[index, "accuracy"] = acc
    df.loc[index, "F1-score"] = f1score


# Rank once by accuracy; the former row labels (combination ids) are kept in
# the "index" column so the best rows can be looked up in `hps`.
ranked = df.sort_values(by="accuracy", ascending=False).reset_index()

# The plot shows only the two metric columns against the rank position.
fig = px.line(ranked.drop("index", axis=1), markers=True)
fig.update_layout(yaxis_title="Percentual [%]", legend_title="Métrica")

fig.show()

print(ranked)

print(hps)

    index accuracy F1-score
0      15    98.67    98.68
1      27    98.22     97.9
2       0    97.78     97.4
3      10    97.78    97.81
4      30    97.78    97.39
5      21    97.78    97.99
6      20    97.78    97.82
7      17    97.78    97.69
8       1    97.78    97.93
9      16    97.78    97.72
10     26    97.33    97.14
11     24    97.33    97.36
12     22    96.89    96.72
13     29    96.89    96.87
14     23    96.89    96.64
15      9    96.44    96.08
16      3    96.44    96.51
17     31    96.44    96.45
18     18     96.0    95.84
19     19     96.0    96.09
20      6     96.0    96.04
21      7     96.0    95.95
22      8     96.0    95.59
23     14     96.0    96.01
24      2     96.0     95.9
25      4     96.0    95.72
26     25    95.56     95.2
27     13    95.56    95.05
28     28    95.11    94.43
29     12    95.11    94.54
30     11    94.67    94.39
31      5    94.22    94.04
{0: {'estimators': 1, 'criterion': 'gini', 'bootstrap': True}, 1: {'estimato

In [1]:
# Show the grid-search table ordered from the highest to the lowest F1-score.
df.sort_values("F1-score", ascending=False)

NameError: name 'df' is not defined

# Training the model

In [7]:
# Final classifier: 64 trees, entropy split criterion, bootstrap sampling on.
rf_model = RandomForestClassifier(n_estimators=64, criterion="entropy", bootstrap=True)

# The targets are converted to an ndarray before fitting. The fit call is the
# cell's last expression, so the fitted estimator's repr is displayed.
rf_model.fit(shaped_train_features, np.array(train_targets))

RandomForestClassifier(criterion='entropy', n_estimators=64)

# Testing the model

In [10]:
# Predict a label for every reshaped test sample with the fitted forest.
predictions = rf_model.predict(X=shaped_test_features)

# Metrics

In [30]:
# Overall accuracy on the test split, reported as a percentage with two decimals.
acc = accuracy_score(test_targets, predictions)
accuracy = round(100 * acc, 2)
print(f"The accuracy is {accuracy}%")

# Macro-averaged F1: every label present in the targets or the predictions
# carries equal weight, regardless of how many samples it has.
# NOTE(review): the recorded report lists a label 10 with zero support, which
# pulls the macro average down and triggers the "ill-defined" warnings. Confirm
# whether label 10 is a legitimate class.
f1 = round(100 * f1_score(test_targets, predictions, average="macro"), 2)
print(f"The F1-Score is {f1}%")

# Per-class precision, recall and F1 with four decimals.
print(classification_report(test_targets, predictions, digits=4))

# The heat map is the cell's last expression so the figure is rendered.
conf_matrix = confusion_matrix(test_targets, predictions)
px.imshow(conf_matrix, color_continuous_scale="turbo")

The accuracy is 97.69%
The F1-Score is 88.88%
              precision    recall  f1-score   support

           0     1.0000    0.9439    0.9712       107
           1     0.9746    0.9746    0.9746       118
           2     0.9732    0.9909    0.9820       110
           3     0.9897    0.9697    0.9796        99
           4     0.9845    0.9922    0.9883       128
           5     0.9583    0.9684    0.9634        95
           6     0.9930    0.9793    0.9861       145
           7     0.9910    0.9910    0.9910       111
           8     0.9492    0.9739    0.9614       115
           9     0.9794    0.9794    0.9794        97
          10     0.0000    0.0000    0.0000         0

    accuracy                         0.9769      1125
   macro avg     0.8903    0.8876    0.8888      1125
weighted avg     0.9798    0.9769    0.9782      1125




Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



# Saving the model

In [9]:
# Serialize the fitted model object to disk with pickle.
# NOTE(review): the file is named "decision_tree" although the object written
# is `rf_model`; confirm what downstream loaders expect before renaming.
with open("models/decision_tree", "wb") as f:
    pickle.dump(rf_model, f)