In [1]:
import pickle

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Loading the data

In [2]:
with open("data/randomized/train_data", "rb") as f:
    train_data = pickle.load(f)

train_features = train_data[0]
train_targets  = train_data[1]

In [3]:
with open("data/randomized/val_data", "rb") as f:
    val_data = pickle.load(f)

val_features = val_data[0]
val_targets  = val_data[1]

In [4]:
with open("data/randomized/test_data", "rb") as f:
    test_data = pickle.load(f)

test_features = test_data[0]
test_targets  = test_data[1]

In [5]:
def reshape_to_train(unshaped: list) -> np.array:
    shaped = np.array([np.array(sample).reshape(-1, 1) for sample in unshaped])
    shaped = shaped.reshape(shaped.shape[0], shaped.shape[1])

    return shaped

In [6]:
shaped_train_features = reshape_to_train(unshaped=train_features)
shaped_val_features   = reshape_to_train(unshaped=val_features)
shaped_test_features  = reshape_to_train(unshaped=test_features)

# Optimzing the Model over the validation data

In [10]:
estimatores = [pow(2, i) for i in range(8)]
criterions = ["gini", "entropy"]
boots = [True, False]

hps = dict()
i = 0

for estimator in estimatores:
    for criterion in criterions:
        for boot in boots:
            hps[i] = {"estimators": estimator, "criterion": criterion, "bootstrap": boot}
            i += 1

In [11]:
df = pd.DataFrame(columns=["accuracy", "F1-score"], index=[i for i in range(len(hps))])

for index, hp in hps.items():    
    lr_model = RandomForestClassifier(bootstrap=hp["bootstrap"], n_estimators=hp["estimators"], criterion=hp["criterion"])

    lr_model.fit(X=shaped_train_features, y=train_targets)

    predictions = lr_model.predict(shaped_val_features)
    
    acc = accuracy_score(y_true=val_targets, y_pred=predictions)
    acc = round(acc*100, 2)

    f1score = f1_score(y_true=val_targets, y_pred=predictions, average="macro")
    f1score = round(f1score*100, 2)

    df["accuracy"].loc[index] = acc
    df["F1-score"].loc[index] = f1score

print(df)

print(hps)

   accuracy F1-score
0     95.11     95.1
1     98.67     89.9
2     97.78    97.75
3     96.44    87.74
4     95.11    86.48
5     98.22    97.75
6     96.89    88.43
7     94.67    85.89
8     96.89    87.81
9     96.44    87.52
10    98.22     89.4
11    97.33    89.37
12    98.67     89.7
13     96.0    88.28
14    97.78    89.19
15    96.44    96.09
16    98.22    98.06
17    97.78     97.9
18     96.0    87.32
19    97.78    88.93
20    96.44     87.4
21    98.22    89.19
22    98.22    97.97
23    97.78    88.81
24    95.56    87.58
25    97.33    97.03
26    98.67    98.44
27    97.78     88.9
28    96.89    88.41
29    96.44    87.48
30    97.78    89.02
31     96.0    87.02
{0: {'estimators': 1, 'criterion': 'gini', 'bootstrap': True}, 1: {'estimators': 1, 'criterion': 'gini', 'bootstrap': False}, 2: {'estimators': 1, 'criterion': 'entropy', 'bootstrap': True}, 3: {'estimators': 1, 'criterion': 'entropy', 'bootstrap': False}, 4: {'estimators': 2, 'criterion': 'gini', 'bootstr

In [1]:
df.sort_values(by="F1-score", ascending=False)

NameError: name 'df' is not defined

# Training the model

In [7]:
rf_model = RandomForestClassifier(bootstrap=True, n_estimators=64, criterion="entropy")

rf_model.fit(X=shaped_train_features, y=np.array(train_targets))

RandomForestClassifier(criterion='entropy', n_estimators=64)

# Testing the model

In [10]:
predictions = rf_model.predict(shaped_test_features)

# Metrics

In [30]:
acc = accuracy_score(y_true=test_targets, y_pred=predictions)
accuracy = round(acc*100, 2)

print(f"The accuracy is {accuracy}%")

f1 = f1_score(y_true=test_targets, y_pred=predictions, average="macro")
f1 = round(f1*100, 2)

print(f"The F1-Score is {f1}%")

print(classification_report(y_true=test_targets, y_pred=predictions, digits=4))

conf_matrix = confusion_matrix(y_true=test_targets, y_pred=predictions)
px.imshow(conf_matrix, color_continuous_scale="turbo")

The accuracy is 97.69%
The F1-Score is 88.88%
              precision    recall  f1-score   support

           0     1.0000    0.9439    0.9712       107
           1     0.9746    0.9746    0.9746       118
           2     0.9732    0.9909    0.9820       110
           3     0.9897    0.9697    0.9796        99
           4     0.9845    0.9922    0.9883       128
           5     0.9583    0.9684    0.9634        95
           6     0.9930    0.9793    0.9861       145
           7     0.9910    0.9910    0.9910       111
           8     0.9492    0.9739    0.9614       115
           9     0.9794    0.9794    0.9794        97
          10     0.0000    0.0000    0.0000         0

    accuracy                         0.9769      1125
   macro avg     0.8903    0.8876    0.8888      1125
weighted avg     0.9798    0.9769    0.9782      1125




Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



# Saving the model

In [9]:
with open("models/decision_tree", "wb") as f:
    pickle.dump(rf_model, f)