# In this notebook, a simple Decision Tree is used to demonstrate the non-triviality of the problem

In [1]:
import pickle

import numpy as np
import plotly.express as px
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Loading the data

In [2]:
# Read the pickled training split; element 0 is taken as the features and
# element 1 as the targets.
with open("data/randomized/train_data", "rb") as train_file:
    train_data = pickle.load(train_file)

train_features, train_targets = train_data[0], train_data[1]

In [3]:
# Read the pickled test split; element 0 is taken as the features and
# element 1 as the targets.
with open("data/randomized/test_data", "rb") as test_file:
    test_data = pickle.load(test_file)

test_features, test_targets = test_data[0], test_data[1]

# Training the model

In [4]:
def reshape_to_train(unshaped: list) -> np.ndarray:
    """Flatten every sample and stack them into a single 2-D array.

    Parameters
    ----------
    unshaped : list
        Sequence of array-like samples. Every sample must contain the same
        number of values, whatever its own dimensionality is.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_samples, n_values_per_sample)`` with one flattened
        sample per row. An empty input yields an empty ``(0, 0)`` array.

    Raises
    ------
    ValueError
        If the samples do not all contain the same number of values.
    """
    flattened = [np.asarray(sample).reshape(-1) for sample in unshaped]

    # np.stack rejects an empty sequence, so return an empty 2-D array instead
    # of failing on a missing second dimension.
    if not flattened:
        return np.empty((0, 0))

    return np.stack(flattened)

In [11]:
# Fix the seed so that Restart & Run All rebuilds the same tree: scikit-learn
# permutes the candidate features at every split, so ties between equally good
# splits are otherwise broken differently from run to run.
RANDOM_STATE = 42

dt_model = DecisionTreeClassifier(criterion="entropy", random_state=RANDOM_STATE)

# The classifier is fitted on one flattened feature row per sample.
shaped_train_features = reshape_to_train(unshaped=train_features)

dt_model.fit(X=shaped_train_features, y=np.array(train_targets))

DecisionTreeClassifier(criterion='entropy')

# Testing the model

In [12]:
# Flatten the test samples with the same helper used for training, then
# let the fitted tree classify them.
shaped_test_features = reshape_to_train(unshaped=test_features)
predictions = dt_model.predict(X=shaped_test_features)

# Metrics

In [13]:
# Overall accuracy, rounded to four decimal places for reporting.
acc = accuracy_score(y_true=test_targets, y_pred=predictions)
accuracy = round(acc, 4)

print(f"The accuracy is {round(accuracy*100, 4)}%")

# Per-class precision / recall / F1.
print(classification_report(y_true=test_targets, y_pred=predictions, digits=4))

# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns; label the axes so the heatmap can be read on its own.
conf_matrix = confusion_matrix(y_true=test_targets, y_pred=predictions)
px.imshow(
    conf_matrix,
    labels={"x": "Predicted label", "y": "True label", "color": "Samples"},
    title="Decision tree confusion matrix (test set)",
    color_continuous_scale="blues",
)

The accuracy is 98.81%
              precision    recall  f1-score   support

           0     1.0000    0.9844    0.9921       128
           1     0.9926    0.9926    0.9926       136
           2     0.9852    0.9708    0.9779       137
           3     0.9910    0.9821    0.9865       112
           4     0.9677    1.0000    0.9836       150
           5     0.9735    0.9821    0.9778       112
           6     1.0000    1.0000    1.0000       166
           7     0.9925    1.0000    0.9963       133
           8     0.9862    1.0000    0.9931       143
           9     0.9922    0.9624    0.9771       133

    accuracy                         0.9881      1350
   macro avg     0.9881    0.9875    0.9877      1350
weighted avg     0.9883    0.9881    0.9881      1350



# Saving the model

In [20]:
# Persist the fitted tree with pickle.
with open("models/decision_tree", "wb") as model_file:
    pickle.dump(dt_model, model_file)