In [14]:
import pickle

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Loading the data

In [15]:
with open("data/randomized/train_data", "rb") as f:
    train_data = pickle.load(f)

train_features = train_data[0]
train_targets  = train_data[1]

In [16]:
with open("data/randomized/val_data", "rb") as f:
    val_data = pickle.load(f)

val_features = val_data[0]
val_targets  = val_data[1]

In [17]:
with open("data/randomized/test_data", "rb") as f:
    test_data = pickle.load(f)

test_features = test_data[0]
test_targets  = test_data[1]

In [18]:
def reshape_to_train(unshaped: list) -> np.array:
    shaped = np.array([np.array(sample).reshape(-1, 1) for sample in unshaped])
    shaped = shaped.reshape(shaped.shape[0], shaped.shape[1])

    return shaped

In [19]:
shaped_train_features = reshape_to_train(unshaped=train_features)
shaped_val_features   = reshape_to_train(unshaped=val_features)
shaped_test_features  = reshape_to_train(unshaped=test_features)

# Optimzing the Model over the validation data

In [20]:
vars_smoothings = {i: pow(10, -i) for i in range(10)}

hps = vars_smoothings

In [31]:
df = pd.DataFrame(columns=["accuracy", "F1-score"], index=[i for i in range(len(hps))])

for index, hp in hps.items():
    nb_model = GaussianNB()

    nb_model.fit(X=shaped_train_features, y=train_targets)

    predictions = nb_model.predict(shaped_val_features)

    acc = accuracy_score(y_true=val_targets, y_pred=predictions)
    acc = round(acc*100, 2)

    f1score = f1_score(y_true=val_targets, y_pred=predictions, average="macro")
    f1score = round(f1score*100, 2)

    df["accuracy"].loc[index] = acc
    df["F1-score"].loc[index] = f1score
    break

print(df)

print(hps)

  accuracy F1-score
0    98.22     89.5
1      NaN      NaN
2      NaN      NaN
3      NaN      NaN
4      NaN      NaN
5      NaN      NaN
6      NaN      NaN
7      NaN      NaN
8      NaN      NaN
9      NaN      NaN
{0: 1, 1: 0.1, 2: 0.01, 3: 0.001, 4: 0.0001, 5: 1e-05, 6: 1e-06, 7: 1e-07, 8: 1e-08, 9: 1e-09}


# Training a baseline model

In [22]:
nb_model = GaussianNB()

nb_model.fit(X=shaped_train_features, y=train_targets)

GaussianNB()

# Testing the model

In [34]:
predictions = nb_model.predict(shaped_test_features)

# Metrics

In [36]:
acc = accuracy_score(y_true=test_targets, y_pred=predictions)
accuracy = round(acc*100, 2)

print(f"The accuracy is {accuracy}%")

f1 = f1_score(y_true=test_targets, y_pred=predictions, average="macro")
f1 = round(f1*100, 2)

print(f"The F1-Score is {f1}%")

print(classification_report(y_true=test_targets, y_pred=predictions, digits=4))

conf_matrix = confusion_matrix(y_true=test_targets, y_pred=predictions)
px.imshow(conf_matrix, color_continuous_scale="blues")

The accuracy is 97.24%
The F1-Score is 88.41%
              precision    recall  f1-score   support

           0     1.0000    0.9813    0.9906       107
           1     0.9829    0.9746    0.9787       118
           2     0.9558    0.9818    0.9686       110
           3     0.9700    0.9798    0.9749        99
           4     0.9919    0.9609    0.9762       128
           5     0.9375    0.9474    0.9424        95
           6     0.9792    0.9724    0.9758       145
           7     0.9640    0.9640    0.9640       111
           8     0.9580    0.9913    0.9744       115
           9     0.9895    0.9691    0.9792        97
          10     0.0000    0.0000    0.0000         0

    accuracy                         0.9724      1125
   macro avg     0.8844    0.8839    0.8841      1125
weighted avg     0.9736    0.9724    0.9729      1125




Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



# Saving the model

In [None]:
with open("models/naivebayes_model", "wb") as f:
    pickle.dump(nb_model, f)