In [22]:
import os
import pickle

import numpy as np
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Loading the data

In [23]:
# Read the pickled training split; element 0 is used as the features and
# element 1 as the targets.
# NOTE: pickle.load can run arbitrary code, so only open files you trust.
with open("data/randomized/train_data", "rb") as train_file:
    train_data = pickle.load(train_file)

train_features, train_targets = train_data[0], train_data[1]

In [24]:
# Read the pickled validation split; element 0 is used as the features and
# element 1 as the targets.
# NOTE: pickle.load can run arbitrary code, so only open files you trust.
with open("data/randomized/val_data", "rb") as val_file:
    val_data = pickle.load(val_file)

val_features, val_targets = val_data[0], val_data[1]

In [25]:
# Read the pickled test split; element 0 is used as the features and
# element 1 as the targets.
# NOTE: pickle.load can run arbitrary code, so only open files you trust.
with open("data/randomized/test_data", "rb") as test_file:
    test_data = pickle.load(test_file)

test_features, test_targets = test_data[0], test_data[1]

# Training a baseline model

In [34]:
# Peek at the first raw training sample, before any reshaping.
train_features[0]

[[0.414, 1.0],
 [0.172, 0.829],
 [0.276, 0.632],
 [0.655, 0.526],
 [0.966, 0.461],
 [0.0, 0.566],
 [0.034, 0.303],
 [0.069, 0.158],
 [0.069, 0.039],
 [0.276, 0.566],
 [0.31, 0.303],
 [0.31, 0.132],
 [0.276, 0.0],
 [0.621, 0.592],
 [0.828, 0.421],
 [0.828, 0.5],
 [0.724, 0.579],
 [0.931, 0.645],
 [1.0, 0.513],
 [0.931, 0.566],
 [0.862, 0.618]]

In [35]:
# Flattened view of the first training sample. It is computed directly from
# train_features so this cell does not depend on shaped_train_features, which
# is only assigned further down in the notebook (a fresh Run All would
# otherwise stop here with a NameError).
np.array(train_features[0]).reshape(-1)

array([0.414, 1.   , 0.172, 0.829, 0.276, 0.632, 0.655, 0.526, 0.966,
       0.461, 0.   , 0.566, 0.034, 0.303, 0.069, 0.158, 0.069, 0.039,
       0.276, 0.566, 0.31 , 0.303, 0.31 , 0.132, 0.276, 0.   , 0.621,
       0.592, 0.828, 0.421, 0.828, 0.5  , 0.724, 0.579, 0.931, 0.645,
       1.   , 0.513, 0.931, 0.566, 0.862, 0.618])

In [26]:
def reshape_to_train(unshaped: list) -> np.ndarray:
    """Flatten every sample into one row of a 2-D feature matrix.

    Each entry of ``unshaped`` is one sample (for example a list of
    ``[a, b]`` pairs). All of a sample's values are laid out in row-major
    order, so a sample of 21 pairs becomes a row of 42 values.

    Args:
        unshaped: Sequence of equally shaped samples.

    Returns:
        A new array of shape ``(n_samples, values_per_sample)``. An empty
        input yields an array with zero columns instead of raising.

    Raises:
        ValueError: If the samples do not all have the same shape (raised by
            NumPy while building or reshaping the array).
    """
    # np.array (not asarray) so the result never aliases the caller's data.
    samples = np.array(unshaped)

    # reshape(n, -1) cannot infer the column count when there is no data, so
    # spell out the zero-width result explicitly.
    if samples.size == 0:
        return samples.reshape(samples.shape[0], 0)

    return samples.reshape(samples.shape[0], -1)

In [27]:
# Flatten every training sample into a single feature row before fitting.
shaped_train_features = reshape_to_train(train_features)

# Plain logistic regression is the model in use. The disabled alternative is
# kept for reference:
#   LogisticRegressionCV(solver="saga", cv=5, max_iter=1000)
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(shaped_train_features, train_targets)

LogisticRegression(max_iter=1000)

# Testing the model

In [28]:
# Apply the same flattening that was used for training, then classify the
# test split with the fitted model.
shaped_test_features = reshape_to_train(test_features)
predictions = lr_model.predict(X=shaped_test_features)

# Metrics

In [36]:
# Overall accuracy on the test split, rounded to four decimals for reporting.
acc = accuracy_score(y_true=test_targets, y_pred=predictions)
accuracy = round(acc, 4)

print(f"The accuracy is {round(accuracy*100, 4)}%")

# Per-class precision, recall, F1 and support.
print(classification_report(y_true=test_targets, y_pred=predictions, digits=4))

# confusion_matrix puts the true labels on the rows and the predicted labels
# on the columns; label the axes so the heatmap can be read on its own.
conf_matrix = confusion_matrix(y_true=test_targets, y_pred=predictions)
px.imshow(
    conf_matrix,
    color_continuous_scale="blues",
    labels={"x": "Predicted label", "y": "True label", "color": "Samples"},
    title="Confusion matrix (test set)",
)

The accuracy is 99.41%
              precision    recall  f1-score   support

           0     1.0000    0.9922    0.9961       128
           1     0.9927    1.0000    0.9963       136
           2     0.9856    1.0000    0.9928       137
           3     0.9910    0.9821    0.9865       112
           4     1.0000    0.9933    0.9967       150
           5     0.9821    0.9821    0.9821       112
           6     1.0000    1.0000    1.0000       166
           7     1.0000    1.0000    1.0000       133
           8     0.9931    1.0000    0.9965       143
           9     0.9924    0.9850    0.9887       133

    accuracy                         0.9941      1350
   macro avg     0.9937    0.9935    0.9936      1350
weighted avg     0.9941    0.9941    0.9941      1350



In [30]:
%%timeit
# Time one batch prediction over the full reshaped test set.
lr_model.predict(shaped_test_features)

263 µs ± 46.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


# Saving the model

In [31]:
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError once the model has already been trained.
os.makedirs("models", exist_ok=True)

# NOTE: a pickled model should only ever be loaded back from a trusted source.
with open("models/logisticregression_model", "wb") as f:
    pickle.dump(lr_model, f)