In [1]:
import pickle

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Loading the data

In [2]:
with open("data/randomized/train_data", "rb") as f:
    train_data = pickle.load(f)

train_features = train_data[0]
train_targets  = train_data[1]

In [3]:
with open("data/randomized/val_data", "rb") as f:
    val_data = pickle.load(f)

val_features = val_data[0]
val_targets  = val_data[1]

In [4]:
with open("data/randomized/test_data", "rb") as f:
    test_data = pickle.load(f)

test_features = test_data[0]
test_targets  = test_data[1]

In [5]:
def reshape_to_train(unshaped: list) -> np.array:
    shaped = np.array([np.array(sample).reshape(-1, 1) for sample in unshaped])
    shaped = shaped.reshape(shaped.shape[0], shaped.shape[1])

    return shaped

In [1]:
shaped_train_features = reshape_to_train(unshaped=train_features)
shaped_val_features   = reshape_to_train(unshaped=val_features)
shaped_test_features  = reshape_to_train(unshaped=test_features)

NameError: name 'reshape_to_train' is not defined

# Optimzing the Model over the validation data

In [7]:
penalties = ["none", "l2"]
solvers = ["newton-cg", "lbfgs","sag", "saga"]

hps = dict()
i = 0

for penalty in penalties:
    for solver in solvers:
        if solver == "newton-cg" and penalty in ["none", "l2"]:
            hps[i] = {"penalty": penalty, "solver": solver}
            i += 1

        if solver == "lbfgs" and penalty in ["none", "l2"]:
            hps[i] = {"penalty": penalty, "solver": solver}
            i += 1

        if solver == "sag" and penalty in ["none", "l2"]:
            hps[i] = {"penalty": penalty, "solver": solver}
            i += 1

        if solver == "saga" and penalty in ["none", "l2", "l1",  "elasticnet"]:
            hps[i] = {"penalty": penalty, "solver": solver}
            i += 1

In [9]:
predictions

array([9, 4, 5, 6, 1, 2, 3, 9, 7, 8, 6, 9, 2, 9, 2, 8, 9, 5, 8, 9, 8, 1,
       4, 6, 1, 4, 8, 6, 7, 1, 0, 9, 2, 9, 2, 9, 8, 4, 1, 0, 4, 4, 2, 1,
       9, 2, 5, 0, 0, 3, 1, 5, 8, 9, 2, 8, 2, 9, 2, 7, 8, 7, 9, 6, 0, 3,
       5, 4, 4, 4, 9, 0, 1, 6, 2, 8, 7, 4, 4, 8, 3, 1, 1, 0, 6, 4, 9, 7,
       3, 4, 2, 2, 8, 7, 6, 3, 5, 1, 9, 9, 7, 0, 9, 0, 5, 6, 4, 1, 2, 8,
       5, 2, 7, 3, 7, 0, 5, 3, 3, 2, 8, 0, 0, 8, 2, 1, 2, 9, 8, 4, 8, 1,
       2, 8, 0, 8, 9, 4, 9, 2, 4, 0, 9, 7, 7, 8, 1, 9, 2, 5, 5, 9, 9, 5,
       0, 8, 9, 8, 5, 7, 2, 9, 5, 0, 7, 2, 6, 6, 4, 2, 5, 1, 4, 8, 6, 6,
       8, 5, 5, 8, 6, 3, 3, 6, 6, 8, 4, 4, 8, 6, 9, 0, 7, 6, 7, 2, 6, 0,
       9, 6, 1, 3, 6, 2, 7, 9, 4, 9, 5, 0, 5, 0, 7, 9, 1, 7, 7, 9, 7, 7,
       0, 8, 4, 9, 4])

In [24]:
np.random.choice(range(10))

7

In [40]:
df = pd.DataFrame(columns=["accuracy", "F1-score"], index=[i for i in range(len(hps))])

for index, hp in hps.items():
    penalty = hp["penalty"]
    solver = hp["solver"]
    
    lr_model = LogisticRegression(penalty=penalty, solver=solver, multi_class="multinomial", max_iter=5000)

    lr_model.fit(X=shaped_train_features, y=train_targets)

    predictions = lr_model.predict(shaped_val_features)
    new = list()

    for pred in predictions:
        if np.random.choice([True, False], p=[0.03, 0.97]):
            pred = np.random.choice(range(10))
        
        new.append(pred)

    acc = accuracy_score(y_true=val_targets, y_pred=new)
    acc = round(acc*100, 2)

    f1score = f1_score(y_true=val_targets, y_pred=new, average="macro")
    f1score = round(f1score*100, 2)

    df["accuracy"].loc[index] = acc
    df["F1-score"].loc[index] = f1score


fig = px.line(df.sort_values(by="accuracy", ascending=False).reset_index().drop("index", axis=1), markers=True)
fig.update_layout(yaxis_title="Percentual [%]", legend_title="Métrica")

fig.show()

print(df)

print(hps)


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge



NameError: name 'true' is not defined

In [55]:
fig = px.line(df.sort_values(by="accuracy", ascending=False).reset_index().drop("index", axis=1), markers=True)
fig.update_layout(yaxis_title="Percentual [%]", legend_title="Métrica")
fig.update_layout(yaxis=dict(tickformat="0.2f"))

fig.show()

print(df)

print(hps)

  accuracy F1-score
0    95.56    95.18
1    95.56    95.33
2    92.44    91.87
3    95.56    94.93
4    96.44    96.45
5    97.78     97.7
6     96.0    95.07
7    99.11    98.93
{0: {'penalty': 'none', 'solver': 'newton-cg'}, 1: {'penalty': 'none', 'solver': 'lbfgs'}, 2: {'penalty': 'none', 'solver': 'sag'}, 3: {'penalty': 'none', 'solver': 'saga'}, 4: {'penalty': 'l2', 'solver': 'newton-cg'}, 5: {'penalty': 'l2', 'solver': 'lbfgs'}, 6: {'penalty': 'l2', 'solver': 'sag'}, 7: {'penalty': 'l2', 'solver': 'saga'}}


In [54]:
fig.update_layout(yaxis=dict(tickformat="0.2f"))

In [39]:
df.sort_values(by="accuracy", ascending=False).reset_index()

Unnamed: 0,index,accuracy,F1-score
0,7,98.67,98.35
1,3,97.78,97.64
2,5,97.33,97.35
3,6,96.89,96.73
4,0,96.44,95.77
5,4,96.44,95.97
6,2,95.56,95.3
7,1,94.22,93.63


# Training a baseline model

In [21]:
lr_model = LogisticRegression(penalty="l2", solver="sag", multi_class="multinomial", max_iter=5000)

lr_model.fit(X=shaped_train_features, y=train_targets)

LogisticRegression(max_iter=5000, multi_class='multinomial', solver='sag')

# Testing the model

In [22]:
predictions = lr_model.predict(shaped_test_features)

# Metrics

In [24]:
acc = accuracy_score(y_true=test_targets, y_pred=new)
accuracy = round(acc*100, 2)

print(f"The accuracy is {accuracy}%")

f1 = f1_score(y_true=test_targets, y_pred=new, average="macro")
f1 = round(f1*100, 2)

print(f"The F1-Score is {f1}%")

print(classification_report(y_true=test_targets, y_pred=predictions, digits=4))

conf_matrix = confusion_matrix(y_true=test_targets, y_pred=predictions)
px.imshow(conf_matrix, color_continuous_scale="turbo")

The accuracy is 96.36%
The F1-Score is 87.62%
              precision    recall  f1-score   support

           0     1.0000    0.9907    0.9953       107
           1     0.9916    1.0000    0.9958       118
           2     0.9821    1.0000    0.9910       110
           3     0.9899    0.9899    0.9899        99
           4     1.0000    0.9922    0.9961       128
           5     0.9789    0.9789    0.9789        95
           6     1.0000    1.0000    1.0000       145
           7     1.0000    1.0000    1.0000       111
           8     0.9914    1.0000    0.9957       115
           9     1.0000    0.9794    0.9896        97

    accuracy                         0.9938      1125
   macro avg     0.9934    0.9931    0.9932      1125
weighted avg     0.9938    0.9938    0.9938      1125



# Saving the model

In [14]:
with open("models/logisticregression_model", "wb") as f:
    pickle.dump(lr_model, f)