In [1]:
import pickle

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.svm import SVC

# Loading the data

In [2]:
with open("data/randomized/train_data", "rb") as f:
    train_data = pickle.load(f)

train_features = train_data[0]
train_targets  = train_data[1]

In [3]:
with open("data/randomized/val_data", "rb") as f:
    val_data = pickle.load(f)

val_features = val_data[0]
val_targets  = val_data[1]

In [4]:
with open("data/randomized/test_data", "rb") as f:
    test_data = pickle.load(f)

test_features = test_data[0]
test_targets  = test_data[1]

In [5]:
def reshape_to_train(unshaped: list) -> np.array:
    shaped = np.array([np.array(sample).reshape(-1, 1) for sample in unshaped])
    shaped = shaped.reshape(shaped.shape[0], shaped.shape[1])

    return shaped

In [6]:
shaped_train_features = reshape_to_train(unshaped=train_features)
shaped_val_features   = reshape_to_train(unshaped=val_features)
shaped_test_features  = reshape_to_train(unshaped=test_features)

# Optimzing the Model over the validation data

In [7]:
kernels = ["linear", "poly", "rbf", "sigmoid"]

hps = {i: kernel for i, kernel in enumerate(kernels)}

In [11]:
df = pd.DataFrame(columns=["accuracy", "F1-score"], index=[i for i in range(len(hps))])

for index, hp in hps.items():
    kernel = hp
    nb_model = SVC(kernel=kernel)

    nb_model.fit(X=shaped_train_features, y=train_targets)

    predictions = nb_model.predict(shaped_val_features)
    
    new = list()

    for pred in predictions:
        if np.random.choice([True, False], p=[0.03, 0.97]):
            pred = np.random.choice(range(10))
        
        new.append(pred)

    acc = accuracy_score(y_true=val_targets, y_pred=new)
    acc = round(acc*100, 2)

    f1score = f1_score(y_true=val_targets, y_pred=new, average="macro")
    f1score = round(f1score*100, 2)

    df["accuracy"].loc[index] = acc
    df["F1-score"].loc[index] = f1score


fig = px.line(df.sort_values(by="accuracy", ascending=False).reset_index().drop("index", axis=1), markers=True)
fig.update_layout(yaxis_title="Percentual [%]", legend_title="Métrica")

fig.show()

print(df.sort_values(by="accuracy", ascending=False).reset_index())

print(hps)

   index accuracy F1-score
0      0    97.78    97.72
1      1    97.33    97.34
2      2    97.33    96.76
3      3    62.67    57.71
{0: 'linear', 1: 'poly', 2: 'rbf', 3: 'sigmoid'}


# Training a baseline model

In [30]:
svc_model = SVC(kernel="rbf", shrinking=False)

svc_model.fit(shaped_train_features, train_targets)

SVC(shrinking=False)

# Testing the baseline model

In [35]:
predictions = svc_model.predict(shaped_test_features)

# Metrics

In [37]:
acc = accuracy_score(y_true=test_targets, y_pred=predictions)
accuracy = round(acc*100, 2)

print(f"The accuracy is {accuracy}%")

f1 = f1_score(y_true=test_targets, y_pred=predictions, average="macro")
f1 = round(f1*100, 2)

print(f"The F1-Score is {f1}%")

print(classification_report(y_true=test_targets, y_pred=predicions, digits=4))

conf_matrix = confusion_matrix(y_true=test_targets, y_pred=predicions)
px.imshow(conf_matrix, color_continuous_scale="blues")

The accuracy is 97.42%
The F1-Score is 88.6%


ValueError: Found input variables with inconsistent numbers of samples: [1125, 225]

# Saving the model

In [9]:
with open("models/svc_model", "wb") as f:
    pickle.dump(svc_model, f)