In [19]:
import os
import urllib.request
import tarfile
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from skimage.transform import resize

In [20]:
url = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
filename = 'cifar-10-python.tar.gz'

if not os.path.exists(filename):
    urllib.request.urlretrieve(url, filename)

if not os.path.exists('cifar-10-batches-py'):
    with tarfile.open(filename, 'r:gz') as tar:
        tar.extractall()

def carica_batch(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding = 'bytes')
        data = dict[b'data']
        labels = dict[b'labels']
        return data, labels

X_train, y_train = [], []
for i in range(1, 6):
    data, labels = carica_batch(f'cifar-10-batches-py/data_batch_{i}')
    X_train.append(data)
    y_train += labels

X_train = np.concatenate(X_train).reshape(-1, 3, 32, 32)
y_train = np.array(y_train)

X_test, y_test = carica_batch('cifar-10-batches-py/test_batch')
X_test = np.array(X_test).reshape(-1, 3, 32, 32)
y_test = np.array(y_test)

def reshape_image(images, new_size=(16,16)):
    reshape = []
    for img in images:
        img = np.transpose(img, (1,2,0))
        img_reshape = resize(img, new_size)
        reshape.append(img_reshape.flatten())
    return np.array(reshape)

sottocampione = 5000
subset = 1000
X_train_new = reshape_image(X_train[:sottocampione])
y_train_new = y_train[:sottocampione]
X_test_new = reshape_image(X_train[:subset])
y_test_new = y_test[:subset]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_new)
X_test_scaled = scaler.fit_transform(X_test_new)

In [21]:
regressione_log = LogisticRegression(max_iter = 5000)
regressione_log.fit(X_train_scaled, y_train_new)
y_prediction_log = regressione_log.predict(X_test_scaled)
print("Accuratezza regressione logistica: ", accuracy_score(y_test_new, y_prediction_log))

Accuratezza regressione logistica:  0.096


In [22]:
param_grid_regressione = {
    'C': [0.01, 0.1, 1, 10],  
    'solver': ['lbfgs'],  
    'max_iter': [3000,5000]
}

grid_regressione = GridSearchCV(LogisticRegression(), param_grid_regressione, cv=5, scoring='accuracy')
grid_regressione.fit(X_train_scaled, y_train_new)

print("Parametri migliori Regressione Logistica: ", grid_regressione.best_params_)
print("migliore accuratezza: ", grid_regressione.best_score_)

Parametri migliori Regressione Logistica:  {'C': 0.01, 'max_iter': 3000, 'solver': 'lbfgs'}
migliore accuratezza:  0.3724


In [23]:
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train_scaled, y_train_new)
y_prediction_knn = knn.predict(X_test_scaled)
print("Accuratezza k-NN: ", accuracy_score(y_test_new, y_prediction_knn))

Accuratezza k-NN:  0.116


In [24]:
param_grid_knn = {
    'n_neighbors':  [3,5,7],
    'weights': ['uniform','distance'],
    'metric': ['euclidean', 'manhattan']
}

grid_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv = 5, scoring = 'accuracy')
grid_knn.fit(X_train_scaled, y_train_new)

print("Parametri migliori k-NN: ", grid_knn.best_params_)
print("migliore accuratezza: ", grid_knn.best_score_)

Parametri migliori k-NN:  {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
migliore accuratezza:  0.3318


In [None]:
svm = SVC(kernel='rbf')
svm.fit(X_train_scaled, y_train_new)
y_prediction_svm = svm.predict(X_test_scaled)
print("Accuratezza SVM: ", accuracy_score(y_test_new, y_prediction_svm))

In [26]:
param_grid_svm = {
    'kernel': ['linear','poly','rbf'],
    'C': [0.1, 1, 10], 
}

grid_svm = GridSearchCV(SVC(), param_grid_svm ,cv = 5, scoring = 'accuracy')
grid_svm.fit(X_train_scaled, y_train_new)

print("Parametri migliori SVM: ", grid_svm.best_params_)
print("migliore accuratezza: ", grid_svm.best_score_)

Parametri migliori SVM:  {'C': 10, 'kernel': 'rbf'}
migliore accuratezza:  0.4494000000000001


In [27]:
dec_tree = DecisionTreeClassifier()
dec_tree.fit(X_train_scaled, y_train_new)
y_prediction_dt = dec_tree.predict(X_test_scaled)
print("Accuratezza Decision Tree: ", accuracy_score(y_test_new, y_prediction_dt))

Accuratezza Decision Tree:  0.105


In [28]:
param_grid_dt = {
    'max_depth': [5,10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4,],
    'criterion': ['entropy']
}

grid_dt = GridSearchCV(DecisionTreeClassifier(), param_grid_dt, cv = 5, scoring = 'accuracy')
grid_dt.fit(X_train_scaled, y_train_new)

print("Migliori parametri Decision Tree: ", grid_dt.best_params_)
print("migliore accuratezza: ", grid_dt.best_score_)

Migliori parametri Decision Tree:  {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
migliore accuratezza:  0.2454


In [29]:
def plot_confusion(modello, X_test, y_test, nome_modello):
    y_pred = model.predict(X_test)
    cm = confusion_matrix(X_test)
    disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = range(10))
    disp.plot(cmap = 'Blues', xticks_rotation = 45)
    plt.title(f"Matrice di confusione - {nome_modello}")
    plt.show()

best_regressione = grid_regressione.best_estimator_
plot_confusion(best_regressione, X_test_scaled, y_test_new, "Regressione Logistica")

best_svm = grid_svm.best_estimator_
plot_confusion(best_svm, X_test_scaled, y_test_new, "SVM")

best_knn = grid_knn.best_estimator_
plot_confusion(best_knn, X_test_scaled, y_test_new, "k-NN")

best_dt = grid_dt.best_estimator_
plot_confusion(best_tree, X_test_scaled, y_test_new, "Decision Tree")



NameError: name 'model' is not defined