In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Fetch version 1 of the 'mnist_784' dataset from OpenML. Caching is enabled,
# and as_frame=False requests array output instead of pandas objects.
mnist = fetch_openml(
    'mnist_784',
    version=1,
    cache=True,
    as_frame=False,
)

In [None]:
# Pull the feature matrix and the labels out of the fetched dataset; the labels
# are cast to unsigned 8-bit integers.
X, y = mnist["data"], mnist["target"].astype(np.uint8)

# Randomly hold out one seventh of the samples as the test set. The fixed
# random_state keeps the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/7,
    random_state=42,
)

In [None]:
# Initializing the models.
# The decision tree and the random forest are randomized estimators, so their
# seed is pinned (same value as the train/test split) to make the metrics
# reported further down repeatable after a kernel restart.
svm_model = SVC()
knn_model = KNeighborsClassifier()
nb_model = GaussianNB()
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)

# Training the models: every estimator is fitted on the same, unscaled
# training split so that their scores are directly comparable.
svm_model.fit(X_train, y_train)
knn_model.fit(X_train, y_train)
nb_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

# Defining predictions on the held-out test split (same model order as above).
svm_pred, knn_pred, nb_pred, dt_pred, rf_pred = (
    fitted.predict(X_test)
    for fitted in (svm_model, knn_model, nb_model, dt_model, rf_model)
)

# Defining confusion matrices: true labels first, predicted labels second.
svm_cm, knn_cm, nb_cm, dt_cm, rf_cm = (
    confusion_matrix(y_test, predicted)
    for predicted in (svm_pred, knn_pred, nb_pred, dt_pred, rf_pred)
)

In [None]:
# Score every model's test-set predictions with accuracy, precision, recall and
# F1. The multi-class metrics use the 'weighted' average.
# Each generator is consumed in the order SVM, KNN, Naive Bayes, Decision Tree,
# Random Forest, matching the names on the left-hand side.

svm_accuracy, knn_accuracy, nb_accuracy, dt_accuracy, rf_accuracy = (
    accuracy_score(y_test, predicted)
    for predicted in (svm_pred, knn_pred, nb_pred, dt_pred, rf_pred)
)

svm_precision, knn_precision, nb_precision, dt_precision, rf_precision = (
    precision_score(y_test, predicted, average='weighted')
    for predicted in (svm_pred, knn_pred, nb_pred, dt_pred, rf_pred)
)

svm_recall, knn_recall, nb_recall, dt_recall, rf_recall = (
    recall_score(y_test, predicted, average='weighted')
    for predicted in (svm_pred, knn_pred, nb_pred, dt_pred, rf_pred)
)

svm_f1, knn_f1, nb_f1, dt_f1, rf_f1 = (
    f1_score(y_test, predicted, average='weighted')
    for predicted in (svm_pred, knn_pred, nb_pred, dt_pred, rf_pred)
)

In [None]:
# Report the four metrics for each model. Every entry pairs the caption with
# its value; a None entry stands for the blank separator line between metrics.

metric_report = [
    ("Support Vector Machines Accuracy:", svm_accuracy),
    ("K-Nearest Neighbors Accuracy:", knn_accuracy),
    ("Naive Bayes Accuracy:", nb_accuracy),
    ("Decision Tree Accuracy:", dt_accuracy),
    ("Random Forest Classifier Accuracy:", rf_accuracy),
    None,
    ("Support Vector Machines Precision:", svm_precision),
    ("K-Nearest Neighbors Precision:", knn_precision),
    ("Naive Bayes Precision:", nb_precision),
    ("Decision Tree Precision:", dt_precision),
    ("Random Forest Classifier Precision:", rf_precision),
    None,
    ("Support Vector Machines Recall:", svm_recall),
    ("K-Nearest Neighbors Recall:", knn_recall),
    ("Naive Bayes Recall:", nb_recall),
    ("Decision Tree Recall:", dt_recall),
    ("Random Forest Classifier Recall:", rf_recall),
    None,
    ("Support Vector Machines F1-score:", svm_f1),
    ("K-Nearest Neighbors F1-score:", knn_f1),
    ("Naive Bayes F1-score:", nb_f1),
    ("Decision Tree F1-score:", dt_f1),
    ("Random Forest Classifier F1-score:", rf_f1),
]

for report_entry in metric_report:
    if report_entry is None:
        print("")
    else:
        print(*report_entry)

In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted transformation is then applied to both splits.

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Support Vector Machines, second run: same default estimator, scaled inputs.
# The scores are kept under separate names so they can be compared with the
# unscaled run.

svm_model_with_scaler = SVC().fit(X_train_scaled, y_train)
svm_pred_with_scaler = svm_model_with_scaler.predict(X_test_scaled)

(
    svm_model_with_scaler_accuracy,
    svm_model_with_scaler_precision,
    svm_model_with_scaler_recall,
) = (
    accuracy_score(y_test, svm_pred_with_scaler),
    precision_score(y_test, svm_pred_with_scaler, average='weighted'),
    recall_score(y_test, svm_pred_with_scaler, average='weighted'),
)

# K-Nearest Neighbors, second run: same default estimator, scaled inputs.

knn_model_with_scaler = KNeighborsClassifier().fit(X_train_scaled, y_train)
knn_pred_with_scaler = knn_model_with_scaler.predict(X_test_scaled)

(
    knn_model_with_scaler_accuracy,
    knn_model_with_scaler_precision,
    knn_model_with_scaler_recall,
) = (
    accuracy_score(y_test, knn_pred_with_scaler),
    precision_score(y_test, knn_pred_with_scaler, average='weighted'),
    recall_score(y_test, knn_pred_with_scaler, average='weighted'),
)

In [None]:
# Side-by-side report of the scaled and unscaled scores for SVM and KNN.
# Each entry pairs a caption with its value; a None entry stands for the blank
# line that separates two comparisons.
scaling_comparison = [
    ("Support Vector Machines Accuracy, with scaling:", svm_model_with_scaler_accuracy),
    ("Support Vector Machines Accuracy, without scaling:", svm_accuracy),
    None,
    ("K-Nearest Neighbors Accuracy, with scaling:", knn_model_with_scaler_accuracy),
    ("K-Nearest Neighbors Accuracy, without scaling:", knn_accuracy),
    None,
    ("Support Vector Machines Precision, with scaling:", svm_model_with_scaler_precision),
    ("Support Vector Machines Precision, without scaling:", svm_precision),
    None,
    ("K-Nearest Neighbors Precision, with scaling:", knn_model_with_scaler_precision),
    ("K-Nearest Neighbors Precision, without scaling:", knn_precision),
    None,
    ("Support Vector Machines Recall, with scaling:", svm_model_with_scaler_recall),
    ("Support Vector Machines Recall, without scaling:", svm_recall),
    None,
    ("K-Nearest Neighbors Recall, with scaling:", knn_model_with_scaler_recall),
    ("K-Nearest Neighbors Recall, without scaling:", knn_recall),
]

for comparison_entry in scaling_comparison:
    if comparison_entry is None:
        print("")
    else:
        print(*comparison_entry)

In [None]:
# Define hyperparameter grid for SVM.
# 'scale' is SVC's default gamma; it is listed next to the explicit values so
# that the untuned configuration (rbf kernel, C=1, gamma='scale') is itself a
# candidate. Without it the search could only choose among settings that all
# differ from the baseline and might end up worse than no tuning at all.
hyper_param_grid = [
    {'kernel': ['rbf', 'poly'], 'gamma': ['scale', 1, 2], 'C': [0.5, 1, 1.5]}
]

# Perform grid search with 3-fold cross-validation for SVM using svm_model,
# running on the scaled training data with up to 3 parallel jobs.
gs_svm = GridSearchCV(svm_model, hyper_param_grid, cv = 3, n_jobs = 3)
gs_svm.fit(X_train_scaled, y_train)

# Show which combination won, so the accuracy below can be attributed to it.
print("Best hyperparameters:", gs_svm.best_params_)

# Predict using the best model found by grid search
y_pred_gs_svm = gs_svm.predict(X_test_scaled)

# Calculate accuracy after tuning hyperparameters
accuracy_gs_svm = accuracy_score(y_test, y_pred_gs_svm)
print("Accuracy after tuning hyperparameters:", accuracy_gs_svm)