# Support Vector Machines
You should build a machine learning pipeline using a support vector machine model. In particular, you should do the following:
- Load the `mnist` dataset using [Pandas](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html). You can find this dataset in the datasets folder.
- Split the dataset into training and test sets using [Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).
- Conduct data exploration, data preprocessing, and feature engineering if necessary.
- Train and test a support vector machine model using [Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html).
- Check the documentation to identify the most important hyperparameters, attributes, and methods of the model. Use them in practice.

In [1]:
import pandas as pd

# Load the MNIST sample directly from the GitHub-hosted CSV.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/m-mahdavi/teaching/refs/heads/main/datasets/mnist.csv",
)

# Peek at the first five rows to confirm the column layout.
df.head(n=5)

Unnamed: 0,id,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,31953,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,34452,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,60897,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,36953,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1981,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) to get a
# feel for the value ranges of `id`, `class` and the pixel columns.
df.describe()

Unnamed: 0,id,class,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,34415.17925,4.4395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.07675,0.01525,0.013,0.0015,0.0,0.0,0.0,0.0,0.0,0.0
std,20508.890104,2.879655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.616022,0.964495,0.822192,0.094868,0.0,0.0,0.0,0.0,0.0,0.0
min,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16575.75,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,34435.5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,52111.5,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,69998.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,125.0,61.0,52.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

Unnamed: 0,0
id,int64
class,int64
pixel1,int64
pixel2,int64
pixel3,int64
...,...
pixel780,int64
pixel781,int64
pixel782,int64
pixel783,int64


In [4]:
# Count missing values per column (`isna` is the canonical spelling of `isnull`).
df.isna().sum()

Unnamed: 0,0
id,0
class,0
pixel1,0
pixel2,0
pixel3,0
...,...
pixel780,0
pixel781,0
pixel782,0
pixel783,0


In [10]:
# (rows, columns) of the loaded frame.
df.shape

(4000, 786)

In [8]:
from sklearn.model_selection import train_test_split

# Target is the digit label; every column except `id` and `class` is used as a feature.
y = df['class']
X = df.drop(columns=['id', 'class'])

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Report the shape of each part of the split, one per line.
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

X_train shape: (3000, 784)
X_test shape: (1000, 784)
y_train shape: (3000,)
y_test shape: (1000,)


In [12]:
# Summary statistics of the training features only (the test split is left untouched).
X_train.describe()

Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,...,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.102333,0.020333,0.017333,0.002,0.0,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.020414,1.113703,0.949386,0.109545,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,125.0,61.0,52.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Baseline model: standardise the pixel columns, then fit an RBF-kernel SVC.
# All SVC hyperparameters are spelled out explicitly, one per line.
clf = make_pipeline(
    StandardScaler(),
    SVC(
        kernel='rbf',
        C=1.0,                # regularisation parameter
        gamma='scale',        # kernel coefficient for 'rbf', 'poly' and 'sigmoid'
        degree=3,             # per the scikit-learn docs, only read by kernel='poly'
        coef0=0.0,            # per the docs, only significant for 'poly' and 'sigmoid'
        shrinking=True,
        probability=False,
        tol=0.001,
        cache_size=200,
        class_weight=None,
        verbose=False,
        max_iter=-1,
        decision_function_shape='ovr',
        break_ties=False,
        random_state=42,      # per the docs, ignored while probability=False
    ),
)

# Kept as the last expression so the fitted pipeline is displayed by the cell.
clf.fit(X_train, y_train)

In [15]:
from sklearn.metrics import accuracy_score, classification_report

# Score the fitted pipeline on the held-out test split.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy first, then per-class precision / recall / F1.
print(f"Model Accuracy: {accuracy:.4f}", "\nClassification Report:", sep="\n")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.9090

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.97        90
           1       0.95      0.98      0.96       122
           2       0.69      0.92      0.79        92
           3       0.93      0.88      0.91       103
           4       0.93      0.93      0.93       103
           5       0.89      0.90      0.89        86
           6       0.97      0.85      0.91       107
           7       0.98      0.91      0.94        89
           8       0.93      0.88      0.90       114
           9       0.90      0.88      0.89        94

    accuracy                           0.91      1000
   macro avg       0.91      0.91      0.91      1000
weighted avg       0.92      0.91      0.91      1000



In [16]:
# Variant: same scaling + SVC pipeline, but with the sigmoid kernel.
clf2 = make_pipeline(
    StandardScaler(),
    SVC(
        kernel='sigmoid',
        C=1.0,                # regularisation parameter
        gamma='scale',        # kernel coefficient for 'rbf', 'poly' and 'sigmoid'
        coef0=0.0,            # independent term; significant for 'poly' and 'sigmoid'
        degree=3,             # per the scikit-learn docs, only read by kernel='poly'
        shrinking=True,
        probability=False,
        tol=0.001,
        cache_size=200,
        class_weight=None,
        verbose=False,
        max_iter=-1,
        decision_function_shape='ovr',
        break_ties=False,
        random_state=42,      # per the docs, ignored while probability=False
    ),
)
clf2.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred2 = clf2.predict(X_test)
accuracy = accuracy_score(y_test, y_pred2)

print(f"Model Accuracy: {accuracy:.4f}", "\nClassification Report:", sep="\n")
print(classification_report(y_test, y_pred2))

Model Accuracy: 0.9160

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95        90
           1       0.93      0.98      0.95       122
           2       0.84      0.87      0.86        92
           3       0.92      0.91      0.92       103
           4       0.90      0.94      0.92       103
           5       0.85      0.88      0.87        86
           6       0.93      0.92      0.92       107
           7       0.98      0.94      0.96        89
           8       0.95      0.86      0.90       114
           9       0.91      0.88      0.90        94

    accuracy                           0.92      1000
   macro avg       0.92      0.92      0.91      1000
weighted avg       0.92      0.92      0.92      1000



In [17]:
# Variant: same scaling + SVC pipeline, but with the linear kernel.
clf3 = make_pipeline(
    StandardScaler(),
    SVC(
        kernel='linear',
        C=1.0,                # regularisation parameter
        # Per the scikit-learn docs, gamma / degree / coef0 belong to the
        # non-linear kernels; they are passed here with the same values as before.
        gamma='scale',
        degree=3,
        coef0=0.0,
        shrinking=True,
        probability=False,
        tol=0.001,
        cache_size=200,
        class_weight=None,
        verbose=False,
        max_iter=-1,
        decision_function_shape='ovr',
        break_ties=False,
        random_state=42,      # per the docs, ignored while probability=False
    ),
)
clf3.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred3 = clf3.predict(X_test)
accuracy = accuracy_score(y_test, y_pred3)

print(f"Model Accuracy: {accuracy:.4f}", "\nClassification Report:", sep="\n")
print(classification_report(y_test, y_pred3))

Model Accuracy: 0.9020

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        90
           1       0.94      0.98      0.96       122
           2       0.81      0.90      0.86        92
           3       0.86      0.88      0.87       103
           4       0.90      0.93      0.91       103
           5       0.81      0.81      0.81        86
           6       0.94      0.90      0.92       107
           7       0.95      0.89      0.92        89
           8       0.94      0.84      0.89       114
           9       0.88      0.87      0.88        94

    accuracy                           0.90      1000
   macro avg       0.90      0.90      0.90      1000
weighted avg       0.90      0.90      0.90      1000



In [18]:
# Variant: same scaling + SVC pipeline, but with a degree-3 polynomial kernel.
clf4 = make_pipeline(
    StandardScaler(),
    SVC(
        kernel='poly',
        degree=3,             # polynomial degree (only read by kernel='poly')
        coef0=0.0,            # independent term of the polynomial kernel
        gamma='scale',        # kernel coefficient for 'rbf', 'poly' and 'sigmoid'
        C=1.0,                # regularisation parameter
        shrinking=True,
        probability=False,
        tol=0.001,
        cache_size=200,
        class_weight=None,
        verbose=False,
        max_iter=-1,
        decision_function_shape='ovr',
        break_ties=False,
        random_state=42,      # per the docs, ignored while probability=False
    ),
)
clf4.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred4 = clf4.predict(X_test)
accuracy = accuracy_score(y_test, y_pred4)

print(f"Model Accuracy: {accuracy:.4f}", "\nClassification Report:", sep="\n")
print(classification_report(y_test, y_pred4))

Model Accuracy: 0.7890

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.82      0.90        90
           1       0.95      0.96      0.96       122
           2       0.90      0.68      0.78        92
           3       0.95      0.76      0.84       103
           4       0.87      0.74      0.80       103
           5       0.93      0.59      0.72        86
           6       1.00      0.75      0.86       107
           7       0.99      0.79      0.88        89
           8       0.40      0.95      0.56       114
           9       0.82      0.77      0.79        94

    accuracy                           0.79      1000
   macro avg       0.88      0.78      0.81      1000
weighted avg       0.87      0.79      0.81      1000



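The `precomputed` kernel is not used here: it expects a square kernel (Gram) matrix of shape (n_samples, n_samples) as input instead of the raw feature matrix, so fitting it on `X_train` fails with this error: `Precomputed matrix must be a square matrix. Input is a 3000x784 matrix.`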