In [62]:
import pandas as pd
import numpy as np
from sklearn.metrics import  accuracy_score , f1_score
from time import time

In [63]:
def read_data(trainfile='MNIST_train.csv', validationfile='MNIST_validation.csv'):
    """Load the training and validation CSVs as NumPy feature/label arrays.

    Every column of the training file except ``label`` and ``even`` is used
    as a feature. The same column list (in the training file's order) is then
    applied to the validation file, so both matrices share one column layout
    even if the validation CSV stores its columns in a different order.

    Parameters
    ----------
    trainfile : str
        Path of the training CSV. Must contain a ``label`` column.
    validationfile : str
        Path of the validation CSV. Must contain ``label`` and every feature
        column found in the training file (pandas raises ``KeyError``
        otherwise).

    Returns
    -------
    Xtrain : numpy.ndarray of float, shape (n_train, n_features)
    ytrain : numpy.ndarray of int, shape (n_train,)
    Xval : numpy.ndarray of float, shape (n_val, n_features)
    yval : numpy.ndarray of int, shape (n_val,)
    """
    # Load CSVs
    dftrain = pd.read_csv(trainfile)
    dfval = pd.read_csv(validationfile)

    # Feature columns = everything that is not a target-like column.
    featurecols = [col for col in dftrain.columns if col not in ['label', 'even']]

    targetcol = 'label'   # multiclass target

    # Extract X and y
    Xtrain = dftrain[featurecols].values.astype(float)
    ytrain = dftrain[targetcol].values.astype(int)

    Xval = dfval[featurecols].values.astype(float)
    yval = dfval[targetcol].values.astype(int)

    return Xtrain, ytrain, Xval, yval

SVM algorithm

In [64]:
class MulticlassSVM:
    """One-vs-rest linear SVM trained with per-sample (sub)gradient descent.

    One binary classifier ``sign(w·x - b)`` is trained per distinct label.
    Row ``i`` of ``W`` / entry ``i`` of ``b`` belongs to ``classes[i]``, so the
    labels do not have to be the contiguous integers ``0..K-1``.

    Parameters
    ----------
    learning_rate : float
        Step size of every SGD update.
    lambda_p : float
        L2 regularisation strength applied to the weight vector.
    n_iters : int
        Number of full passes over the training set per binary classifier.
    """

    def __init__(self, learning_rate=0.001, lambda_p=0.01, n_iters=1000):
        self.lr = learning_rate
        self.lambda_p = lambda_p
        self.n_iters = n_iters
        self.classes = None  # sorted unique labels seen by fit, shape: (K,)
        self.W = None     # shape: (K, D)
        self.b = None     # shape: (K,)

    def _binary_svm_update(self, X, y, w, b):
        """
        Performs SGD updates for ONE binary classifier.

        X: (N, D)
        y: (N,) labels in {-1, +1}
        w: (D,) weight vector; updated in place and also returned
        b: scalar bias

        Samples are visited in their stored order (no shuffling), so the
        result is deterministic for a given input.

        Returns the trained ``(w, b)`` pair.
        """
        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                # Margin constraint of the hinge loss: y * (w·x - b) >= 1
                condition = y[idx] * (np.dot(x_i, w) - b) >= 1

                if condition:
                    # only regularization affects gradient
                    w -= self.lr * (self.lambda_p * w)
                else:
                    # hinge loss + regularization
                    w -= self.lr * (self.lambda_p * w - y[idx] * x_i)
                    b -= self.lr * y[idx]

        return w, b

    def fit(self, X, y):
        """
        Train K binary classifiers (one-vs-rest) using the above SGD procedure.

        X: (N, D) feature matrix
        y: (N,) class labels (any values ``np.unique`` can sort)
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        _, D = X.shape
        self.classes = np.unique(y)
        K = len(self.classes)

        # initialize parameters
        self.W = np.zeros((K, D))
        self.b = np.zeros(K)

        # Train each class separately. `row` is the position inside W / b,
        # `k` is the actual label; indexing W with the label itself would
        # break as soon as the labels are not exactly 0..K-1.
        for row, k in enumerate(self.classes):
            print(f"Training class {k} vs rest...")

            # Convert labels to +1 (class k) and -1 (all others)
            y_binary = np.where(y == k, 1, -1)

            # Work on copies so the update routine cannot alias self.W / self.b
            w_k = self.W[row].copy()
            b_k = self.b[row].copy()

            # Train binary classifier
            w_k, b_k = self._binary_svm_update(X, y_binary, w_k, b_k)

            # Store results
            self.W[row] = w_k
            self.b[row] = b_k

    def predict(self, X):
        """Return, for every row of X, the label whose classifier scores highest."""
        X = np.asarray(X, dtype=float)
        # Compute scores for each class: (N, K)
        scores = X.dot(self.W.T) - self.b
        # Map the winning row index back to the original label.
        return self.classes[np.argmax(scores, axis=1)]

In [65]:
# Read the train / validation split from the two CSV files.
Xtrain, ytrain, Xval, yval = read_data(
    trainfile='MNIST_train.csv',
    validationfile='MNIST_validation.csv',
)

# Rescale both feature matrices by 1/255 before training.
Xtrain, Xval = Xtrain / 255.0, Xval / 255.0

# Each of the one-vs-rest classifiers runs its own n_iters passes over the
# data, so the pass count is kept small here.
svm = MulticlassSVM(learning_rate=0.0015, lambda_p=0.01, n_iters=5)

# Wall-clock time of the fit call only.
start_time = time()
svm.fit(Xtrain, ytrain)
end_time = time()
training_time = end_time - start_time

# Score on the held-out split.
preds = svm.predict(Xval)
acc = (preds == yval).mean()
f1 = f1_score(y_true=yval, y_pred=preds, average='macro')

print("Validation Accuracy:", acc)
print("Macro F1-score:", f1)
print(
    "Training Time: {:.2f} seconds ({:.2f} minutes)".format(
        training_time, training_time / 60
    )
)

Training class 0 vs rest...
Training class 1 vs rest...
Training class 2 vs rest...
Training class 3 vs rest...
Training class 4 vs rest...
Training class 5 vs rest...
Training class 6 vs rest...
Training class 7 vs rest...
Training class 8 vs rest...
Training class 9 vs rest...
Validation Accuracy: 0.8931572629051621
Macro F1-score: 0.8906909496517933
Training Time: 6.67 seconds (0.11 minutes)


Logistic Regression algorithm

In [66]:
class LogisticRegressionMultiClass:
    """One-vs-rest logistic regression trained with mini-batch SGD.

    Features are standardised with the training mean / std before both
    fitting and predicting. One binary sigmoid classifier is trained per
    distinct label; row ``i`` of ``weights`` belongs to ``classes[i]``.

    Parameters
    ----------
    lr : float
        SGD step size.
    n_epochs : int
        Number of shuffled passes over the data per binary classifier.
    mini_batch_size : int
        Number of samples per gradient step.
    random_state : int or None
        Seed for the per-epoch shuffling. ``None`` (the default) keeps the
        previous behaviour of drawing from NumPy's global random state.
    """

    def __init__(self, lr=0.2, n_epochs=40, mini_batch_size=256, random_state=None):
        self.lr = lr
        self.n_epochs = n_epochs
        self.mini_batch_size = mini_batch_size
        self.random_state = random_state
        self.mean = None      # per-feature training mean, set by standardize(train=True)
        self.std = None       # per-feature training std (+1e-8), set by standardize(train=True)
        self.weights = None   # shape: (n_classes, n_features)
        self.bias = None      # shape: (n_classes,)
        self.classes = None   # sorted unique labels seen by fit

    def sigmoid(self, z):
        """Logistic function; the input is clipped to [-50, 50] to avoid overflow in exp."""
        z = np.clip(z, -50, 50)
        return 1 / (1 + np.exp(-z))

    def standardize(self, X, train=True):
        """Return ``(X - mean) / std``.

        With ``train=True`` the statistics are (re)computed from ``X`` and
        stored; with ``train=False`` the stored ones are reused. A small
        constant is added to the std so constant columns do not divide by zero.
        """
        if train:
            self.mean = X.mean(axis=0)
            self.std = X.std(axis=0) + 1e-8
        return (X - self.mean) / self.std

    def fit(self, X, y):
        """Fit one binary classifier per distinct label in ``y``.

        X: (n_samples, n_features) feature matrix
        y: (n_samples,) class labels
        """
        # STANDARDIZE ONCE
        X = self.standardize(np.asarray(X, dtype=float), train=True)
        y = np.asarray(y)
        n_samples, n_features = X.shape

        # Sorted distinct labels; their position is the row index in weights/bias
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        # Parameters for each classifier (one-vs-rest)
        self.weights = np.zeros((n_classes, n_features))
        self.bias = np.zeros(n_classes)

        # Shuffle source: a private seeded generator when random_state is
        # given, otherwise the module-level global state (original behaviour).
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # Train one classifier per label
        for idx, c in enumerate(self.classes):
            print(f"Training class {c} vs rest")

            y_binary = (y == c).astype(int)

            w = np.zeros(n_features)
            b = 0.0

            for epoch in range(self.n_epochs):

                # Shuffle indices
                indices = np.arange(n_samples)
                rng.shuffle(indices)
                X_shuffled = X[indices]
                y_shuffled = y_binary[indices]

                # Mini-batch SGD
                for i in range(0, n_samples, self.mini_batch_size):
                    X_batch = X_shuffled[i:i+self.mini_batch_size]
                    y_batch = y_shuffled[i:i+self.mini_batch_size]

                    z = np.dot(X_batch, w) + b
                    y_pred = self.sigmoid(z)

                    # Gradient of the mean binary cross-entropy
                    dw = np.dot(X_batch.T, (y_pred - y_batch)) / len(y_batch)
                    db = np.sum(y_pred - y_batch) / len(y_batch)

                    w -= self.lr * dw
                    b -= self.lr * db

            self.weights[idx] = w
            self.bias[idx] = b

    def predict(self, X):
        """Return the label with the highest one-vs-rest score for each row of X."""
        X = self.standardize(np.asarray(X, dtype=float), train=False)
        logits = np.dot(X, self.weights.T) + self.bias
        # The sigmoid is monotonic, so the arg-max over logits picks the same
        # class as the arg-max over probabilities, without the artificial ties
        # that appear once several clipped sigmoids saturate to exactly 1.0.
        # The index is mapped back to the original label.
        return self.classes[np.argmax(logits, axis=1)]

In [67]:
# Re-read the raw split; the model standardises its input itself.
Xtrain, ytrain, Xval, yval = read_data(
    trainfile='MNIST_train.csv',
    validationfile='MNIST_validation.csv',
)

# One-vs-rest logistic regression with mini-batch SGD.
model = LogisticRegressionMultiClass(
    lr=0.20,
    n_epochs=40,
    mini_batch_size=256,
)

# Wall-clock time of the fit call only.
start_time = time()
model.fit(Xtrain, ytrain)
end_time = time()
training_time = end_time - start_time

# Score on the held-out split.
ypred = model.predict(Xval)
acc = accuracy_score(y_true=yval, y_pred=ypred)
f1 = f1_score(y_true=yval, y_pred=ypred, average='macro')

print("Validation Accuracy =", acc)
print("Macro F1-score =", f1)
print(
    "Training Time: {:.2f} seconds ({:.2f} minutes)".format(
        training_time, training_time / 60
    )
)

Training class 0 vs rest
Training class 1 vs rest
Training class 2 vs rest
Training class 3 vs rest
Training class 4 vs rest
Training class 5 vs rest
Training class 6 vs rest
Training class 7 vs rest
Training class 8 vs rest
Training class 9 vs rest
Validation Accuracy = 0.8883553421368547
Macro F1-score = 0.8868149537588608
Training Time: 54.21 seconds (0.90 minutes)


KNN algorithm

In [68]:
class KNN:
    """Distance-weighted k-nearest-neighbours classifier using cosine distance.

    Samples are divided by 255 and L2-normalised, so the dot product of two
    samples is their cosine similarity. Each of the k nearest training samples
    votes for its label with weight ``1 / (distance + 1e-6)``.

    Parameters
    ----------
    k : int
        Number of neighbours that vote. Values larger than the training set
        size are treated as "use every training sample".
    batch_size : int
        Number of query rows whose similarity matrix is built at once.
    """

    def __init__(self, k=5, batch_size=500):
        self.k = k
        self.batch_size = batch_size

    def _normalize(self, X):
        """Return X as float32, divided by 255 and with unit-length rows."""
        # Scale by 1/255
        X = np.asarray(X).astype(np.float32) / 255.0
        # L2 normalize each sample (very important for cosine distance);
        # the epsilon keeps all-zero rows from dividing by zero.
        norms = np.linalg.norm(X, axis=1, keepdims=True) + 1e-6
        return X / norms

    def fit(self, X, y):
        """Store the normalised training samples and their labels (lazy learner)."""
        self.X_train = self._normalize(X)
        self.y_train = np.asarray(y)
        # Encode labels as 0..n_classes-1 so the weighted vote (np.bincount)
        # also works for negative, non-contiguous or non-integer labels.
        self.classes_, codes = np.unique(self.y_train, return_inverse=True)
        self._y_codes = codes.reshape(-1)

    def predict(self, X):
        """Return the predicted label for every row of X.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or the model was fitted on no samples.
        """
        if self.k < 1:
            raise ValueError("k must be at least 1")
        n_train = self.X_train.shape[0]
        if n_train == 0:
            raise ValueError("KNN was fitted on an empty training set")

        X = self._normalize(X)
        n_test = X.shape[0]

        # Never ask for more neighbours than exist, and keep the partition
        # pivot inside the valid index range (argpartition needs kth < n_train).
        k = min(self.k, n_train)
        kth = min(k, n_train - 1)

        pred_codes = np.zeros(n_test, dtype=int)

        for i in range(0, n_test, self.batch_size):
            X_batch = X[i:i+self.batch_size]

            # Cosine similarity = dot(x, y) for unit-length rows
            sim = np.dot(X_batch, self.X_train.T)

            # Convert similarity to distance
            dists = 1 - sim   # smaller = closer

            # Find top-k nearest neighbors (unordered within the k)
            k_idx = np.argpartition(dists, kth, axis=1)[:, :k]

            k_codes = self._y_codes[k_idx]
            k_dists = dists[np.arange(dists.shape[0])[:, None], k_idx]

            # Weight = 1 / distance (closer neighbor gets more power).
            # Rounding can push a similarity marginally above 1; clamp the
            # distance at 0 so a weight can never blow up or turn negative.
            weights = 1 / (np.maximum(k_dists, 0) + 1e-6)

            # Weighted vote
            batch_pred = [
                np.bincount(k_codes[row], weights=weights[row]).argmax()
                for row in range(k_codes.shape[0])
            ]

            pred_codes[i:i+self.batch_size] = batch_pred

        # Decode the winning class positions back to the original labels.
        return self.classes_[pred_codes]

In [69]:
# Re-read the raw split; KNN rescales and normalises its input itself.
Xtrain, ytrain, Xval, yval = read_data(
    trainfile='MNIST_train.csv',
    validationfile='MNIST_validation.csv',
)

model = KNN(k=3)

# KNN is a lazy learner: fit only stores the normalised training data, so the
# time measured here does not include the neighbour search done in predict.
start_time = time()
model.fit(Xtrain, ytrain)
end_time = time()
training_time = end_time - start_time

# Score on the held-out split.
ypred = model.predict(Xval)
acc = accuracy_score(y_true=yval, y_pred=ypred)
f1 = f1_score(y_true=yval, y_pred=ypred, average='macro')

print("Accuracy =", acc)
print("Macro F1-score =", f1)
print("Training Time: {:.2f} seconds".format(training_time))

Accuracy = 0.9575830332132853
Macro F1-score = 0.9570664232264271
Training Time: 0.08 seconds
