## Aim

## Theory

## Code

In [26]:
## libraries
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import os, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import time

In [9]:
## dataset
def load_cifar10(path):
    """Read the five training batch files found in ``path``.

    Each file ``data_batch_1`` .. ``data_batch_5`` is unpickled and its
    ``b'data'`` and ``b'labels'`` entries are collected in file order.

    Returns
    -------
    X : float32 array, all ``b'data'`` blocks concatenated along axis 0 and
        divided by 255.0.
    y : integer column vector (shape ``(n, 1)``) with the collected labels.
    """
    images, targets = [], []
    for batch_no in range(1, 6):
        batch_file = os.path.join(path, f"data_batch_{batch_no}")
        # NOTE: pickle executes code embedded in the file; only load dataset
        # files that come from a trusted source.
        with open(batch_file, 'rb') as fh:
            payload = pickle.load(fh, encoding='bytes')
        images.append(payload[b'data'])
        targets.extend(payload[b'labels'])

    stacked = np.concatenate(images)
    X = stacked.astype(np.float32) / 255.0
    y = np.array(targets).reshape(-1, 1)
    return X, y

def load_cifar10_test(path):
    """Read the ``test_batch`` file found in ``path``.

    Returns
    -------
    X : float32 array, the file's ``b'data'`` entry divided by 255.0.
    y : integer column vector (shape ``(n, 1)``) built from ``b'labels'``.
    """
    test_file = os.path.join(path, "test_batch")
    # NOTE: pickle executes code embedded in the file; only load dataset
    # files that come from a trusted source.
    with open(test_file, 'rb') as fh:
        payload = pickle.load(fh, encoding='bytes')

    pixels = payload[b'data'].astype(np.float32) / 255.0
    targets = np.array(payload[b'labels']).reshape(-1, 1)
    return pixels, targets

In [10]:
## ANN 
class ANN:
    """Fully connected classifier: sigmoid hidden layers and a softmax output.

    The network is trained with plain gradient descent on the mean
    cross-entropy loss. ``forward`` caches every layer's activations and
    ``backward`` consumes that cache to apply one weight update.
    """

    def __init__(self, input_dim, hidden_layers, output_dim, lr=0.1):
        """Create the weight matrices.

        Parameters
        ----------
        input_dim : number of input features.
        hidden_layers : iterable with the width of each hidden layer.
        output_dim : number of classes.
        lr : learning rate used by ``backward``.
        """
        self.lr = lr
        self.layers = []
        prev = input_dim
        for h in hidden_layers:
            # Small Gaussian weights, zero biases.
            self.layers.append({
                "W": np.random.randn(prev, h) * 0.01,
                "b": np.zeros((1, h))
            })
            prev = h
        self.out = {
            "W": np.random.randn(prev, output_dim) * 0.01,
            "b": np.zeros((1, output_dim))
        }

    def sigmoid(self, z):
        """Element-wise logistic function."""
        return 1 / (1 + np.exp(-z))

    def sigmoid_deriv(self, a):
        """Derivative of the sigmoid expressed through its output ``a``."""
        return a * (1 - a)

    def softmax(self, z):
        """Row-wise softmax of ``z``; the input array is left untouched."""
        # Subtract the row maximum for numerical stability. A new array is
        # built on purpose: an in-place ``z -= ...`` would modify the caller's
        # logits (and the copy kept in the forward cache).
        shifted = z - np.max(z, axis=1, keepdims=True)
        exp = np.exp(shifted)
        return exp / np.sum(exp, axis=1, keepdims=True)

    def forward(self, X):
        """Propagate ``X`` through the network and return class probabilities.

        Side effect: ``self.cache`` is rebuilt; entry 0 holds the input,
        entry ``k`` the pre-activation ``z`` and activation ``a`` of layer k.
        """
        self.cache = [{'a': X}]
        a = X
        for layer in self.layers:
            z = a @ layer['W'] + layer['b']
            a = self.sigmoid(z)
            self.cache.append({'a': a, 'z': z})
        z_out = a @ self.out['W'] + self.out['b']
        a_out = self.softmax(z_out)
        self.cache.append({'a': a_out, 'z': z_out})
        return a_out

    def backward(self, X, Y, out):
        """Apply one gradient-descent step for the batch seen by ``forward``.

        Parameters
        ----------
        X : the batch passed to the preceding ``forward`` call (only its row
            count is used here; activations come from ``self.cache``).
        Y : one-hot targets with the same shape as ``out``.
        out : probabilities returned by that ``forward`` call.
        """
        m = X.shape[0]
        # Gradient of the mean cross-entropy w.r.t. the output logits.
        dz = (out - Y) / m

        # output layer update
        a_prev = self.cache[-2]['a']
        dW = a_prev.T @ dz
        db = np.sum(dz, axis=0, keepdims=True)
        # Back-propagate through the weights that produced ``out``, i.e.
        # BEFORE they are modified; using the updated matrix would mix two
        # different parameter states into one gradient.
        da = dz @ self.out['W'].T
        self.out['W'] -= self.lr * dW
        self.out['b'] -= self.lr * db

        # hidden layers
        for i in reversed(range(len(self.layers))):
            a_curr = self.cache[i+1]['a']
            a_prev = self.cache[i]['a']
            dz = da * self.sigmoid_deriv(a_curr)
            dW = a_prev.T @ dz
            db = np.sum(dz, axis=0, keepdims=True)
            if i > 0:
                # Needed only by the layer below; skipping it for the first
                # layer avoids a (batch x input_dim) product nobody reads.
                da = dz @ self.layers[i]['W'].T
            self.layers[i]['W'] -= self.lr * dW
            self.layers[i]['b'] -= self.lr * db

    def loss(self, Y, out):
        """Mean cross-entropy between one-hot ``Y`` and probabilities ``out``."""
        # 1e-9 keeps log() finite when a probability is exactly zero.
        return -np.mean(np.sum(Y * np.log(out + 1e-9), axis=1))

    def predict(self, X):
        """Return the index of the most probable class for every row of ``X``."""
        probs = self.forward(X)
        return np.argmax(probs, axis=1)




In [23]:
class ANN_r:
    """Fully connected classifier: ReLU hidden layers and a softmax output.

    The network is trained with plain gradient descent on the mean
    cross-entropy loss. ``forward`` caches every layer's pre-activations and
    activations; ``backward`` consumes that cache to apply one weight update.
    """

    def __init__(self, input_dim, hidden_layers, output_dim, lr=0.1):
        """Create the weight matrices.

        Parameters
        ----------
        input_dim : number of input features.
        hidden_layers : iterable with the width of each hidden layer.
        output_dim : number of classes.
        lr : learning rate used by ``backward``.
        """
        self.lr = lr
        self.layers = []
        prev = input_dim
        for h in hidden_layers:
            self.layers.append({
                "W": np.random.randn(prev, h) * np.sqrt(2.0 / prev),  # He init for ReLU
                "b": np.zeros((1, h))
            })
            prev = h
        self.out = {
            "W": np.random.randn(prev, output_dim) * 0.01,
            "b": np.zeros((1, output_dim))
        }

    # ReLU activation and derivative
    def relu(self, z):
        """Element-wise ``max(0, z)``."""
        return np.maximum(0, z)

    def relu_deriv(self, z):
        """1.0 where the pre-activation ``z`` is positive, 0.0 elsewhere."""
        return (z > 0).astype(float)

    def softmax(self, z):
        """Row-wise softmax of ``z``; the input array is left untouched."""
        # Subtract the row maximum for numerical stability. A new array is
        # built on purpose: an in-place ``z -= ...`` would modify the caller's
        # logits (and the copy kept in the forward cache).
        shifted = z - np.max(z, axis=1, keepdims=True)
        exp = np.exp(shifted)
        return exp / np.sum(exp, axis=1, keepdims=True)

    def forward(self, X):
        """Propagate ``X`` through the network and return class probabilities.

        Side effect: ``self.cache`` is rebuilt; entry 0 holds the input,
        entry ``k`` the pre-activation ``z`` and activation ``a`` of layer k.
        """
        self.cache = [{'a': X}]
        a = X
        for layer in self.layers:
            z = a @ layer['W'] + layer['b']
            a = self.relu(z)
            self.cache.append({'a': a, 'z': z})
        z_out = a @ self.out['W'] + self.out['b']
        a_out = self.softmax(z_out)
        self.cache.append({'a': a_out, 'z': z_out})
        return a_out

    def backward(self, X, Y, out):
        """Apply one gradient-descent step for the batch seen by ``forward``.

        Parameters
        ----------
        X : the batch passed to the preceding ``forward`` call (only its row
            count is used here; activations come from ``self.cache``).
        Y : one-hot targets with the same shape as ``out``.
        out : probabilities returned by that ``forward`` call.
        """
        m = X.shape[0]
        # Gradient of the mean cross-entropy w.r.t. the output logits.
        dz = (out - Y) / m

        # output layer update
        a_prev = self.cache[-2]['a']
        dW = a_prev.T @ dz
        db = np.sum(dz, axis=0, keepdims=True)
        # Back-propagate through the weights that produced ``out``, i.e.
        # BEFORE they are modified; using the updated matrix would mix two
        # different parameter states into one gradient.
        da = dz @ self.out['W'].T
        self.out['W'] -= self.lr * dW
        self.out['b'] -= self.lr * db

        # hidden layers backward
        for i in reversed(range(len(self.layers))):
            z_curr = self.cache[i+1]['z']
            a_prev = self.cache[i]['a']
            dz = da * self.relu_deriv(z_curr)
            dW = a_prev.T @ dz
            db = np.sum(dz, axis=0, keepdims=True)
            if i > 0:
                # Needed only by the layer below; skipping it for the first
                # layer avoids a (batch x input_dim) product nobody reads.
                da = dz @ self.layers[i]['W'].T
            self.layers[i]['W'] -= self.lr * dW
            self.layers[i]['b'] -= self.lr * db

    def loss(self, Y, out):
        """Mean cross-entropy between one-hot ``Y`` and probabilities ``out``."""
        # 1e-9 keeps log() finite when a probability is exactly zero.
        return -np.mean(np.sum(Y * np.log(out + 1e-9), axis=1))

    def predict(self, X):
        """Return the index of the most probable class for every row of ``X``."""
        probs = self.forward(X)
        return np.argmax(probs, axis=1)


In [17]:
def train_model(X_train, Y_train, X_test, Y_test, layers, name, epochs=10, lr=0.1,
                batch_size=None):
    """Train a sigmoid ``ANN`` and checkpoint the weights of the best epoch.

    Parameters
    ----------
    X_train, Y_train : 2-D feature matrix and one-hot targets used for the
        weight updates. The network's input and output sizes are taken from
        their column counts.
    X_test, Y_test : features and one-hot targets scored after every epoch.
    layers : hidden layer widths, passed to ``ANN``.
    name : label used in the log lines and as prefix of the checkpoint file
        ``{name}_best_weights.npz``.
    epochs : number of passes over the training data.
    lr : learning rate.
    batch_size : ``None`` (default) keeps full-batch gradient descent, i.e.
        exactly one weight update per epoch. A positive integer switches to
        mini-batch descent over a fresh random permutation each epoch.

    Returns
    -------
    The ``ANN`` instance with the weights of the LAST epoch (the best epoch
    is only available from the checkpoint file).

    Raises
    ------
    ValueError : if ``batch_size`` is given but smaller than 1.
    """
    n_samples, n_features = X_train.shape
    model = ANN(n_features, layers, Y_train.shape[1], lr)

    if batch_size is None:
        batch_size = n_samples
    elif batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")
    full_batch = batch_size >= n_samples

    # The integer test labels never change, so decode them once.
    y_test_labels = np.argmax(Y_test, axis=1)

    best_acc = 0
    for epoch in range(epochs):
        if full_batch:
            out = model.forward(X_train)
            # Loss of the weights *before* this epoch's update.
            loss = model.loss(Y_train, out)
            model.backward(X_train, Y_train, out)
        else:
            order = np.random.permutation(n_samples)
            loss_sum = 0.0
            for start in range(0, n_samples, batch_size):
                idx = order[start:start + batch_size]
                xb, yb = X_train[idx], Y_train[idx]
                out = model.forward(xb)
                # Weight by batch length so a short last batch is not over-counted.
                loss_sum += model.loss(yb, out) * len(idx)
                model.backward(xb, yb, out)
            loss = loss_sum / n_samples

        # Accuracy is measured with the weights *after* the update(s).
        preds = model.predict(X_test)
        acc = accuracy_score(y_test_labels, preds)
        print(f"{name} | Epoch {epoch+1}/{epochs} | Loss={loss:.4f} | Acc={acc:.4f}")
        if acc > best_acc:
            best_acc = acc
            # model.layers / model.out are Python containers, so NumPy stores
            # them as pickled object arrays; reading the file back requires
            # np.load(..., allow_pickle=True).
            np.savez(f"{name}_best_weights.npz",
                     layers=model.layers, out=model.out)
    print(f"Best accuracy for {name}: {best_acc:.4f}")
    return model



In [24]:
def train_model_r(X_train, Y_train, X_test, Y_test, layers, name, epochs=10, lr=0.1,
                  batch_size=None):
    """Train a ReLU ``ANN_r`` and checkpoint the weights of the best epoch.

    Parameters
    ----------
    X_train, Y_train : 2-D feature matrix and one-hot targets used for the
        weight updates. The network's input and output sizes are taken from
        their column counts.
    X_test, Y_test : features and one-hot targets scored after every epoch.
    layers : hidden layer widths, passed to ``ANN_r``.
    name : label used in the log lines and as prefix of the checkpoint file
        ``{name}_best_weights.npz``.
    epochs : number of passes over the training data.
    lr : learning rate.
    batch_size : ``None`` (default) keeps full-batch gradient descent, i.e.
        exactly one weight update per epoch. A positive integer switches to
        mini-batch descent over a fresh random permutation each epoch.

    Returns
    -------
    The ``ANN_r`` instance with the weights of the LAST epoch (the best epoch
    is only available from the checkpoint file).

    Raises
    ------
    ValueError : if ``batch_size`` is given but smaller than 1.
    """
    n_samples, n_features = X_train.shape
    model = ANN_r(n_features, layers, Y_train.shape[1], lr)

    if batch_size is None:
        batch_size = n_samples
    elif batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")
    full_batch = batch_size >= n_samples

    # The integer test labels never change, so decode them once.
    y_test_labels = np.argmax(Y_test, axis=1)

    best_acc = 0
    for epoch in range(epochs):
        if full_batch:
            out = model.forward(X_train)
            # Loss of the weights *before* this epoch's update.
            loss = model.loss(Y_train, out)
            model.backward(X_train, Y_train, out)
        else:
            order = np.random.permutation(n_samples)
            loss_sum = 0.0
            for start in range(0, n_samples, batch_size):
                idx = order[start:start + batch_size]
                xb, yb = X_train[idx], Y_train[idx]
                out = model.forward(xb)
                # Weight by batch length so a short last batch is not over-counted.
                loss_sum += model.loss(yb, out) * len(idx)
                model.backward(xb, yb, out)
            loss = loss_sum / n_samples

        # Accuracy is measured with the weights *after* the update(s).
        preds = model.predict(X_test)
        acc = accuracy_score(y_test_labels, preds)
        print(f"{name} | Epoch {epoch+1}/{epochs} | Loss={loss:.4f} | Acc={acc:.4f}")
        if acc > best_acc:
            best_acc = acc
            # model.layers / model.out are Python containers, so NumPy stores
            # them as pickled object arrays; reading the file back requires
            # np.load(..., allow_pickle=True).
            np.savez(f"{name}_best_weights.npz",
                     layers=model.layers, out=model.out)
    print(f"Best accuracy for {name}: {best_acc:.4f}")
    return model



In [12]:
# Load the five training batches and the test batch from the local "cifar" directory.
X_train, y_train = load_cifar10("cifar")
X_test, y_test = load_cifar10_test("cifar")

# One-hot encode the label columns as dense arrays. The encoder is fitted on
# the training labels and the same mapping is reused for the test labels.
enc = OneHotEncoder(sparse_output=False)
Y_train = enc.fit_transform(y_train)
Y_test = enc.transform(y_test)



In [22]:
def diagnose_model(model, X_train, Y_train, X_val, Y_val, min_acc=0.7):
    """Print loss/accuracy on two data splits and a coarse fit diagnosis.

    Parameters
    ----------
    model : object exposing ``forward(X)`` (class probabilities) and
        ``loss(Y, out)``.
    X_train, Y_train : training features and one-hot targets.
    X_val, Y_val : validation features and one-hot targets.
    min_acc : accuracy a split must reach to count as acceptable.

    Diagnosis rule
    --------------
    * both accuracies below ``min_acc``            -> "Underfitting"
    * only the validation accuracy below it        -> "Overfitting"
    * otherwise                                    -> "Good fit"

    The previous rule required a training accuracy above 0.9 for
    "Overfitting", so e.g. 0.85 train / 0.50 validation was reported as
    "Good fit".
    """
    # Forward passes
    out_train = model.forward(X_train)
    out_val = model.forward(X_val)

    # Compute losses
    train_loss = model.loss(Y_train, out_train)
    val_loss = model.loss(Y_val, out_val)

    # Compute accuracies: compare predicted and true class indices
    train_acc = np.mean(np.argmax(out_train, axis=1) == np.argmax(Y_train, axis=1))
    val_acc = np.mean(np.argmax(out_val, axis=1) == np.argmax(Y_val, axis=1))

    print(f"Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}")
    print(f"Train acc: {train_acc:.4f}, Val acc: {val_acc:.4f}")

    # Diagnose
    if train_acc < min_acc and val_acc < min_acc:
        print("Diagnosis: Underfitting")
    elif val_acc < min_acc:
        # Training is acceptable but the model does not generalise.
        print("Diagnosis: Overfitting")
    else:
        print("Diagnosis: Good fit")

# Example usage after training.
# NOTE: `model` is not defined by any cell above this one; it is created by the
# `model=train_model(...)` cell further down, so on a fresh kernel that cell
# has to be executed before this call (otherwise this raises NameError).
diagnose_model(model, X_train, Y_train, X_test, Y_test)

Train loss: 2.3032, Val loss: 2.3032
Train acc: 0.1000, Val acc: 0.1000
Diagnosis: Underfitting


In [18]:
# Sigmoid network with one hidden layer of 100 units, trained for a single epoch.
# The result is kept in `model`, which the `diagnose_model(model, ...)` call relies on.
model=train_model(X_train, Y_train, X_test, Y_test, [100], "model_1layer", epochs=1, lr=0.1)

model_1layer | Epoch 1/1 | Loss=2.3038 | Acc=0.1000
Best accuracy for model_1layer: 0.1000


In [25]:
# ReLU network with one hidden layer of 100 units. It gets its own run name:
# train_model_r writes "{name}_best_weights.npz", so reusing "model_1layer"
# would overwrite the checkpoint saved by the sigmoid run.
model_r=train_model_r(X_train, Y_train, X_test, Y_test, [100], "model_1layer_relu", epochs=1, lr=0.1)

model_1layer | Epoch 1/1 | Loss=2.3024 | Acc=0.1081
Best accuracy for model_1layer: 0.1081


In [None]:
# Three hidden layers (100, 50, 50), 80 epochs. Keep the returned network so it
# can be evaluated afterwards instead of being discarded (and echoed as a bare
# object repr by the notebook).
model_3layer = train_model(X_train, Y_train, X_test, Y_test, [100, 50, 50], "model_3layer", epochs=80, lr=0.1)

In [14]:


# Preprocess data: standardise every feature with statistics estimated on the
# training split only, then apply the same transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


def _fit_and_score_mlp(hidden_layer_sizes, label):
    """Fit a scikit-learn MLP baseline and report its test accuracy.

    Parameters
    ----------
    hidden_layer_sizes : tuple of hidden layer widths for ``MLPClassifier``.
    label : text inserted in the printed summary line.

    Returns
    -------
    (fitted classifier, test predictions, test accuracy)
    """
    mlp = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation='logistic',
                        solver='sgd', learning_rate_init=0.1, max_iter=50, random_state=42)

    # perf_counter is a monotonic clock meant for measuring intervals.
    started = time.perf_counter()
    mlp.fit(X_train_scaled, y_train.ravel())  # scikit-learn expects labels as 1D array
    elapsed = time.perf_counter() - started

    preds = mlp.predict(X_test_scaled)
    acc = accuracy_score(y_test.ravel(), preds)
    print(f"Sklearn {label} MLP accuracy: {acc:.4f} | Training time: {elapsed:.1f}s")
    return mlp, preds, acc


# Model 1: Single hidden layer (100 units)
mlp1, preds1, acc1 = _fit_and_score_mlp((100,), "1-layer")

# Model 2: Three hidden layers (100, 50, 50)
mlp3, preds3, acc3 = _fit_and_score_mlp((100, 50, 50), "3-layer")




Sklearn 1-layer MLP accuracy: 0.4147 | Training time: 60.0s
Sklearn 3-layer MLP accuracy: 0.4409 | Training time: 81.3s




## Results