## Aim

## Theory

## Code

In [13]:
import numpy as np
import pickle
from sklearn.preprocessing import OneHotEncoder

# --- Load CIFAR10 ---
def load_cifar_batch(path):
    with open(path, 'rb') as f:
        batch = pickle.load(f, encoding='bytes')
        X = batch[b'data'].astype(np.float32) / 255.0
        y = np.array(batch[b'labels']).reshape(-1, 1)
    return X, y

X_train, y_train = load_cifar_batch('cifar/data_batch_1')

enc = OneHotEncoder(sparse_output=False)
Y_train = enc.fit_transform(y_train)

# --- Network ---
class NeuralNet:
    def __init__(self, input_size, hidden_layers, hidden_nodes, output_size, lr=0.1):
        self.lr = lr
        self.layers = []
        prev = input_size
        for _ in range(hidden_layers):
            self.layers.append({
                'W': np.random.randn(prev, hidden_nodes) * 0.01,
                'b': np.zeros((1, hidden_nodes))
            })
            prev = hidden_nodes
        self.out = {
            'W': np.random.randn(prev, output_size) * 0.01,
            'b': np.zeros((1, output_size))
        }

    def relu(self, z): return np.maximum(0, z)
    def relu_deriv(self, z): return (z > 0).astype(float)

    def softmax(self, z):
        z -= np.max(z, axis=1, keepdims=True)
        exp = np.exp(z)
        return exp / np.sum(exp, axis=1, keepdims=True)

    def forward(self, X):
        a = X
        self.cache = [{'a': a, 'z': None}]  # store input as layer 0
        for layer in self.layers:
            z = a @ layer['W'] + layer['b']
            a = self.relu(z)
            self.cache.append({'a': a, 'z': z})
        z_out = a @ self.out['W'] + self.out['b']
        a_out = self.softmax(z_out)
        self.cache.append({'a': a_out, 'z': z_out})
        return a_out

    def backward(self, X, Y, out):
        m = X.shape[0]
        dz = (out - Y) / m

        # output layer gradients
        a_prev = self.cache[-2]['a']
        dW = a_prev.T @ dz
        db = np.sum(dz, axis=0, keepdims=True)
        self.out['W'] -= self.lr * dW
        self.out['b'] -= self.lr * db
        da = dz @ self.out['W'].T

        # hidden layers
        for i in reversed(range(len(self.layers))):
            z_curr = self.cache[i+1]['z']
            a_prev = self.cache[i]['a']
            dz = da * self.relu_deriv(z_curr)
            dW = a_prev.T @ dz
            db = np.sum(dz, axis=0, keepdims=True)
            self.layers[i]['W'] -= self.lr * dW
            self.layers[i]['b'] -= self.lr * db
            da = dz @ self.layers[i]['W'].T

    def loss(self, Y, out):
        return -np.mean(np.sum(Y * np.log(out + 1e-9), axis=1))

# --- Train ---
def train_network(num_layers, num_nodes):
    model = NeuralNet(3072, num_layers, num_nodes, 10, lr=0.1)
    for epoch in range(100):
        out = model.forward(X_train)
        loss = model.loss(Y_train, out)
        model.backward(X_train, Y_train, out)
        if epoch%10==0:
            print(f"Epoch {epoch+1}: loss={loss:.4f}")
    preds = np.argmax(model.forward(X_train), axis=1)
    truth = np.argmax(Y_train, axis=1)
    acc = np.mean(preds == truth)
    print("Accuracy:", acc)

# example
train_network(num_layers=2, num_nodes=128)


Epoch 1: loss=2.3027
Epoch 11: loss=2.3024
Epoch 21: loss=2.3021
Epoch 31: loss=2.3018
Epoch 41: loss=2.3015
Epoch 51: loss=2.3011
Epoch 61: loss=2.3006
Epoch 71: loss=2.2999
Epoch 81: loss=2.2989
Epoch 91: loss=2.2973
Accuracy: 0.1245


In [15]:
import numpy as np
import pickle
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

# ---------- Load CIFAR10 ----------
def load_cifar10(path):
    data, labels = [], []
    for i in range(1, 6):
        with open(os.path.join(path, f"data_batch_{i}"), 'rb') as f:
            batch = pickle.load(f, encoding='bytes')
            data.append(batch[b'data'])
            labels += batch[b'labels']
    X = np.concatenate(data).astype(np.float32) / 255.0
    y = np.array(labels).reshape(-1, 1)
    return X, y

def load_cifar10_test(path):
    with open(os.path.join(path, "test_batch"), 'rb') as f:
        batch = pickle.load(f, encoding='bytes')
        X = batch[b'data'].astype(np.float32) / 255.0
        y = np.array(batch[b'labels']).reshape(-1, 1)
    return X, y

# ---------- Network ----------
class ANN:
    def __init__(self, input_dim, hidden_layers, output_dim, lr=0.1):
        self.lr = lr
        self.layers = []
        prev = input_dim
        for h in hidden_layers:
            self.layers.append({
                "W": np.random.randn(prev, h) * 0.01,
                "b": np.zeros((1, h))
            })
            prev = h
        self.out = {
            "W": np.random.randn(prev, output_dim) * 0.01,
            "b": np.zeros((1, output_dim))
        }

    def sigmoid(self, z): 
        return 1 / (1 + np.exp(-z))

    def sigmoid_deriv(self, a): 
        return a * (1 - a)

    def softmax(self, z):
        z -= np.max(z, axis=1, keepdims=True)
        exp = np.exp(z)
        return exp / np.sum(exp, axis=1, keepdims=True)

    def forward(self, X):
        self.cache = [{'a': X}]
        a = X
        for layer in self.layers:
            z = a @ layer['W'] + layer['b']
            a = self.sigmoid(z)
            self.cache.append({'a': a, 'z': z})
        z_out = a @ self.out['W'] + self.out['b']
        a_out = self.softmax(z_out)
        self.cache.append({'a': a_out, 'z': z_out})
        return a_out

    def backward(self, X, Y, out):
        m = X.shape[0]
        dz = (out - Y) / m

        # output layer update
        a_prev = self.cache[-2]['a']
        dW = a_prev.T @ dz
        db = np.sum(dz, axis=0, keepdims=True)
        self.out['W'] -= self.lr * dW
        self.out['b'] -= self.lr * db
        da = dz @ self.out['W'].T

        # hidden layers
        for i in reversed(range(len(self.layers))):
            a_curr = self.cache[i+1]['a']
            a_prev = self.cache[i]['a']
            dz = da * self.sigmoid_deriv(a_curr)
            dW = a_prev.T @ dz
            db = np.sum(dz, axis=0, keepdims=True)
            self.layers[i]['W'] -= self.lr * dW
            self.layers[i]['b'] -= self.lr * db
            da = dz @ self.layers[i]['W'].T

    def loss(self, Y, out):
        return -np.mean(np.sum(Y * np.log(out + 1e-9), axis=1))

    def predict(self, X):
        probs = self.forward(X)
        return np.argmax(probs, axis=1)

# ---------- Training ----------
def train_model(X_train, Y_train, X_test, Y_test, layers, name, epochs=10, lr=0.1):
    model = ANN(3072, layers, 10, lr)
    best_acc = 0
    for epoch in range(epochs):
        out = model.forward(X_train)
        loss = model.loss(Y_train, out)
        model.backward(X_train, Y_train, out)
        preds = model.predict(X_test)
        acc = accuracy_score(np.argmax(Y_test, axis=1), preds)
        print(f"{name} | Epoch {epoch+1}/{epochs} | Loss={loss:.4f} | Acc={acc:.4f}")
        if acc > best_acc:
            best_acc = acc
            np.savez(f"{name}_best_weights.npz",
                     layers=model.layers, out=model.out)
    print(f"Best accuracy for {name}: {best_acc:.4f}")

# ---------- Main ----------
X_train, y_train = load_cifar10("cifar")
X_test, y_test = load_cifar10_test("cifar")

enc = OneHotEncoder(sparse_output=False)
Y_train = enc.fit_transform(y_train)
Y_test = enc.transform(y_test)

# Train both architectures
train_model(X_train, Y_train, X_test, Y_test, [100], "model_1layer", epochs=10, lr=0.1)
train_model(X_train, Y_train, X_test, Y_test, [100, 50, 50], "model_3layer", epochs=10, lr=0.1)


model_1layer | Epoch 1/10 | Loss=2.3033 | Acc=0.0999
model_1layer | Epoch 2/10 | Loss=2.3031 | Acc=0.0993
model_1layer | Epoch 3/10 | Loss=2.3029 | Acc=0.0983
model_1layer | Epoch 4/10 | Loss=2.3027 | Acc=0.1104
model_1layer | Epoch 5/10 | Loss=2.3026 | Acc=0.1209
model_1layer | Epoch 6/10 | Loss=2.3024 | Acc=0.1249
model_1layer | Epoch 7/10 | Loss=2.3023 | Acc=0.1232
model_1layer | Epoch 8/10 | Loss=2.3022 | Acc=0.1215
model_1layer | Epoch 9/10 | Loss=2.3020 | Acc=0.1209
model_1layer | Epoch 10/10 | Loss=2.3019 | Acc=0.1198
Best accuracy for model_1layer: 0.1249
model_3layer | Epoch 1/10 | Loss=2.3030 | Acc=0.1000
model_3layer | Epoch 2/10 | Loss=2.3029 | Acc=0.1000
model_3layer | Epoch 3/10 | Loss=2.3028 | Acc=0.1000
model_3layer | Epoch 4/10 | Loss=2.3027 | Acc=0.1000
model_3layer | Epoch 5/10 | Loss=2.3027 | Acc=0.1000
model_3layer | Epoch 6/10 | Loss=2.3027 | Acc=0.1000
model_3layer | Epoch 7/10 | Loss=2.3027 | Acc=0.1000
model_3layer | Epoch 8/10 | Loss=2.3026 | Acc=0.1000
model_

In [17]:
def train_model(X_train, Y_train, X_test, Y_test, layers, name, epochs=10, lr=0.1):
    model = ANN(3072, layers, 10, lr)
    best_acc = 0
    
    train_losses, test_losses = [], []
    train_accs, test_accs = [], []
    
    for epoch in range(epochs):
        # Forward and backward on training set
        out_train = model.forward(X_train)
        loss_train = model.loss(Y_train, out_train)
        model.backward(X_train, Y_train, out_train)
        
        # Training accuracy
        preds_train = model.predict(X_train)
        acc_train = accuracy_score(np.argmax(Y_train, axis=1), preds_train)
        
        # Test accuracy
        out_test = model.forward(X_test)
        loss_test = model.loss(Y_test, out_test)
        preds_test = np.argmax(out_test, axis=1)
        acc_test = accuracy_score(np.argmax(Y_test, axis=1), preds_test)
        
        # Store metrics
        train_losses.append(loss_train)
        test_losses.append(loss_test)
        train_accs.append(acc_train)
        test_accs.append(acc_test)
        
        print(f"{name} | Epoch {epoch+1}/{epochs} | "
              f"Train Loss={loss_train:.4f}, Train Acc={acc_train:.4f} | "
              f"Test Loss={loss_test:.4f}, Test Acc={acc_test:.4f}")
        
        # Save best model
        if acc_test > best_acc:
            best_acc = acc_test
            np.savez(f"{name}_best_weights.npz",
                     layers=model.layers, out=model.out)

    print(f"Best accuracy for {name}: {best_acc:.4f}")
    
    # Diagnose overfitting/underfitting
    if train_accs[-1] < 0.6 and test_accs[-1] < 0.6:
        print("Model is underfitting")
    elif train_accs[-1] > 0.9 and train_accs[-1] - test_accs[-1] > 0.2:
        print("Model is overfitting")
    else:
        print("Model fit seems good")
    
    # Optionally, return metrics for plotting
    return train_losses, test_losses, train_accs, test_accs


In [19]:
# Train both architectures
train_model(X_train, Y_train, X_test, Y_test, [100], "model_1layer", epochs=10, lr=0.1)
train_model(X_train, Y_train, X_test, Y_test, [100, 50, 50], "model_3layer", epochs=10, lr=0.1)

In [1]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import time

# Preprocess data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model 1: Single hidden layer (100 units)
mlp1 = MLPClassifier(hidden_layer_sizes=(100,), activation='logistic', 
                     solver='sgd', learning_rate_init=0.1, max_iter=50, random_state=42)

start = time.time()
mlp1.fit(X_train_scaled, y_train.ravel())  # scikit-learn expects labels as 1D array
end = time.time()
preds1 = mlp1.predict(X_test_scaled)
acc1 = accuracy_score(y_test, preds1)
print(f"Sklearn 1-layer MLP accuracy: {acc1:.4f} | Training time: {end-start:.1f}s")

# Model 2: Three hidden layers (100, 50, 50)
mlp3 = MLPClassifier(hidden_layer_sizes=(100,50,50), activation='logistic', 
                     solver='sgd', learning_rate_init=0.1, max_iter=50, random_state=42)

start = time.time()
mlp3.fit(X_train_scaled, y_train.ravel())
end = time.time()
preds3 = mlp3.predict(X_test_scaled)
acc3 = accuracy_score(y_test, preds3)
print(f"Sklearn 3-layer MLP accuracy: {acc3:.4f} | Training time: {end-start:.1f}s")


NameError: name 'X_train' is not defined

array([6, 9, 9, 4, 1, 1, 2, 7, 0, 3, 4, 7, 7, 2, 9, 9, 9, 3, 3, 6, 4, 3,
       6, 6, 2, 6, 3, 5, 4, 0, 0, 9, 1, 3, 2, 0, 3, 7, 3, 0, 5, 2, 2, 7,
       1, 1, 1, 2, 2, 0, 9, 5, 7, 9, 2, 2, 5, 2, 4, 3, 1, 1, 8, 2, 1, 1,
       4, 9, 7, 8, 5, 9, 6, 7, 4, 3, 9, 0, 3, 1, 3, 5, 4, 5, 7, 7, 4, 7,
       9, 4, 2, 3, 8, 0, 1, 2, 1, 1, 4, 1, 2, 3, 9, 6, 6, 1, 9, 5, 2, 9,
       1, 2, 1, 7, 7, 0, 0, 6, 9, 1, 2, 2, 9, 5, 6, 6, 1, 9, 5, 0, 4, 7,
       6, 7, 1, 8, 1, 1, 2, 8, 1, 6, 3, 6, 2, 4, 9, 9, 5, 4, 3, 6, 7, 2,
       3, 8, 5, 5, 4, 3, 1, 0, 2, 7, 6, 0, 9, 5, 1, 3, 8, 2, 7, 5, 3, 4,
       1, 5, 7, 0, 4, 7, 2, 5, 1, 0, 9, 6, 9, 2, 8, 7, 8, 8, 2, 5, 2, 3,
       5, 0, 6, 1, 9, 3, 6, 9, 1, 3, 9, 6, 6, 7, 1, 0, 9, 5, 8, 5, 2, 9,
       0, 8, 8, 0, 6, 9, 1, 1, 6, 3, 0, 6, 6, 0, 6, 6, 1, 7, 1, 5, 8, 3,
       2, 6, 0, 6, 2, 4, 3, 6, 1, 3, 8, 3, 4, 1, 7, 1, 3, 8, 2, 1, 1, 4,
       0, 9, 3, 7, 4, 9, 9, 4, 0, 9, 9, 1, 0, 5, 9, 0, 8, 2, 1, 2, 2, 3,
       2, 6, 2, 7, 8, 8, 6, 0, 7, 9, 4, 5, 3, 4, 2,

## Results