<a href="https://colab.research.google.com/github/Krishna-singh-rajput/AI-AND-ML/blob/main/MLP_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# ---------------------------
# 0) Reproducibility: seed NumPy's global RNG so every run gives the same result
# ---------------------------
np.random.seed(42)

# ---------------------------
# 1) Dataset: digits (1797 samples, each an 8x8 image)
# ---------------------------
digits = load_digits()
# X: (N, 64) -- the 8x8 images already flattened; y: (N,) labels
X, y = digits.data, digits.target

# ---------------------------
# 2) Hold out 20% of the data as a test set for honest evaluation.
#    The split is stratified on y and uses a fixed random_state.
# ---------------------------
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


# E.g.: 10000 samples for training, 2000 images for testing... 5 classes: 500-700. 10-fold stratified sampling:
# train on 9 folds ... test on the remaining 1 fold: 600 samples for each class.
# Class 1: 10%, class 2: 20%, class 3: 30%, class 4: 40%, class 5: 50%



SyntaxError: invalid syntax (ipython-input-3615570669.py, line 26)

In [None]:


# ---------------------------
# 3) Feature scaling (VERY important for MLP)
# ---------------------------
scaler = StandardScaler()
scaler.fit(X_train)                    # statistics come from the training split only (no leakage)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)      # test split reuses the training statistics

# ---------------------------
# 4) One-hot targets for multiclass cross-entropy
# ---------------------------
num_classes = 10
identity = np.eye(num_classes)         # row k is the one-hot vector for class k
Y_train = identity[y_train]            # shape: (N_train, 10)
Y_test = identity[y_test]

# ---------------------------
# 5) Helper functions (activations + stable softmax)
# ---------------------------
def relu(z):
    """Return the element-wise rectified linear unit, max(0, z)."""
    floor = 0
    return np.maximum(floor, z)

def relu_grad(z):
    """Return the ReLU derivative at z: 1.0 where z > 0, otherwise 0.0."""
    is_active = z > 0
    return is_active.astype(float)

def softmax(z):
    """Return row-wise softmax probabilities for a 2-D array of logits.

    Each row's maximum is subtracted before exponentiating; this leaves the
    result unchanged mathematically but keeps np.exp from overflowing.
    """
    row_peak = np.max(z, axis=1, keepdims=True)
    weights = np.exp(z - row_peak)
    return weights / weights.sum(axis=1, keepdims=True)

def cross_entropy(y_true_onehot, y_pred_prob):
    """Return the mean cross-entropy between one-hot targets and probabilities.

    Probabilities are clipped to [1e-12, 1 - 1e-12] so log(0) never occurs.
    The per-sample loss is -sum_k y_k * log(p_k); the result is its average
    over the rows.
    """
    tiny = 1e-12
    safe_prob = np.clip(y_pred_prob, tiny, 1 - tiny)
    per_sample = (y_true_onehot * np.log(safe_prob)).sum(axis=1)
    return -per_sample.mean()

# ---------------------------
# 6) MLP architecture (64 -> hidden -> 10)
# ---------------------------
input_dim, hidden_dim, output_dim = 64, 64, 10

# He initialization (good for ReLU): standard-normal draws scaled by sqrt(2 / fan_in).
# W1 is drawn before W2; the draw order determines the values produced by the seeded RNG.
W1 = np.sqrt(2 / input_dim) * np.random.randn(input_dim, hidden_dim)
b1 = np.zeros(shape=(1, hidden_dim))
W2 = np.sqrt(2 / hidden_dim) * np.random.randn(hidden_dim, output_dim)
b2 = np.zeros(shape=(1, output_dim))

# ---------------------------
# 7) Training hyperparameters: learning rate, number of epochs, mini-batch size
# ---------------------------
lr, epochs, batch_size = 0.1, 50, 64

# ---------------------------
# 8) Training loop: mini-batch SGD (forward -> backward -> update)
# ---------------------------
N = X_train.shape[0]

for epoch in range(1, epochs + 1):
    # Draw a fresh sample order every epoch (SGD works better on shuffled data)
    idx = np.random.permutation(N)
    X_train_shuff, Y_train_shuff = X_train[idx], Y_train[idx]

    for start in range(0, N, batch_size):
        rows = slice(start, start + batch_size)   # slicing past N just yields a shorter last batch
        Xb = X_train_shuff[rows]                  # mini-batch features
        Yb = Y_train_shuff[rows]                  # mini-batch labels (one-hot)
        B = Xb.shape[0]                           # actual batch size

        # ---- Forward pass ----
        Z1 = Xb @ W1 + b1                         # (B,64)@(64,H)->(B,H)
        A1 = relu(Z1)                             # hidden activations
        Z2 = A1 @ W2 + b2                         # (B,H)@(H,10)->(B,10)
        P = softmax(Z2)                           # class probabilities

        # ---- Backward pass ----
        # Softmax combined with cross-entropy gives dL/dZ2 = (P - Y) / B
        dZ2 = (P - Yb) / B                        # (B,10)
        dW2 = A1.T @ dZ2                          # (H,B)@(B,10)->(H,10)
        db2 = dZ2.sum(axis=0, keepdims=True)

        # Push the gradient back through W2, then gate it by the ReLU derivative
        dZ1 = (dZ2 @ W2.T) * relu_grad(Z1)        # (B,10)@(10,H)->(B,H)
        dW1 = Xb.T @ dZ1                          # (64,B)@(B,H)->(64,H)
        db1 = dZ1.sum(axis=0, keepdims=True)

        # ---- Parameter update (SGD) ----
        # `-=` on a NumPy array updates it in place, so W1/b1/W2/b2 themselves change
        for param, grad in ((W2, dW2), (b2, db2), (W1, dW1), (b1, db1)):
            param -= lr * grad

    # ---- End of epoch: loss and accuracy on the full training set ----
    Pt = softmax(relu(X_train @ W1 + b1) @ W2 + b2)
    train_loss = cross_entropy(Y_train, Pt)
    train_pred = Pt.argmax(axis=1)
    train_acc = accuracy_score(y_train, train_pred)

    # Report on the first epoch and then every fifth one
    should_report = epoch == 1 or epoch % 5 == 0
    if should_report:
        print(f"Epoch {epoch:02d} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")

# ---------------------------
# 9) Final evaluation on the held-out test set
# ---------------------------
# Same forward pass as in training, applied to the scaled test features
Z1 = X_test @ W1 + b1
A1 = relu(Z1)
Z2 = A1 @ W2 + b2
P = softmax(Z2)
y_pred = P.argmax(axis=1)   # predicted class = index of the highest probability

test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("\nTest Accuracy:", test_accuracy)
print("\nClassification Report:\n", report)


Epoch 01 | Train Loss: 0.7491 | Train Acc: 0.8219
Epoch 05 | Train Loss: 0.1997 | Train Acc: 0.9631
Epoch 10 | Train Loss: 0.1037 | Train Acc: 0.9847
Epoch 15 | Train Loss: 0.0672 | Train Acc: 0.9916
Epoch 20 | Train Loss: 0.0479 | Train Acc: 0.9951
Epoch 25 | Train Loss: 0.0364 | Train Acc: 0.9972
Epoch 30 | Train Loss: 0.0288 | Train Acc: 0.9986
Epoch 35 | Train Loss: 0.0235 | Train Acc: 0.9993
Epoch 40 | Train Loss: 0.0197 | Train Acc: 0.9993
Epoch 45 | Train Loss: 0.0169 | Train Acc: 1.0000
Epoch 50 | Train Loss: 0.0147 | Train Acc: 1.0000

Test Accuracy: 0.9611111111111111

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.99        36
           1       0.88      0.97      0.92        36
           2       0.97      1.00      0.99        35
           3       1.00      0.97      0.99        37
           4       0.92      0.97      0.95        36
           5       1.00      1.00      1.00        37
           

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# -------------------------
# 1) Reproducibility: seed both NumPy and PyTorch
# -------------------------
np.random.seed(42)
torch.manual_seed(42)

# -------------------------
# 2) Load digits dataset, keeping the 2-D image layout
# -------------------------
digits = load_digits()
X_img, y = digits.images, digits.target   # X_img: (N, 8, 8), y: (N,)

# -------------------------
# 3) Train-test split (20% test, stratified on y, fixed random_state)
# -------------------------
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_img, y, **split_options)

# -------------------------
# 4) + 5) Normalize and add the channel axis
# Pixel values are 0..16, so dividing by 16 gives float32 values in [0, 1].
# The CNN expects (batch, channels, height, width); grayscale => channels=1.
# -------------------------
X_train = np.expand_dims(X_train.astype(np.float32) / 16.0, axis=1)   # (N, 1, 8, 8)
X_test = np.expand_dims(X_test.astype(np.float32) / 16.0, axis=1)     # (N, 1, 8, 8)

# -------------------------
# 6) Convert to torch tensors (labels as int64 for CrossEntropyLoss)
# -------------------------
X_train_t, X_test_t = torch.tensor(X_train), torch.tensor(X_test)
y_train_t = torch.tensor(y_train, dtype=torch.long)
y_test_t = torch.tensor(y_test, dtype=torch.long)

# -------------------------
# 7) DataLoaders (mini-batches): shuffle the training data, keep test order fixed
# -------------------------
train_ds = TensorDataset(X_train_t, y_train_t)
test_ds = TensorDataset(X_test_t, y_test_t)
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=128, shuffle=False)

# -------------------------
# 8) CNN Model
# Conv -> ReLU -> Pool -> Conv -> ReLU -> Pool -> Flatten -> FC -> Output(10)
# -------------------------
class SimpleCNN(nn.Module):
    """Small CNN for 1-channel 8x8 images that outputs 10 class logits.

    Pipeline: Conv -> ReLU -> Pool -> Conv -> ReLU -> Pool -> Flatten -> FC -> FC.
    """

    def __init__(self):
        super().__init__()
        # Both convolutions are 3x3 with padding=1, so they keep the spatial size
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Each 2x2 pooling halves height and width: 8x8 -> 4x4 -> 2x2.
        # With 32 channels after conv2 the flattened size is 32 * 2 * 2 = 128.
        flat_features = 32 * 2 * 2
        self.fc1 = nn.Linear(flat_features, 64)
        self.fc2 = nn.Linear(64, 10)

    def forward(self, x):
        """Map a (B, 1, 8, 8) batch to (B, 10) unnormalized class scores."""
        # Two identical stages: (B,1,8,8) -> (B,16,4,4) -> (B,32,2,2)
        for conv in (self.conv1, self.conv2):
            x = self.pool(self.relu(conv(x)))
        flat = x.view(x.shape[0], -1)           # flatten -> (B,128)
        hidden = self.relu(self.fc1(flat))      # (B,64)
        return self.fc2(hidden)                 # logits (B,10)

model = SimpleCNN()

# -------------------------
# 9) Loss + Optimizer
# CrossEntropyLoss = Softmax + CE combined (numerically stable),
# so the model outputs raw logits.
# -------------------------
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# -------------------------
# 10) Train loop
# -------------------------
epochs = 20
num_train = len(train_loader.dataset)   # number of training samples, for per-sample averaging
for epoch in range(1, epochs + 1):
    model.train()
    total_loss = 0.0   # running SUM of per-sample losses over this epoch

    for xb, yb in train_loader:
        optimizer.zero_grad()          # clear gradients left over from the previous step
        logits = model(xb)             # forward
        loss = criterion(logits, yb)   # mean loss over this batch
        loss.backward()                # backprop
        optimizer.step()               # update weights
        # The criterion returns a batch mean. Weight it by the batch size so
        # that a shorter final batch is not over-represented in the epoch average.
        total_loss += loss.item() * yb.size(0)

    train_loss = total_loss / num_train   # true per-sample mean for the epoch

    # -------------------------
    # 11) Quick evaluation each epoch (accuracy on the test loader)
    # -------------------------
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():  # no gradients needed during eval
        for xb, yb in test_loader:
            logits = model(xb)
            preds = torch.argmax(logits, dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)

    acc = correct / total
    if epoch == 1 or epoch % 5 == 0:
        print(f"Epoch {epoch:02d} | Train Loss: {train_loss:.4f} | Test Acc: {acc:.4f}")
