# Tiny CNN from Scratch — Math, NumPy, PyTorch, Feature Maps & Optuna Tuning

**Colab-ready notebook**: Build a tiny convolutional neural network from first principles, visualize kernels & feature maps, and tune its hyperparameters with Optuna. This notebook is structured for teaching and reproducibility.

**Sections:**
1. Setup (install libraries)
2. Math & numeric example
3. Tiny CNN in NumPy (forward pass only)
4. PyTorch TinyCNN implementation
5. Visualizing kernels & feature maps
6. Optuna hyperparameter tuning (objective + example)
7. Experiments & plots

Run on Google Colab (GPU recommended).

In [None]:
# Install required libraries (run in Colab)
!pip install --quiet torch torchvision optuna matplotlib tqdm

# Note: On Colab, use the GPU runtime for faster training (Runtime → Change runtime type → GPU).

In [None]:
import math
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

import optuna
print('torch:', torch.__version__, 'optuna:', optuna.__version__)

## 2. Math & numeric example

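Small worked example: apply a 3×3 kernel to a 5×5 input (stride 1, no padding) and compute the output shape. For input size $n$, kernel size $k$, padding $p$ and stride $s$, the output size along each axis is $\lfloor (n + 2p - k)/s \rfloor + 1$; here that is $(5 + 0 - 3)/1 + 1 = 3$, so the output is 3×3.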

In [None]:
# Numeric example: 5x5 input convolved with 3x3 kernel
# The input holds the integers 0..24 laid out row by row.
x = np.arange(25).reshape((5, 5))
# Every kernel row is [1, 0, -1]: left column of a patch minus its right column.
k = np.tile([1, 0, -1], (3, 1))

def conv2d_simple(x, kernel, stride=1, padding=0):
    """Slide a 2D kernel over a 2D input and return the map of patch sums.

    The kernel is applied as-is (no flipping), i.e. this is the
    cross-correlation that deep-learning libraries call "convolution".

    Args:
        x: 2D array-like input of shape (H, W).
        kernel: 2D array-like of shape (kH, kW); it may be rectangular.
        stride: step between neighbouring patches, shared by both axes.
        padding: number of zeros added on every side of ``x``.

    Returns:
        Float array of shape ((H + 2p - kH)//stride + 1, (W + 2p - kW)//stride + 1).
    """
    x = np.asarray(x)
    kernel = np.asarray(kernel)
    if padding > 0:
        x = np.pad(x, pad_width=padding, mode='constant')
    # Read height and width separately so non-square kernels work too.
    k_h, k_w = kernel.shape
    out_h = (x.shape[0] - k_h) // stride + 1
    out_w = (x.shape[1] - k_w) // stride + 1
    out = np.zeros((out_h, out_w), dtype=float)
    for i in range(out_h):
        top = i * stride
        for j in range(out_w):
            left = j * stride
            patch = x[top:top + k_h, left:left + k_w]
            out[i, j] = np.sum(patch * kernel)
    return out

# With a 5x5 input, 3x3 kernel, stride 1 and no padding the output is
# (5 - 3)//1 + 1 = 3 per axis, i.e. a 3x3 array.
# Every entry is -6: each kernel row contributes x[r, c] - x[r, c+2] = -2
# (neighbouring values of the arange input differ by 1), summed over 3 rows.
print('input:\n', x)
print('\nkernel:\n', k)
print('\noutput:\n', conv2d_simple(x,k))

## 3. Tiny CNN from scratch (NumPy) — forward pass only

This section implements a minimal forward pass showing what convolution does under the hood.

In [None]:
def relu(x):
    """Rectified linear unit: element-wise maximum of ``x`` and zero."""
    return np.maximum(x, 0)

class TinyCNNNumPy:
    """Forward-only two-layer CNN written with plain NumPy.

    The network is conv(1->4, 3x3) -> ReLU -> stride-2 subsampling ->
    conv(4->8, 3x3) -> ReLU -> stride-2 subsampling. Weights are random and
    there are no biases; the class exists to show the mechanics of a
    convolution, not to be trained.
    """

    def __init__(self):
        # Single-channel input for simplicity. Weight layout is
        # (out_channels, in_channels, kernel_h, kernel_w).
        self.conv1 = np.random.randn(4, 1, 3, 3) * 0.1
        self.conv2 = np.random.randn(8, 4, 3, 3) * 0.1

    def conv_layer(self, x, kernels, stride=1, padding=0):
        """Apply a bank of kernels to a multi-channel input.

        Args:
            x: array of shape (C_in, H, W).
            kernels: array of shape (C_out, C_in, kH, kW); kernels may be
                rectangular.
            stride: step between neighbouring patches, shared by both axes.
            padding: number of zeros added around each spatial border.

        Returns:
            Array of shape (C_out, out_h, out_w) where
            out_h = (H + 2*padding - kH)//stride + 1 (likewise for out_w).

        Raises:
            ValueError: if the channel count of ``x`` differs from the
                ``in_channels`` axis of ``kernels``.
        """
        c_in, height, width = x.shape
        c_out, k_in, k_h, k_w = kernels.shape
        if c_in != k_in:
            raise ValueError(
                f"input has {c_in} channel(s) but kernels expect {k_in}"
            )
        if padding > 0:
            # Pad only the two spatial axes, never the channel axis.
            x = np.pad(x, ((0, 0), (padding, padding), (padding, padding)), mode='constant')
            height, width = x.shape[1], x.shape[2]
        out_h = (height - k_h) // stride + 1
        out_w = (width - k_w) // stride + 1
        out = np.zeros((c_out, out_h, out_w))
        for i in range(out_h):
            top = i * stride
            for j in range(out_w):
                left = j * stride
                patch = x[:, top:top + k_h, left:left + k_w]
                # Contract (C_in, kH, kW) of every kernel against the patch:
                # one dot product per output channel.
                out[:, i, j] = np.tensordot(kernels, patch, axes=3)
        return out

    def forward(self, x):
        """Run the two conv stages on ``x`` of shape (C, H, W)."""
        x = self.conv_layer(x, self.conv1, padding=1)
        x = relu(x)
        # Keep every second row/column: a cheap stand-in for 2x2 max-pooling.
        x = x[:, ::2, ::2]
        x = self.conv_layer(x, self.conv2, padding=1)
        x = relu(x)
        x = x[:, ::2, ::2]
        return x

# demo with random input
# A single-channel 28x28 input: conv1 (3x3, padding=1) keeps 28x28 and yields
# 4 channels, the stride-2 slicing halves it to 14x14, conv2 yields 8 channels,
# and the second slicing leaves 7x7 -- so the printed shape should be (8, 7, 7).
m = TinyCNNNumPy()
inp = np.random.randn(1, 28, 28)
out = m.forward(inp)
print('output shape (numpy tinycnn):', out.shape)

## 4. PyTorch TinyCNN implementation

Standard PyTorch model with two conv layers, pool, and a final linear classifier.

In [None]:
class TinyCNN(nn.Module):
    """Two 3x3 conv layers, each followed by ReLU and 2x2 max-pooling, then a linear classifier.

    Args:
        in_channels: number of channels in the input images.
        n_classes: number of output logits.
        n_filters1: output channels of the first convolution.
        n_filters2: output channels of the second convolution.
        input_size: spatial size of the input images, either an int for
            square images or an ``(height, width)`` pair. The default of 28
            matches MNIST; use 32 for CIFAR-10.
    """

    def __init__(self, in_channels=1, n_classes=10, n_filters1=16, n_filters2=32,
                 input_size=28):
        super().__init__()
        if isinstance(input_size, int):
            height = width = input_size
        else:
            height, width = input_size
        self.conv1 = nn.Conv2d(in_channels, n_filters1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(n_filters1, n_filters2, kernel_size=3, padding=1)
        # The 3x3 convs with padding=1 keep H and W; each 2x2 max-pool
        # floor-halves them, so two pools divide each side by 4 (floored).
        pooled_h, pooled_w = height // 4, width // 4
        self.fc = nn.Linear(n_filters2 * pooled_h * pooled_w, n_classes)

    def forward(self, x):
        """Map a batch of shape (B, in_channels, H, W) to logits of shape (B, n_classes)."""
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        # Flatten everything except the batch dimension.
        x = x.flatten(1)
        x = self.fc(x)
        return x

# quick instantiation
_model = TinyCNN()
print(_model)

### Data: MNIST (small) — transforms and loaders

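Use MNIST for quick experiments. If you prefer CIFAR-10, change the dataset, the normalization statistics, the model's input channels (3 instead of 1), and the input size of the final linear layer: it assumes 28×28 images, whereas 32×32 images leave 8×8 feature maps after the two pooling steps.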

In [None]:
# Convert images to tensors, then standardise them
# (0.1307 / 0.3081 are the commonly quoted MNIST pixel mean / std).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
])

# NOTE: `val_ds` is MNIST's train=False split, used here as the validation set.
train_ds = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
val_ds = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Only the training loader shuffles; validation order does not matter.
train_loader = DataLoader(dataset=train_ds, batch_size=128, shuffle=True, num_workers=2)
val_loader = DataLoader(dataset=val_ds, batch_size=256, shuffle=False, num_workers=2)

print('train samples:', len(train_ds), 'val samples:', len(val_ds))

### Training & evaluation helpers

Functions to train one epoch, evaluate, and a small utility to set seeds for reproducibility.

In [None]:
import random

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators with ``seed``."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    # Seed the generators of every visible CUDA device as well.
    torch.cuda.manual_seed_all(seed)

# Seed once at definition time, using the default seed.
set_seed()

def train_one_epoch(model, device, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader`` with the model in train mode.

    Returns:
        ``(mean_loss, accuracy)``: the per-sample average of the batch losses
        and the fraction of samples whose arg-max prediction equals the label.
    """
    model.train()
    loss_sum, hits, seen = 0.0, 0, 0
    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        batch_size = inputs.size(0)

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        # Weight each batch loss by its size so the final average is per sample
        # (assumes the criterion returns a batch mean).
        loss_sum += batch_loss.item() * batch_size
        hits += (logits.argmax(dim=1) == targets).sum().item()
        seen += batch_size
    return loss_sum / seen, hits / seen


def evaluate(model, device, loader, criterion):
    """Score ``model`` on ``loader`` in eval mode without tracking gradients.

    Returns:
        ``(mean_loss, accuracy)``: the per-sample average of the batch losses
        and the fraction of samples whose arg-max prediction equals the label.
    """
    model.eval()
    loss_sum, hits, seen = 0.0, 0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch_size = inputs.size(0)

            logits = model(inputs)
            batch_loss = criterion(logits, targets)

            # Weight each batch loss by its size so the average is per sample.
            loss_sum += batch_loss.item() * batch_size
            hits += (logits.argmax(dim=1) == targets).sum().item()
            seen += batch_size
    return loss_sum / seen, hits / seen

print('helpers ready')

## 5. Visualizing kernels & feature maps

Functions to plot filters and feature maps using Matplotlib.

In [None]:
def show_kernels(conv_layer, figsize=None, cmap='viridis'):
    """Plot every 2D kernel slice of a conv layer in an (out_ch x in_ch) grid.

    Args:
        conv_layer: module whose ``weight`` has shape (out_ch, in_ch, kH, kW).
        figsize: Matplotlib figure size. ``None`` (the default) sizes the
            figure at 2 inches per subplot, i.e. ``(in_ch * 2, out_ch * 2)``.
        cmap: Matplotlib colormap name.
    """
    # detach() gives a gradient-free view; the arithmetic below creates a new
    # tensor, so the layer's weights are never modified.
    w = conv_layer.weight.detach().cpu()
    # Rescale with ONE global min/max (shared by all kernels) into [0, 1].
    # NOTE: imshow is called without vmin/vmax, so Matplotlib still stretches
    # the colour range of each subplot to that slice's own min/max.
    w_min, w_max = w.min(), w.max()
    w = (w - w_min) / (w_max - w_min + 1e-9)
    rows, cols = w.shape[0], w.shape[1]
    if figsize is None:
        figsize = (cols * 2, rows * 2)
    # squeeze=False always returns a 2D axes array, even for one row/column.
    fig, axes = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
    for i in range(rows):
        for j in range(cols):
            ax = axes[i, j]
            ax.imshow(w[i, j], cmap=cmap)
            ax.axis('off')
    plt.tight_layout()
    plt.show()


def get_feature_map(model, x, layer_name):
    """Run ``model(x)`` once and return the output of the sub-module ``layer_name``.

    Args:
        model: module that exposes the target layer as attribute ``layer_name``.
        x: input passed to ``model`` unchanged.
        layer_name: attribute name of the layer to capture, e.g. ``'conv1'``.

    Returns:
        The layer's output, detached and moved to the CPU, or ``None`` if the
        layer was not executed during the forward pass.
    """
    captured = {}

    def hook(module, inp, out):
        captured['fm'] = out.detach().cpu()

    handle = getattr(model, layer_name).register_forward_hook(hook)
    try:
        # Only the activation is needed, so skip building an autograd graph.
        with torch.no_grad():
            model(x)
    finally:
        # Always unregister, otherwise a failed forward pass would leave the
        # hook attached to the layer for every later call.
        handle.remove()
    return captured.get('fm')


def show_feature_maps(fm, max_channels=8, cmap='magma'):
    """Plot the first channels of a feature map in a grid with 4 columns.

    Args:
        fm: feature map of shape (B, C, H, W) or (C, H, W). For a batched
            input only the first sample is shown.
        max_channels: upper bound on the number of channels plotted.
        cmap: Matplotlib colormap name.

    Raises:
        ValueError: if ``fm`` is neither 3D nor 4D, or there is nothing to plot.
    """
    if fm.ndim == 4:
        # Index the first sample; squeeze(0) would be a no-op for batch > 1.
        fm = fm[0]
    elif fm.ndim != 3:
        raise ValueError(f"expected a 3D or 4D feature map, got {fm.ndim}D")
    n_channels = fm.shape[0]
    to_show = min(n_channels, max_channels)
    if to_show < 1:
        raise ValueError("no channels to show")
    cols = 4
    rows = (to_show + cols - 1) // cols
    # squeeze=False keeps axes 2D even when a single row is enough
    # (four or fewer channels), so the loop below never hits a 1D array.
    fig, axes = plt.subplots(rows, cols, figsize=(cols * 2, rows * 2), squeeze=False)
    for idx, ax in enumerate(axes.flat):
        if idx < to_show:
            ax.imshow(fm[idx], cmap=cmap)
        # Hide the frame on used and unused cells alike.
        ax.axis('off')
    plt.tight_layout()
    plt.show()

print('visualization helpers ready')

### Demo: visualize initial kernels and feature maps (random init)

Run this cell to see initial random kernels and feature maps for a sample image.

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# A freshly constructed model, so everything plotted below reflects random initialisation.
model = TinyCNN().to(device)
# show kernels for conv1
# (the grid has one row per output filter and one column per input channel)
show_kernels(model.conv1)

# pick a sample image
# val_ds[0] yields an (image, label) pair; the label is not needed here.
sample, _ = val_ds[0]
# unsqueeze(0) adds a leading batch dimension before the forward pass.
x = sample.unsqueeze(0).to(device)
# The hook captures the raw output of conv1, i.e. before ReLU and pooling.
fm1 = get_feature_map(model, x, 'conv1')
show_feature_maps(fm1, max_channels=8)

# Same for conv2, whose input has already gone through ReLU and one pooling step.
fm2 = get_feature_map(model, x, 'conv2')
show_feature_maps(fm2, max_channels=8)

## 6. Hyperparameter tuning with Optuna

Define an Optuna objective that trains for a few epochs and reports validation loss for pruning. Adjust search space as needed.

In [None]:
def optuna_objective(trial):
    """Train a TinyCNN with trial-suggested hyperparameters and return its validation loss.

    Searches the learning rate (log scale), the training batch size and the
    two filter counts. The validation loss is reported after every epoch so
    the study's pruner can stop unpromising trials early.

    Args:
        trial: the ``optuna.trial.Trial`` supplied by ``study.optimize``.

    Returns:
        Validation loss after the final epoch (lower is better).

    Raises:
        optuna.exceptions.TrialPruned: when the pruner asks to stop the trial.
    """
    # hyperparameters
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    lr = trial.suggest_float('lr', 1e-4, 1e-1, log=True)
    batch = trial.suggest_categorical('batch', [64, 128])
    filters1 = trial.suggest_categorical('filters1', [8, 16, 32])
    filters2 = trial.suggest_categorical('filters2', [16, 32, 64])

    # data loaders with chosen batch (local to the trial; the global loaders are untouched)
    train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_ds, batch_size=256, shuffle=False, num_workers=2)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TinyCNN(in_channels=1, n_classes=10, n_filters1=filters1, n_filters2=filters2).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    epochs = 6
    for epoch in range(epochs):
        # Training metrics are not needed for the objective.
        train_one_epoch(model, device, train_loader, optimizer, criterion)
        val_loss, val_acc = evaluate(model, device, val_loader, criterion)
        # Keep the latest accuracy with the trial so it can be inspected later.
        trial.set_user_attr('val_acc', val_acc)
        trial.report(val_loss, epoch)
        # pruning
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return val_loss

# Example: create a study (don't run automatically to avoid long runs)
# study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler())
# study.optimize(optuna_objective, n_trials=20)  # adjust n_trials for your budget

print('Optuna objective ready. To run tuning, uncomment the study code and choose n_trials.')

## 7. Experiments & plots

Suggested experiments:
- Run a baseline with fixed hyperparameters for 6 epochs and record val accuracy
- Run Optuna for 20–50 trials (or more if you have budget)
- Compare baseline vs tuned best trial

You can visualize Optuna results using `optuna.visualization` in a Colab environment.