<a href="https://colab.research.google.com/github/Gaju27/mnsit_25k_parameters/blob/main/MNSIT_25k_parameters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch torchvision

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch.optim as optim



In [2]:
class TinyMNISTNet(nn.Module):
    """Compact CNN classifier: three conv/batch-norm stages and a two-layer head.

    The layers are, in order:
      * conv1 (1 -> 16 channels) + bn1, followed by 2x2 max-pooling
      * conv2 (16 -> 32 channels) + bn2, followed by 2x2 max-pooling
      * conv3 (32 -> 64 channels) + bn3, followed by global average pooling
      * fc1 (64 -> 23) with ReLU, then fc2 (23 -> 10) producing raw logits

    All convolutions use 3x3 kernels with padding 1, so only the pooling
    layers change the spatial size. No softmax is applied in ``forward``.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: channel width doubles at every stage.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=16)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(num_features=32)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(num_features=64)
        # Parameter-free pooling layers shared by the forward pass.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.adapt_pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        # Classifier head operating on the 64 pooled channel averages.
        self.fc1 = nn.Linear(in_features=64, out_features=23)
        self.fc2 = nn.Linear(in_features=23, out_features=10)

    def forward(self, x):
        """Return a ``(batch, 10)`` tensor of class logits for input batch ``x``.

        ``x`` must have a single channel (conv1 is built with in_channels=1).
        """
        # Stages 1 and 2: conv -> batch-norm -> ReLU -> halve the resolution.
        feats = self.pool(F.relu(self.bn1(self.conv1(x))))
        feats = self.pool(F.relu(self.bn2(self.conv2(feats))))
        # Stage 3 keeps its resolution; the global average pool collapses it.
        feats = F.relu(self.bn3(self.conv3(feats)))
        # (batch, 64, 1, 1) -> (batch, 64)
        pooled = self.adapt_pool(feats).flatten(1)
        hidden = F.relu(self.fc1(pooled))
        return self.fc2(hidden)

In [3]:
def count_parameters(model):
    """Return the total number of scalar elements in ``model``'s trainable parameters.

    Only parameters with ``requires_grad`` set are counted; frozen parameters
    are skipped. A model without parameters yields 0.
    """
    total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            total += tensor.numel()
    return total

In [4]:
# Build a throwaway instance purely to inspect the parameter budget.
model = TinyMNISTNet()
print("Total trainable parameters:", count_parameters(model))

# Per-tensor breakdown: name, shape, and element count of every trainable tensor.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(f"{name:25} {param.shape} -> {param.numel()}")

Total trainable parameters: 25255
conv1.weight              torch.Size([16, 1, 3, 3]) -> 144
conv1.bias                torch.Size([16]) -> 16
bn1.weight                torch.Size([16]) -> 16
bn1.bias                  torch.Size([16]) -> 16
conv2.weight              torch.Size([32, 16, 3, 3]) -> 4608
conv2.bias                torch.Size([32]) -> 32
bn2.weight                torch.Size([32]) -> 32
bn2.bias                  torch.Size([32]) -> 32
conv3.weight              torch.Size([64, 32, 3, 3]) -> 18432
conv3.bias                torch.Size([64]) -> 64
bn3.weight                torch.Size([64]) -> 64
bn3.bias                  torch.Size([64]) -> 64
fc1.weight                torch.Size([23, 64]) -> 1472
fc1.bias                  torch.Size([23]) -> 23
fc2.weight                torch.Size([10, 23]) -> 230
fc2.bias                  torch.Size([10]) -> 10


In [5]:
# Preprocessing applied to every image: convert to a tensor, then normalise
# the single channel with the given mean and standard deviation.
# NOTE(review): 0.1307 / 0.3081 are presumably the MNIST training-set mean and
# std — confirm against the dataset statistics.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ]
)

In [15]:
# Training split, downloaded to ./data on first use and reshuffled every epoch.
# batch_size=128 was used in an earlier revision of this cell; 64 is the
# active setting.
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)


In [16]:
# Held-out split, iterated in a fixed order with large batches.
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)

In [17]:
# Seed PyTorch's global random number generator before the model is built so
# that weight initialisation (and any later draws from the default generator)
# repeat across "Restart & Run All". Re-running this cell also restores the
# same freshly initialised model.
# NOTE(review): some GPU kernels may still be non-deterministic; see the
# PyTorch reproducibility notes if bit-exact results are required.
torch.manual_seed(42)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TinyMNISTNet().to(device)
print(f"Total trainable params: {count_parameters(model)}")

Total trainable params: 25255


In [22]:
# Loss: cross-entropy computed on the raw logits returned by the model.
criterion = nn.CrossEntropyLoss()

# Adam, chosen by the author for faster convergence than SGD; an
# SGD(lr=0.01, momentum=0.9) configuration was used in an earlier revision.
optimizer = optim.Adam(model.parameters(), lr=5e-4)


In [19]:
def train(model, device, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader`` and report running statistics.

    The model is put in training mode, and for every ``(data, target)`` batch
    the gradients are reset, the loss is back-propagated, and the optimizer
    takes one step.

    Returns:
        ``(mean_loss, accuracy)`` where ``mean_loss`` is the batch-size-weighted
        average of the per-batch loss values (a per-sample mean when
        ``criterion`` averages over the batch) and ``accuracy`` is the fraction
        of samples whose arg-max prediction matched the target. Both are
        measured on the fly, i.e. while the weights are still being updated.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    model.train()
    seen = 0
    hits = 0
    running_loss = 0
    for inputs, labels in loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        batch_size = inputs.size(0)

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        running_loss += batch_loss.item() * batch_size
        hits += logits.argmax(1).eq(labels).sum().item()
        seen += batch_size
    return running_loss / seen, hits / seen

In [20]:
def test(model, device, loader, criterion):
    model.eval()
    correct, total, loss_sum = 0, 0, 0
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = criterion(output, target)
            loss_sum += loss.item() * data.size(0)
            correct += (output.argmax(1) == target).sum().item()
            total += data.size(0)
    return loss_sum / total, correct / total

In [23]:
# Short demonstration run: train for a fixed number of epochs and evaluate on
# the held-out loader after each one.
# NOTE(review): the execution counts around this cell are not contiguous and
# the recorded first-epoch accuracy is already very high, which suggests the
# saved output came from a model that had been trained before this cell ran —
# confirm by re-running the notebook from a fresh kernel.
NUM_EPOCHS = 5

for epoch in range(1, NUM_EPOCHS + 1):
    train_loss, train_acc = train(model, device, train_loader, optimizer, criterion)
    test_loss, test_acc = test(model, device, test_loader, criterion)
    summary = (
        f"Epoch {epoch}: "
        f"Train loss={train_loss:.4f}, acc={train_acc*100:.2f}% | "
        f"Test loss={test_loss:.4f}, acc={test_acc*100:.2f}%"
    )
    print(summary)

Epoch 1: Train loss=0.0396, acc=98.83% | Test loss=0.0443, acc=98.53%
Epoch 2: Train loss=0.0347, acc=98.96% | Test loss=0.0403, acc=98.75%
Epoch 3: Train loss=0.0323, acc=99.06% | Test loss=0.0379, acc=98.71%
Epoch 4: Train loss=0.0298, acc=99.12% | Test loss=0.0400, acc=98.55%
Epoch 5: Train loss=0.0269, acc=99.19% | Test loss=0.0361, acc=98.77%
