This is just practice for building a Network in Network (NiN), using the FashionMNIST dataset. The point of this project is not to create a strong model; it is to get experience with building a NiN.

In [None]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, TensorDataset

# Mini-batch size shared by the training and test loaders.
batch_size = 128

# Convert each image to a tensor, then standardise its single channel.
# NOTE(review): 0.2860 / 0.3530 are presumably the FashionMNIST training-set
# mean and std -- confirm before reusing them on another dataset.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(mean=[0.2860], std=[0.3530])]
)

# Build the train split first, then the test split; both are downloaded into
# ./data if they are not already there and share the same preprocessing.
train_dataset, test_dataset = (
    datasets.FashionMNIST(
        root='./data',
        train=is_train,
        download=True,
        transform=transform,
    )
    for is_train in (True, False)
)

# Only the training loader shuffles; the test loader keeps a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

# Sanity check: pull a single batch and report its tensor shapes.
X, y = next(iter(train_loader))
print("Batch X shape:", X.shape)
print("Batch y shape:", y.shape)


In [None]:
from torch import nn, optim

def NiN_block(out_channels, kernel_size, stride, padding):
    """Build one NiN block: a spatial convolution followed by two 1x1 convolutions.

    Each of the three convolutions is followed by a ReLU. All convolutions are
    lazy, so their input channel count is inferred on the first forward pass.

    Args:
        out_channels: Number of output channels for all three convolutions.
        kernel_size: Kernel size of the first (spatial) convolution.
        stride: Stride of the first convolution.
        padding: Padding of the first convolution.

    Returns:
        An ``nn.Sequential`` holding six modules (conv/ReLU repeated three times).
    """
    spatial_conv = nn.LazyConv2d(
        out_channels, kernel_size, stride=stride, padding=padding
    )
    layers = [spatial_conv, nn.ReLU()]

    # Two 1x1 convolutions mix channels per pixel without touching the
    # spatial resolution.
    for _ in range(2):
        layers.extend((nn.LazyConv2d(out_channels, kernel_size=1), nn.ReLU()))

    return nn.Sequential(*layers)

class NiN(nn.Module):
    """Small Network-in-Network classifier with 10 outputs.

    Three NiN blocks (32, 64 and 128 channels), each followed by a 2x2 max
    pool with stride 2, feed a final 10-channel NiN block. Global average
    pooling and a flatten then reduce each of those 10 channel maps to a
    single value, so ``forward`` returns one 10-element row per input sample.
    """

    def __init__(self):
        super().__init__()

        # (out_channels, kernel_size, padding) for the pooled stages; every
        # stage uses stride 1.
        pooled_stages = ((32, 5, 2), (64, 3, 1), (128, 3, 1))

        stages = []
        for channels, kernel, pad in pooled_stages:
            stages.append(
                NiN_block(channels, kernel_size=kernel, stride=1, padding=pad)
            )
            stages.append(nn.MaxPool2d(2, 2))

        # Final block emits one channel per class; average pooling to 1x1 and
        # flattening replace a fully connected head.
        stages += [
            NiN_block(10, kernel_size=3, stride=1, padding=1),
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
        ]

        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the whole stack and return the flattened output."""
        logits = self.net(x)
        return logits


Training this would take a long time on my laptop. Instead, I am going to train it on Kaggle's GPU T4 x2.

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = NiN()

# Lazy layers have no weights until they see an input, so push one dummy
# batch through before the parameters are handed to DataParallel and Adam.
# The pass only exists to materialise shapes, so no autograd graph is needed.
dummy_input = torch.randn(1, 1, 28, 28).to(device)
model.to(device)
with torch.no_grad():
    model(dummy_input)

model = nn.DataParallel(model)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

epochs = 40


def _run_epoch(net, loader, criterion, target_device, opt=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    Args:
        net: Model to run.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        target_device: Device each batch is moved to before the forward pass.
        opt: Optimizer. When given, the model is put in training mode and
            updated after every batch; when ``None``, the model is put in
            eval mode and gradients are disabled.

    Returns:
        Tuple of the per-sample average loss and the fraction of samples
        whose arg-max prediction equals the label.
    """
    training = opt is not None
    net.train(training)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(training):
        for xb, yb in loader:
            xb, yb = xb.to(target_device), yb.to(target_device)

            if training:
                opt.zero_grad()
            yhat = net(xb)
            loss = criterion(yhat, yb)
            if training:
                loss.backward()
                opt.step()

            # Weight each batch's loss by its size so a smaller final batch
            # does not skew the epoch average.
            batch_n = xb.size(0)
            loss_sum += loss.item() * batch_n
            n_correct += (torch.argmax(yhat, dim=1) == yb).sum().item()
            n_seen += batch_n

    return loss_sum / n_seen, n_correct / n_seen


for epoch in range(epochs):
    avg_loss, accuracy = _run_epoch(
        model, train_loader, loss_fn, device, optimizer
    )

    # NOTE: the "Val" metrics are computed on test_loader; no separate
    # validation split is held out in this notebook.
    avg_val_loss, val_accuracy = _run_epoch(
        model, test_loader, loss_fn, device
    )

    print(f"Epoch {epoch+1}: "
          f"Train Loss: {avg_loss:.4f}, Train Acc: {accuracy:.4f}, "
          f"Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

![Results](results.png)

A NiN is definitely overkill for FashionMNIST. Originally I had upscaled the images to 224x224, which is what the NiN architecture expects, but I did not get great results. So I adjusted the hyperparameters in each layer so that the 28x28 image would not be downscaled so quickly.