In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch.nn.functional as F

# ---------- Hyper-parameters ----------
batch_size = 10
epochs = 20
lr = 0.1                           
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ---------- Data ----------
transform = transforms.Compose([
    transforms.ToTensor(),         # converts [0,255] → float32 in [0,1]
    transforms.Lambda(lambda x: x.view(-1))   # flatten 1×28×28 → 784
])

train_set = datasets.MNIST(root='data', train=True, download=True, transform=transform)
test_set  = datasets.MNIST(root='data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_set,  batch_size=batch_size, shuffle=False)

# ---------- Model ----------
model = nn.Sequential(
    nn.Linear(784, 392),
    nn.ReLU(),
    nn.Linear(392, 196),
    nn.ReLU(),
    nn.Linear(196, 10),
    nn.ReLU()                   
).to(device)

print(model)                       # like model.summary()

# ---------- Loss & Optimizer ----------
criterion = nn.CrossEntropyLoss()           
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# ---------- Training loop ----------
for epoch in range(1, epochs + 1):
    model.train()
    epoch_loss, correct, total = 0.0, 0, 0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        y_onehot = F.one_hot(y, num_classes=10).float()  # to_categorical

        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs, y_onehot)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item() * x.size(0)
        preds = outputs.argmax(dim=1)
        correct += (preds == y).sum().item()
        total += x.size(0)

    print(f"Epoch {epoch:2d}/{epochs} "
          f"loss {epoch_loss/total:.4f}  "
          f"acc {correct/total:.4f}")

# ---------- Evaluation ----------
model.eval()
test_loss, correct, total = 0.0, 0, 0
with torch.no_grad():
    for x, y in test_loader:
        x, y = x.to(device), y.to(device)
        y_onehot = F.one_hot(y, num_classes=10).float()
        outputs = model(x)
        test_loss += criterion(outputs, y_onehot).item() * x.size(0)
        preds = outputs.argmax(dim=1)
        correct += (preds == y).sum().item()
        total += x.size(0)

print("\nTest loss:", test_loss/total)
print("Test accuracy:", correct/total)


Sequential(
  (0): Linear(in_features=784, out_features=392, bias=True)
  (1): ReLU()
  (2): Linear(in_features=392, out_features=196, bias=True)
  (3): ReLU()
  (4): Linear(in_features=196, out_features=10, bias=True)
  (5): ReLU()
)
Epoch  1/20 loss 0.3998  acc 0.9130
Epoch  2/20 loss 0.0971  acc 0.9705
Epoch  3/20 loss 0.0650  acc 0.9798
Epoch  4/20 loss 0.0477  acc 0.9849
Epoch  5/20 loss 0.0354  acc 0.9879
Epoch  6/20 loss 0.0282  acc 0.9905
Epoch  7/20 loss 0.0202  acc 0.9933
Epoch  8/20 loss 0.0193  acc 0.9937
Epoch  9/20 loss 0.0126  acc 0.9961
Epoch 10/20 loss 0.0097  acc 0.9969
Epoch 11/20 loss 0.0130  acc 0.9958
Epoch 12/20 loss 0.0124  acc 0.9962
Epoch 13/20 loss 0.0113  acc 0.9965
Epoch 14/20 loss 0.0090  acc 0.9969
Epoch 15/20 loss 0.0085  acc 0.9973
Epoch 16/20 loss 0.0027  acc 0.9994
Epoch 17/20 loss 0.0012  acc 0.9997
Epoch 18/20 loss 0.0004  acc 1.0000
Epoch 19/20 loss 0.0002  acc 1.0000
Epoch 20/20 loss 0.0001  acc 1.0000

Test loss: 0.08203687369408548
Test accuracy