In [1]:
from torchvision import datasets, transforms
import torch
from torch.utils.data import DataLoader

In [2]:
# Fetch MNIST into ./data/ up front (download=True); the dataset objects built in
# later cells are constructed without the download flag and so rely on these files.
data = datasets.MNIST(
    root="./data/",
    train=True,
    download=True,
)

In [46]:
# Preprocessing shared by both splits: convert each image to a tensor, then
# normalise its single channel.
# NOTE(review): 0.1307 / 0.3081 are presumably the MNIST training-set mean and
# standard deviation — confirm against the data.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ]
)

# Both splits read from the same root directory and use the same transform;
# neither passes download=True, so the files must already be on disk.
train_split, test_split = (
    datasets.MNIST(root="./data/", train=is_train, transform=transform)
    for is_train in (True, False)
)

# Only the training loader shuffles; the test loader iterates in dataset order.
train_loader = DataLoader(dataset=train_split, batch_size=256, shuffle=True)
test_loader = DataLoader(dataset=test_split, batch_size=64, shuffle=False)

In [4]:
import torch.nn as nn

In [27]:
class Net(nn.Module):
    """Small convolutional classifier that returns log-probabilities over 10 classes.

    Pipeline: conv(1->32, 3x3) -> ReLU -> conv(32->64, 3x3) -> ReLU
    -> 2x2 max-pool -> dropout(0.25) -> flatten -> linear(9216->128) -> ReLU
    -> dropout(0.5) -> linear(128->10) -> log-softmax over dim 1.

    ``fc1`` takes 9216 flattened features, which is what a 1x28x28 input yields
    (64 channels * 12 * 12 after two unpadded 3x3 convolutions and the pool).
    Because the output is already log-softmaxed, it pairs with ``nn.NLLLoss``.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in this exact order so parameter initialisation
        # (under a fixed seed) and the printed module summary stay stable.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        self.dropout1 = nn.Dropout(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(in_features=9216, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)
        # Stateless helpers; the single ReLU module is reused at every activation.
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2)
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map a batch of images to per-class log-probabilities.

        Args:
            x: input batch; dim 0 is the batch and the remaining dims must
               flatten to 9216 features after the conv/pool stage.

        Returns:
            Tensor with 10 log-probabilities per sample along dim 1.
        """
        features = self.relu(self.conv1(x))
        features = self.pool(self.relu(self.conv2(features)))
        features = self.dropout1(features)
        # Keep the batch dimension and collapse everything else for the linear head.
        hidden = self.relu(self.fc1(features.flatten(start_dim=1)))
        logits = self.fc2(self.dropout2(hidden))
        return self.logsoftmax(logits)

# Choose the best available backend instead of assuming Apple-Silicon MPS, so the
# notebook also runs on CUDA or CPU-only machines (a hard-coded "mps" device
# raises when that backend is unavailable).
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = Net()
# Module.to() moves the parameters in place and returns the module; leaving the
# call as the last expression keeps the architecture summary as the cell output.
model.to(device)

Net(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (dropout1): Dropout(p=0.25, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=9216, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
  (relu): ReLU()
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (logsoftmax): LogSoftmax(dim=1)
)

In [6]:
# Smoke test: push a single training batch through the network and print the
# output shape (expected: [batch_size, 10]).
img, label = next(iter(train_loader))
# The loader yields CPU tensors while the model has been moved to an accelerator;
# without this transfer the forward pass fails with a device-mismatch error.
img = img.to(next(model.parameters()).device)
with torch.no_grad():  # shape check only, no autograd graph needed
    print(model(img).shape)

torch.Size([64, 10])


In [53]:
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR

# Number of optimisation steps in one pass over the training loader.
num_batches = len(train_loader)

# The network ends in LogSoftmax, so negative log-likelihood is the matching loss.
loss_fn = nn.NLLLoss()

optimizer = optim.Adam(params=model.parameters(), lr=0.01)

# Cosine decay from the initial lr towards eta_min, stepped once per batch by the
# training cell. The variable name keeps its original spelling ("schduler")
# because the training cell references it under that name.
# NOTE(review): T_max is a float covering 4.5 epochs of steps while the training
# cell runs 5 epochs, so the schedule passes its minimum before training ends and
# the lr starts rising again — confirm this is intended.
schduler = CosineAnnealingLR(optimizer, T_max=num_batches * 4.5, eta_min=0)

In [54]:
# Training: 5 passes over train_loader, stepping the optimiser and the LR
# scheduler once per batch and logging progress every 50 batches.
model.train()  # training mode, so the dropout layers are active
# Follow whatever device the model lives on instead of hard-coding "mps";
# `device` is also read by the evaluation cell.
device = next(model.parameters()).device
size = len(train_loader.dataset)
for epoch in range(5):
    # Samples consumed so far in this epoch. Counting them directly keeps the
    # progress readout correct for any loader batch size (it used to assume 256).
    current = 0
    for batch, (X, y) in enumerate(train_loader):
        X, y = X.to(device), y.to(device)
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation; gradients are cleared after the update so the next
        # batch starts from zero.
        loss.backward()
        optimizer.step()
        schduler.step()
        optimizer.zero_grad()

        current += len(X)
        if batch % 50 == 0:
            # The lr shown is the value after this batch's scheduler step.
            print(f"{epoch}: loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}] lr: {optimizer.param_groups[-1]['lr']}")


0: loss: 0.089366  [  256/60000] lr: 0.00999997793628279
0: loss: 0.465066  [13056/60000] lr: 0.009942721923902106
0: loss: 0.348949  [25856/60000] lr: 0.00977661137830554
0: loss: 0.249123  [38656/60000] lr: 0.009505304584580038
0: loss: 0.233347  [51456/60000] lr: 0.009134776585393183
1: loss: 0.299617  [  256/60000] lr: 0.008820657446846741
1: loss: 0.148486  [13056/60000] lr: 0.008301263562864149
1: loss: 0.207932  [25856/60000] lr: 0.007709165304714678
1: loss: 0.184065  [38656/60000] lr: 0.007057402568330402
1: loss: 0.231727  [51456/60000] lr: 0.006360329252135888
2: loss: 0.240985  [  256/60000] lr: 0.005853608876153383
2: loss: 0.148642  [13056/60000] lr: 0.005115107291304366
2: loss: 0.244406  [25856/60000] lr: 0.004374070676029285
2: loss: 0.107316  [38656/60000] lr: 0.003646819024876398
2: loss: 0.131690  [51456/60000] lr: 0.0029493687434286055
3: loss: 0.169223  [  256/60000] lr: 0.002487147225501937
3: loss: 0.226475  [13056/60000] lr: 0.0018750910388682395
3: loss: 0.174

In [55]:
# Evaluation on the held-out split: accumulate per-batch loss and the number of
# correct predictions, then report accuracy and the mean of the per-batch losses.
model.eval()  # inference mode, so the dropout layers are disabled
size, num_batches = len(test_loader.dataset), len(test_loader)
test_loss = correct = 0

# No gradients are needed for scoring; no_grad() avoids building the autograd
# graph and the memory that goes with it.
with torch.no_grad():
    for X, y in test_loader:
        X = X.to(device)
        y = y.to(device)
        pred = model(X)
        test_loss = test_loss + loss_fn(pred, y).item()
        # argmax over the class dimension gives the predicted label per sample.
        correct = correct + pred.argmax(dim=1).eq(y).float().sum().item()

# test_loss becomes an average over batches (not samples); correct becomes the
# fraction of the dataset classified correctly.
test_loss = test_loss / num_batches
correct = correct / size
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

Test Error: 
 Accuracy: 98.4%, Avg loss: 0.050638 

