# CNN's with PyTorch

Notebook inspired by [Hands-On Machine Learning with Scikit-Learn and PyTorch](https://www.oreilly.com/library/view/hands-on-machine-learning/9798341607972/).

## Setup and Load in Images

In [1]:
import numpy as np
import torch
from sklearn.datasets import load_sample_images

# load and stack images into array
sample_images = np.stack(load_sample_images()['images'])

# turn to tensor and normalize pixel vals
sample_images = torch.tensor(sample_images, dtype=torch.float32) / 255

In [2]:
# 2 images, 427 x 640, 3 color channels
sample_images.shape

torch.Size([2, 427, 640, 3])

In [3]:
# move channel to be just b4 height and width
sample_images_permuted = sample_images.permute(0, 3, 1, 2)

sample_images_permuted.shape

torch.Size([2, 3, 427, 640])

In [4]:
# center-crop images
import torchvision
import torchvision.transforms.v2 as T

cropped_images = T.CenterCrop((70,120))(sample_images_permuted)

cropped_images.shape

torch.Size([2, 3, 70, 120])

## Create 2D Convolutional Layer and Feed it Cropped Images

In [5]:
# conv layer w/ 32 filters, each 7x7
import torch.nn as nn

torch.manual_seed(42)

conv_layer = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=(7,7)) # or could do just 7 for kernel size

# feature maps
fmaps = conv_layer(cropped_images)

# output shape
fmaps.shape

# notice:
# 32 feature maps
# H & W shrunk by 6 pixels (no zero padding)

torch.Size([2, 32, 64, 114])

In [6]:
# if you want it to stay the same, add padding
conv_layer = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=(7,7), padding='same')

# feature maps
fmaps = conv_layer(cropped_images)

# output shape
fmaps.shape

torch.Size([2, 32, 70, 120])

In [7]:
# layer attributes
print(conv_layer.weight.shape) # out chan, in chan, kernel height, kernel width
print(conv_layer.bias.shape) # output channels

torch.Size([32, 3, 7, 7])
torch.Size([32])


## Pooling Layers

In [8]:
# 2x2 pooling layer; default stride of 2, padding = 0 / 'valid'
max_pool = nn.MaxPool2d(kernel_size = (2,2))

In [9]:
# custom depth-wise pooling layer; can allow CNN to learn translation
# invariance
import torch.nn.functional as F

class DepthPool(torch.nn.Module):
  def __init__(self, kernel_size, stride=None, padding=0):
    super().__init__()
    self.kernel_size = kernel_size
    self.stride = stride if stride is not None else kernel_size
    self.padding = padding

  # forward pass
  def forward(self, inputs):
    batch, channels, height, width = inputs.shape
    # merges spatial dimensions
    z = inputs.view(batch, channels, height * width)

    # swap spatial and channel dimensions
    z = z.permute(0,2,1)

    # max pool along last dimension which is the channels
    z = F.max_pool1d(z, kernel_size=self.kernel_size,
                     stride = self.stride, padding = self.padding)

    # move back spatial & channel dimensions
    z = z.permute(0,2,1)

    # unmerge spatial dimensions
    z = z.view(batch, -1, height, width)

    return z

In [10]:
# creating a global average pooling layer;
# computes mean of each entire feature map... very destructive
global_avg_pool = nn.AdaptiveAvgPool2d(output_size=(1,1))

output = global_avg_pool(cropped_images)

output.shape

torch.Size([2, 3, 1, 1])

In [11]:
# or you could do the mean to get the same output for this
output = cropped_images.mean(dim = (2,3), keepdim=True)

output.shape

torch.Size([2, 3, 1, 1])

## CNN Architectures

In [12]:
## basic CNN to tackle Fashion MNIST
from functools import partial

# little wrapper to allow us to reuse default arguments w/o repeating ourselves
DefaultConv2d = partial(nn.Conv2d, kernel_size=3, padding='same')

# basically stacking convolutional layers, ReLU, pooling over and over again,
# until it's time to flatten and proceed through FFN; sprinkle in dropout
# towards the end for regularization
model = nn.Sequential(
    DefaultConv2d(in_channels=1, out_channels=64, kernel_size=7),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    DefaultConv2d(in_channels=64, out_channels=128),
    nn.ReLU(),
    DefaultConv2d(in_channels=128, out_channels=128),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    DefaultConv2d(in_channels=128, out_channels=256),
    nn.ReLU(),
    DefaultConv2d(in_channels=256, out_channels=256),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Flatten(),
    # images start out as 28x28... after pooling operations, they are down
    # to 3x3. Multiply that by the number of feature maps (256) at this point,
    # and you end up with 2304 input features
    nn.Linear(in_features=2304, out_features=128),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(in_features=128, out_features=64),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(in_features=64, out_features=10)
).to(device='cuda')

In [16]:
# set device depending on what's available
if torch.cuda.is_available():
  device = 'cuda'
elif torch.backends.mps.is_available():
  device = 'mps'
else:
  device = 'cpu'

# create tensor object we'll transform FashionMNIST data to
toTensor = T.Compose([T.ToImage(), T.ToDtype(torch.float32, scale = True)])

# bring in train, test, valid data
train_and_valid_data = torchvision.datasets.FashionMNIST(
    root = 'datasets',
    train = True,
    download = True,
    transform = toTensor
)

test_data = torchvision.datasets.FashionMNIST(
    root = 'datasets',
    train = False,
    download = True,
    transform = toTensor
)

# reproducibility
torch.manual_seed(42)

# save back 5_000 from train to be reserved for validation
train_data, valid_data = torch.utils.data.random_split(
    train_and_valid_data,
    [55_000, 5_000]
)

from torch.utils.data import DataLoader

# create data loaders
train_loader = DataLoader(train_data, batch_size = 32, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = 32)
test_loader = DataLoader(test_data, batch_size = 32)

In [14]:
# set model training params
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
n_epochs = 100

In [17]:
# train function to implement mb gd
def train_mbgd(model, optimizer, criterion, train_loader, n_epochs):
  model.train() # set training mode
  for epoch in range(n_epochs):
    total_loss = 0
    for X_batch, y_batch in train_loader:
      # get batch
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      # mod pred
      y_pred = model(X_batch)
      # calc loss and tally
      loss = criterion(y_pred, y_batch)
      total_loss += loss.item()
      # calc grads and do step
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

    mean_loss = total_loss / len(train_loader)
    if epoch % 10 == 0: # every ten epochs, print out loss
      print(f'Epoch {epoch + 1}, Loss: {mean_loss}')

In [18]:
model.to(device) # Move model to the correct device
train_mbgd(model, optimizer, criterion, train_loader, n_epochs)

Epoch 1, Loss: 2.302686505431419
Epoch 11, Loss: 0.4731756708974267
Epoch 21, Loss: 0.3492205565824559
Epoch 31, Loss: 0.28776134889726723
Epoch 41, Loss: 0.2467984036987278
Epoch 51, Loss: 0.21463477263826317
Epoch 61, Loss: 0.18125992719248726
Epoch 71, Loss: 0.15336713460035656
Epoch 81, Loss: 0.13641530665820395
Epoch 91, Loss: 0.11388902980094714


In [19]:
## create evaluation function
def evaluate(model, data_loader, metric, aggregate = torch.mean):
  model.eval() # change model mode to evaluation (no gradient work)
  metrics = []

  with torch.no_grad():
    for X_batch, y_batch in data_loader:
      # move data to GPU / cuda
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      y_pred = model(X_batch)
      metric_val = metric(y_pred, y_batch)
      metrics.append(metric_val)

  # retrun agg met over all batches
  return aggregate(torch.stack(metrics))

In [20]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.15.2 torchmetrics-1.8.2


In [21]:
# use accuracy metric to evaluate predictive ability
import torchmetrics
accuracy = torchmetrics.Accuracy(task = 'multiclass', num_classes = 10).to(device)

# accuracy on validation data
# calc batch-wise accuracy w/ lambda func
# get average of batches via aggregate
accuracy_val = evaluate(model, valid_loader,
                        lambda y_pred, y_batch: (y_pred.argmax(dim=1)
                        == y_batch).float().mean(),
                        aggregate = torch.mean)

print(f'Validation Accuracy: {accuracy_val.item()*100:.4f}%')

Validation Accuracy: 91.1027%


## Separable Convolutional Layers

In [22]:
# CNN layer that considers spatial and cross-channel patterns separately
# in practice, uses fewer params, less memory, fewer computations,
# often better performance. Just don't use after layers that have
# not a lot of channels
class SeparableConv2d(nn.Module):
  def __init__(self, in_channels, out_channels, kernel_size, stride = 1,
               padding = 0):
    super().__init__()
    # groups = in_channels leads to depthwise convolutional layer
    self.depthwise = nn.Conv2d(in_channels, in_channels, kernel_size,
                               stride = stride, padding = padding,
                               groups = in_channels)

    self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size = 1,
                               stride = 1, padding = 0)

    def forward(self, inputs):
      return self.pointwise(self.depthwise(inputs))

## ResNet-34 CNN

In [23]:
# residual unit layer
class ResidualUnit(nn.Module):
  def __init__(self, in_channels, out_channels, stride = 1):
    super().__init__()
    DefaultConv2d = partial(
        nn.Conv2d, kernel_size=3, padding=1, stride = 1, bias=False
    )
    self.main_layers = nn.Sequential(
        DefaultConv2d(in_channels, out_channels, stride = stride),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(),
        DefaultConv2d(out_channels, out_channels),
        nn.BatchNorm2d(out_channels)
    )
    if stride > 1:
      self.skip_connection = nn.Sequential(
          nn.Conv2d(in_channels, out_channels, kernel_size=1,
                    stride = stride, padding = 0),
          nn.BatchNorm2d(out_channels)
      )
    else:
      self.skip_connection = nn.Identity() # identity doesn't do anything; just
                                           # return inputs

  def forward(self, inputs):
    return F.relu(self.main_layers(inputs) + self.skip_connection(inputs))

In [24]:
# building actual resnet model now; with residual unit, this makes it
# easy to build sequential style
class ResNet34(nn.Module):
  def __init__(self):
    super().__init__()
    layers = [
        nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7,
                  stride=2, padding=3, bias=False),
        nn.BatchNorm2d(num_features=64),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    ]

    # first three residual units have 64 filters
    # next 4 have 128 and so forth...
    prev_filters = 64

    for filters in [64] * 3 + [128] * 4 + [256] * 6 + [512] * 3:
      stride = 1 if filters == prev_filters else 2
      layers.append(ResidualUnit(prev_filters, filters, stride=stride))
      prev_filters = filters

    # add RUs to the list of layers
    # lazy linear allows us to not have to figure out shape of input
    layers += [
        nn.AdaptiveAvgPool2d(output_size=(1, 1)),
        nn.Flatten(),
        nn.LazyLinear(10)
    ]

    # create overall sequential model w/ all layers
    self.resnet = nn.Sequential(*layers)

  # forward pass
  def forward(self, x):
    return self.resnet(x)

## Training ResNet-34

In [40]:
# bring in 10-class subset of ImageNet
from torchvision.datasets import Imagenette
from torchvision import transforms

# perform some data augmentation
train_tfms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.485,0.456,0.406),(0.229,0.224,0.225)),
])

val_tfms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.485,0.456,0.406),(0.229,0.224,0.225)),
])

# load in, perform transformations
train_ds = Imagenette(root="./data", split="train", size="160px",
                      download=True, transform=train_tfms)

val_ds = Imagenette(root="./data", split="val", size="160px",
                      download=True, transform=val_tfms)

model = ResNet34().to(device)

In [41]:
# create data loaders
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True,
                          num_workers=4, pin_memory=True)

val_loader   = DataLoader(val_ds,   batch_size=128, shuffle=False,
                          num_workers=4, pin_memory=True)

In [42]:
# set up training criteria; use label smoothing to help with generalization
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

# using SGD w/ Nesterov momentum, weight decay
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.05,
    momentum=0.9,
    weight_decay=1e-4,
    nesterov=True
)

# cosine annealing for 40 epochs; add warm restarts
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer,
                                                       T_0=40,
                                                       T_mult=2)

In [43]:
# training loop
def train_mbgd(model, optimizer, criterion, train_loader, n_epochs,
               val_loader=None):
    for epoch in range(n_epochs):
        model.train()
        total_loss = 0.0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad(set_to_none=True)
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        mean_loss = total_loss / len(train_loader)

        if scheduler is not None:
            scheduler.step()  # advance schedule for next epoch

        if val_loader is not None:
            val_acc = evaluate(
                model,
                val_loader,
                lambda y_pred, y: (y_pred.argmax(dim=1) == y).float().mean(),
                aggregate=torch.mean
            ).item()
            print(f"Epoch {epoch+1:03d}, train loss={mean_loss:.4f}, validation accuracy={val_acc*100:.2f}%")
        else:
            print(f"Epoch {epoch+1:03d}, train loss={mean_loss:.4f}")

In [44]:
# train! :-)
train_mbgd(model, optimizer, criterion, train_loader, n_epochs=40,
           val_loader=val_loader)

Epoch 001, train loss=2.6230, validation accuracy=27.53%
Epoch 002, train loss=2.0726, validation accuracy=38.02%
Epoch 003, train loss=1.9497, validation accuracy=44.88%
Epoch 004, train loss=1.8712, validation accuracy=49.64%
Epoch 005, train loss=1.7805, validation accuracy=49.93%
Epoch 006, train loss=1.7153, validation accuracy=55.65%
Epoch 007, train loss=1.6603, validation accuracy=58.40%
Epoch 008, train loss=1.5619, validation accuracy=56.18%
Epoch 009, train loss=1.5095, validation accuracy=62.73%
Epoch 010, train loss=1.4505, validation accuracy=66.11%
Epoch 011, train loss=1.4096, validation accuracy=68.20%
Epoch 012, train loss=1.3738, validation accuracy=61.73%
Epoch 013, train loss=1.3508, validation accuracy=67.37%
Epoch 014, train loss=1.3241, validation accuracy=67.22%
Epoch 015, train loss=1.2819, validation accuracy=63.81%
Epoch 016, train loss=1.2520, validation accuracy=74.58%
Epoch 017, train loss=1.2422, validation accuracy=75.24%
Epoch 018, train loss=1.2244, v

In [47]:
# validation accuracy
accuracy_val = evaluate(model, val_loader,
                        lambda y_pred, y_batch: (y_pred.argmax(dim=1)
                        == y_batch).float().mean(),
                        aggregate = torch.mean)

print(f'Validation Performance: {accuracy_val.item()*100:.3f}%')

Validation Performance: 83.415%
