In [1]:
import torch
import torch.nn as nn
from torchinfo import summary


## VGG Network Architecture

-   Inputs are 224x224 images.
-   Convolution kernel shape is (3,3) and max pooling window shape is (2,2).
-   The number of output channels per convolutional block is 64 -> 128 -> 256 -> 512 -> 512.
-   VGG16 has 16 weight layers (13 convolutional layers and 3 fully connected layers).
-   VGG19 has 19 weight layers (16 convolutional layers and 3 fully connected layers).


## Key Comparisons

-   VGG (16 or 19 layers) was relatively deeper than other SOTA networks at the time. AlexNet, the winning model for ILSVRC 2012, had only 8 layers.
-   Stacking multiple small (3x3) receptive field filters with ReLU activations, instead of using one large (7x7 or 11x11) filter, leads to better learning of complex features. A stack of small filters also needs fewer parameters than a single large filter covering the same receptive field, with additional nonlinearity introduced in between.
-   Multiscale training and inference. Images were rescaled to varying sizes during training and evaluation so that similar characteristics were captured at different scales.
-   Consistency and simplicity of the VGG network make it easier to scale or modify for future improvements.


In [24]:
class VGG19(nn.Module):
    """VGG19 image classifier: 16 convolutional layers plus 3 fully connected layers.

    The feature extractor is made of five convolutional blocks containing
    2, 2, 4, 4 and 4 convolutions respectively. Every convolution uses a 3x3
    kernel with padding 1 (spatial size is preserved) and is followed by ReLU;
    every block ends with a 2x2, stride-2 max pool that halves height and width.
    Output channels per block are 64 -> 128 -> 256 -> 512 -> 512.

    The classifier's first linear layer is sized for a 512 x 7 x 7 feature map,
    which is what five halvings produce from a 224 x 224 input (224 / 2**5 = 7).
    Other input resolutions will fail at the first linear layer.

    Args:
        num_classes: Number of output units of the final linear layer.
    """

    def __init__(self, num_classes=1000):
        super(VGG19, self).__init__()

        # Feature extraction layers: convolutional and pooling layers
        self.feature_extractor = nn.Sequential(
            # Block 1: 2 convolutions, 3 -> 64 channels
            nn.Conv2d(
                3, 64, kernel_size=3, padding=1
            ),  # 3 input channels, 64 output channels, 3x3 kernel, 1 padding
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(
                kernel_size=2, stride=2
            ),  # Max pooling with 2x2 kernel and stride 2
            # Block 2: 2 convolutions, 64 -> 128 channels
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Block 3: 4 convolutions, 128 -> 256 channels.
            # The pool comes only after all four convolutions of the block.
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Block 4: 4 convolutions, 256 -> 512 channels
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Block 5: 4 convolutions, 512 -> 512 channels
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # Fully connected layers for classification
        self.classifier = nn.Sequential(
            nn.Linear(
                512 * 7 * 7, 4096
            ),  # 512 channels, 7x7 spatial dimensions after the five max pools
            nn.ReLU(),
            nn.Dropout(0.5),  # Dropout layer with 0.5 dropout probability
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(
                4096, num_classes
            ),  # Output layer with 'num_classes' output units
        )

    def forward(self, x):
        """Run the feature extractor, flatten per sample, then classify.

        Args:
            x: Batch of images; the classifier expects the feature extractor
                to produce 512 * 7 * 7 values per sample (i.e. 224 x 224 inputs).

        Returns:
            Tensor with one row per sample and `num_classes` columns (raw scores,
            no softmax applied).
        """
        x = self.feature_extractor(x)  # Pass input through the feature extractor layers
        x = x.view(x.size(0), -1)  # Flatten the output for the fully connected layers
        x = self.classifier(x)  # Pass flattened output through the classifier layers
        return x

In [25]:
# Instantiate the network with the default 1000-way output head.
model = VGG19(num_classes=1000)

In [26]:
# Per-layer report for a single 224x224 RGB image; rows are labelled by variable name.
summary(
    model,
    input_size=(1, 3, 224, 224),  # (batch_size, color_channels, height, width)
    row_settings=["var_names"],
    col_width=20,
    col_names=["input_size", "output_size", "num_params", "trainable"],
)


Layer (type (var_name))                  Input Shape          Output Shape         Param #              Trainable
VGG19 (VGG19)                            [1, 3, 224, 224]     [1, 1000]            --                   True
├─Sequential (feature_extractor)         [1, 3, 224, 224]     [1, 512, 7, 7]       --                   True
│    └─Conv2d (0)                        [1, 3, 224, 224]     [1, 64, 224, 224]    1,792                True
│    └─ReLU (1)                          [1, 64, 224, 224]    [1, 64, 224, 224]    --                   --
│    └─Conv2d (2)                        [1, 64, 224, 224]    [1, 64, 224, 224]    36,928               True
│    └─ReLU (3)                          [1, 64, 224, 224]    [1, 64, 224, 224]    --                   --
│    └─MaxPool2d (4)                     [1, 64, 224, 224]    [1, 64, 112, 112]    --                   --
│    └─Conv2d (5)                        [1, 64, 112, 112]    [1, 128, 112, 112]   73,856               True
│    └─ReLU (6)     

In [28]:
# Sanity check: one random 224x224 RGB image in, one row of class scores out.
model(torch.randn(1, 3, 224, 224)).shape

torch.Size([1, 1000])