In [None]:
import torch
from torch import nn
import torchsummary
import torchvision

torch.__version__

'2.1.0+cu118'

#### AlexNet:
https://pytorch.org/vision/stable/_modules/torchvision/models/alexnet.html#alexnet

In [None]:
class AlexNet(nn.Module):
    """AlexNet-style classifier: five convolutions followed by three linear layers.

    The first linear layer expects a flattened 256 x 6 x 6 feature map, which
    is what the convolutional stack produces for 3 x 224 x 224 inputs.

    Args:
        num_classes: Width of the final linear layer (number of output logits).
    """

    def __init__(self, num_classes=1000):
        super().__init__()

        # One row per convolution:
        # (out_channels, kernel_size, stride, padding, followed_by_max_pool)
        conv_plan = (
            (64, 11, 4, 2, True),
            (192, 5, 1, 2, True),
            (384, 3, 1, 1, False),
            (256, 3, 1, 1, False),
            (256, 3, 1, 1, True),
        )
        feature_layers = []
        channels = 3  # channels of the input image
        for width, kernel, step, pad, pooled in conv_plan:
            feature_layers.append(
                nn.Conv2d(channels, width, kernel_size=kernel, stride=step, padding=pad)
            )
            feature_layers.append(nn.ReLU())
            if pooled:
                feature_layers.append(nn.MaxPool2d(kernel_size=3, stride=2))
            channels = width
        self.features = nn.Sequential(*feature_layers)

        self.flatten = nn.Flatten()

        # Two dropout-regularised hidden layers, then the logits layer.
        widths = (256 * 6 * 6, 4096, 4096)
        head_layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            head_layers.extend((nn.Dropout(), nn.Linear(fan_in, fan_out), nn.ReLU()))
        head_layers.append(nn.Linear(widths[-1], num_classes))
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for the image batch ``x``."""
        feature_map = self.features(x)
        return self.classifier(self.flatten(feature_map))

In [None]:
# Build the network with its default 1000-way output layer.
model = AlexNet()

In [None]:
# Print the module tree (layer types and hyper-parameters) to check the definition.
print(model)

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU()
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): Li

In [None]:
# torchsummary defaults to device="cuda" and would feed a CUDA tensor to this
# CPU-resident model whenever a GPU is visible; pin the probe input to the CPU.
torchsummary.summary(model, (3, 224, 224), device="cpu")

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 55, 55]          23,296
              ReLU-2           [-1, 64, 55, 55]               0
         MaxPool2d-3           [-1, 64, 27, 27]               0
            Conv2d-4          [-1, 192, 27, 27]         307,392
              ReLU-5          [-1, 192, 27, 27]               0
         MaxPool2d-6          [-1, 192, 13, 13]               0
            Conv2d-7          [-1, 384, 13, 13]         663,936
              ReLU-8          [-1, 384, 13, 13]               0
            Conv2d-9          [-1, 256, 13, 13]         884,992
             ReLU-10          [-1, 256, 13, 13]               0
           Conv2d-11          [-1, 256, 13, 13]         590,080
             ReLU-12          [-1, 256, 13, 13]               0
        MaxPool2d-13            [-1, 256, 6, 6]               0
          Flatten-14                 [-

#### VGG
https://pytorch.org/vision/stable/_modules/torchvision/models/vgg.html

In [None]:
# Output width of every 3x3 convolution, in order; "M" closes a block with a
# 2x2, stride-2 max-pool.
_VGG16_LAYOUT = (
    64, 64, "M",                # Block 1
    128, 128, "M",              # Block 2
    256, 256, 256, "M",         # Block 3
    512, 512, 512, "M",         # Block 4
    512, 512, 512, "M",         # Block 5
)


def _make_vgg16_features(in_channels=3):
    """Build the 13-conv / 5-pool feature extractor shared by VGG and VGG2."""
    layers = []
    channels = in_channels
    for item in _VGG16_LAYOUT:
        if item == "M":
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        else:
            # padding=1 keeps the spatial size; only the pools downsample.
            layers.append(nn.Conv2d(channels, item, kernel_size=3, padding=1))
            layers.append(nn.ReLU())
            channels = item
    return nn.Sequential(*layers)


def _make_vgg_classifier(in_features, num_classes):
    """Build the in_features -> 4096 -> 4096 -> num_classes fully connected head."""
    return nn.Sequential(
        nn.Linear(in_features, 4096),
        nn.ReLU(),
        nn.Linear(4096, 4096),
        nn.ReLU(),
        nn.Linear(4096, num_classes),
    )


class VGG(nn.Module):
    """VGG-16 style classifier whose head consumes the flattened feature map.

    The first linear layer is sized for a 512 x 7 x 7 map, i.e. the five
    stride-2 pools applied to a 224 x 224 input, so the input size is fixed.

    Args:
        num_classes: Width of the final linear layer (number of output logits).
    """

    def __init__(self, num_classes=1000):
        super().__init__()
        self.features = _make_vgg16_features()
        self.flatten = nn.Flatten()
        self.classifier = _make_vgg_classifier(512 * 7 * 7, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return logits of shape (batch, num_classes)."""
        x = self.features(x)
        x = self.flatten(x)
        x = self.classifier(x)
        return x


class VGG2(nn.Module):
    """Same feature extractor as ``VGG`` with a global-average-pooling head.

    The feature map is averaged down to 1 x 1 per channel before flattening,
    so the first linear layer only needs 512 inputs and the head no longer
    depends on the spatial size of the feature map.

    Args:
        num_classes: Width of the final linear layer (number of output logits).
    """

    def __init__(self, num_classes=1000):
        super().__init__()
        self.features = _make_vgg16_features()
        self.flatten = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
        )
        self.classifier = _make_vgg_classifier(512, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return logits of shape (batch, num_classes)."""
        x = self.features(x)
        x = self.flatten(x)
        x = self.classifier(x)
        return x

In [None]:
# Two heads on the same feature extractor, instantiated for comparison.
model = VGG()  # head fed by a plain Flatten of the 512 x 7 x 7 feature map
model2 = VGG2()  # head fed by global average pooling (GAP) followed by Flatten

In [None]:
# Print the module tree of the Flatten-based VGG.
print(model)

In [None]:
# torchsummary defaults to device="cuda" and would feed a CUDA tensor to this
# CPU-resident model whenever a GPU is visible; pin the probe input to the CPU.
torchsummary.summary(model, (3, 224, 224), device="cpu")

In [None]:
# Summary of the GAP variant. device="cpu" keeps torchsummary's probe input on
# the same device as the model (its default is "cuda").
torchsummary.summary(model2, (3, 224, 224), device="cpu")

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 224, 224]           1,792
              ReLU-2         [-1, 64, 224, 224]               0
            Conv2d-3         [-1, 64, 224, 224]          36,928
              ReLU-4         [-1, 64, 224, 224]               0
         MaxPool2d-5         [-1, 64, 112, 112]               0
            Conv2d-6        [-1, 128, 112, 112]          73,856
              ReLU-7        [-1, 128, 112, 112]               0
            Conv2d-8        [-1, 128, 112, 112]         147,584
              ReLU-9        [-1, 128, 112, 112]               0
        MaxPool2d-10          [-1, 128, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]         295,168
             ReLU-12          [-1, 256, 56, 56]               0
           Conv2d-13          [-1, 256, 56, 56]         590,080
             ReLU-14          [-1, 256,

#### Network in Network: Inception

In [None]:
# Basic Conv block
class BasicConv2d(nn.Module):
    """Conv2d followed by optional batch normalisation and optional ReLU.

    Args:
        in_channels: Channels of the input tensor.
        out_channels: Channels produced by the convolution.
        kernel_size: Forwarded to ``nn.Conv2d``.
        stride: Forwarded to ``nn.Conv2d``.
        padding: Forwarded to ``nn.Conv2d``.
        bn: When true, apply ``nn.BatchNorm2d`` to the convolution output.
        activation: When true, finish with ``nn.ReLU``.
    """

    def __init__(self, in_channels, out_channels, kernel_size,
                 stride=1,
                 padding=0,
                 bn=False,
                 activation=True):
        super().__init__()
        # Disabled stages are stored as None so forward() can skip them.
        if bn:
            self.bn = nn.BatchNorm2d(num_features=out_channels)
        else:
            self.bn = None
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size,
                              stride=stride,
                              padding=padding)
        if activation:
            self.activation = nn.ReLU()
        else:
            self.activation = None

    def forward(self, x):
        """Apply conv, then batch norm and ReLU when they are enabled."""
        out = self.conv(x)
        for stage in (self.bn, self.activation):
            if stage is not None:
                out = stage(out)
        return out

# InceptionV1 module
class Inception(nn.Module):
    """Inception (v1) module: four parallel branches concatenated along channels.

    Every branch preserves the spatial size, so the output has
    ``ch1x1 + ch3x3 + ch5x5 + pool_proj`` channels at the input resolution.

    Args:
        in_channels: Channels of the input tensor.
        ch1x1: Output channels of the 1x1 branch.
        ch3x3red: Channels after the 1x1 reduction in the 3x3 branch.
        ch3x3: Output channels of the 3x3 branch.
        ch5x5red: Channels after the 1x1 reduction in the 5x5 branch.
        ch5x5: Output channels of the 5x5 branch.
        pool_proj: Output channels of the 1x1 projection after max-pooling.
    """

    def __init__(
        self,
        in_channels,
        ch1x1,
        ch3x3red, ch3x3,
        ch5x5red, ch5x5,
        pool_proj):
        super().__init__()

        # Branch 1: plain 1x1 convolution.
        self.branch1 = BasicConv2d(in_channels, ch1x1, kernel_size=1)

        # Branch 2: 1x1 channel reduction, then a 3x3 convolution.
        reduce_3x3 = BasicConv2d(in_channels, ch3x3red, kernel_size=1)
        conv_3x3 = BasicConv2d(ch3x3red, ch3x3, kernel_size=3, padding=1)
        self.branch2 = nn.Sequential(reduce_3x3, conv_3x3)

        # Branch 3: 1x1 channel reduction, then a 5x5 convolution.
        reduce_5x5 = BasicConv2d(in_channels, ch5x5red, kernel_size=1)
        conv_5x5 = BasicConv2d(ch5x5red, ch5x5, kernel_size=5, padding=2)
        self.branch3 = nn.Sequential(reduce_5x5, conv_5x5)

        # Branch 4: stride-1 max-pool, then a 1x1 projection.
        pool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True)
        project = BasicConv2d(in_channels, pool_proj, kernel_size=1)
        self.branch4 = nn.Sequential(pool, project)

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate them on the channel axis."""
        branches = (self.branch1, self.branch2, self.branch3, self.branch4)
        return torch.cat([branch(x) for branch in branches], dim=1)

#### GoogLeNet (Inception v1)

https://github.com/pytorch/vision/blob/master/torchvision/models/googlenet.py

In [None]:
# A single Inception module on a 192-channel input; its four branches emit
# 64 + 128 + 32 + 32 = 256 output channels.
model = Inception(in_channels=192,
                  ch1x1=64,
                  ch3x3red=96, ch3x3=128,
                  ch5x5red=16, ch5x5=32,
                  pool_proj=32)

In [None]:
# torchsummary defaults to device="cuda" and would feed a CUDA tensor to this
# CPU-resident module whenever a GPU is visible; pin the probe input to the CPU.
torchsummary.summary(model, (192, 64, 64), device="cpu")

In [None]:
# Forward a random batch to confirm the concatenated shape: the branches keep
# the 64 x 64 resolution and together emit 256 channels.
print('\nInception block output: ', model(torch.rand(1, 192, 64, 64)).shape)

# ResNet
https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py

https://pytorch.org/hub/pytorch_vision_resnet/


In [None]:
class IdentityBlock(nn.Module):
    """Bottleneck residual block whose shortcut is the unmodified input.

    The main path is 1x1 -> kxk -> 1x1 convolutions, each batch-normalised,
    with ReLU after the first two. The input is added to the main-path output
    before the final ReLU, so the two must have the same shape: in practice
    ``in_dims == out_dims[2]`` and ``stride == 1``.

    Args:
        in_dims: Channels of the input tensor.
        out_dims: Three output widths, one per convolution of the main path.
        kernel_size: Kernel size of the middle convolution.
        stride: Stride of the first 1x1 convolution.
    """

    def __init__(self, in_dims, out_dims, kernel_size, stride=1):
        super().__init__()
        reduce_ch, mid_ch, expand_ch = out_dims
        self.conv1 = BasicConv2d(in_dims, reduce_ch, kernel_size=1, stride=stride, bn=True)
        self.conv2 = BasicConv2d(reduce_ch, mid_ch, kernel_size, padding='same', bn=True)
        # No activation here: ReLU is applied after the residual addition.
        self.conv3 = BasicConv2d(mid_ch, expand_ch, kernel_size=1, bn=True, activation=False)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(main_path(x) + x)``."""
        residual = self.conv3(self.conv2(self.conv1(x)))
        residual += x  # in-place add of the identity shortcut
        return self.relu(residual)

class ConvBlock(nn.Module):
    """Bottleneck residual block with a projected (1x1 conv + BN) shortcut.

    The main path is 1x1 -> kxk -> 1x1 convolutions, each batch-normalised,
    with ReLU after the first two. The shortcut applies a strided 1x1
    convolution so that its channels (``out_dims[2]``) and resolution match the
    main path, which lets the block change width and downsample.

    Args:
        in_dims: Channels of the input tensor.
        out_dims: Three output widths, one per convolution of the main path.
        kernel_size: Kernel size of the middle convolution.
        stride: Stride shared by the first 1x1 convolution and the shortcut.
    """

    def __init__(self, in_dims, out_dims, kernel_size, stride=1):
        super().__init__()
        reduce_ch, mid_ch, expand_ch = out_dims
        self.conv1 = BasicConv2d(in_dims, reduce_ch, kernel_size=1, stride=stride, bn=True)
        self.conv2 = BasicConv2d(reduce_ch, mid_ch, kernel_size, padding='same', bn=True)
        # No activation here: ReLU is applied after the residual addition.
        self.conv3 = BasicConv2d(mid_ch, expand_ch, kernel_size=1, bn=True, activation=False)
        self.relu = nn.ReLU()
        self.skip_branch = BasicConv2d(
            in_dims, expand_ch, kernel_size=1, stride=stride, bn=True, activation=False
        )

    def forward(self, x):
        """Return ``relu(main_path(x) + skip_branch(x))``."""
        projected = self.skip_branch(x)
        main = self.conv3(self.conv2(self.conv1(x)))
        main += projected  # in-place add of the projected shortcut
        return self.relu(main)

In [None]:
# The identity block has no conv layer on its shortcut, so input and output
# shapes are equal (256 channels in, 256 channels out, stride 1).
resblock1 = IdentityBlock(in_dims=256, out_dims=[64, 64, 256], kernel_size=3, stride=1)
# device="cpu": torchsummary defaults to "cuda" and would otherwise feed a CUDA
# tensor to this CPU-resident block whenever a GPU is visible.
torchsummary.summary(resblock1, (256, 64, 64), device="cpu")

In [None]:
# A block with a conv layer on its shortcut; stride=2 halves the resolution on
# both the main path and the shortcut.
resblock2 = ConvBlock(in_dims=256, out_dims=[64, 64, 256], kernel_size=3, stride=2)
# device="cpu": torchsummary defaults to "cuda" and would otherwise feed a CUDA
# tensor to this CPU-resident block whenever a GPU is visible.
torchsummary.summary(resblock2, (256, 64, 64), device="cpu")

In [None]:
class ResNetSimple(nn.Module):
    """Truncated ResNet: stem plus the first two bottleneck stages.

    The stem and the second stage each downsample, giving an overall stride
    of 8, so a 224 x 224 input yields a 512 x 28 x 28 feature map. Global
    average pooling then makes the linear head independent of the input size.

    Args:
        num_classes: Width of the final linear layer (number of output logits).
    """

    def __init__(self, num_classes=1000):
        super().__init__()

        # Stem: explicit zero padding, 7x7/2 conv + BN + ReLU, 3x3/2 max-pool.
        stem = [
            nn.ConstantPad2d(padding=3, value=0),
            BasicConv2d(3, 64, kernel_size=7, stride=2, bn=True),
            nn.ConstantPad2d(padding=1, value=0),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]

        # S1: widen to 256 channels at the same resolution (3 blocks).
        stage1 = [ConvBlock(in_dims=64, out_dims=[64, 64, 256], kernel_size=3)]
        stage1 += [IdentityBlock(256, [64, 64, 256], 3) for _ in range(2)]

        # S2: widen to 512 channels and halve the resolution (5 blocks).
        stage2 = [ConvBlock(256, [128, 128, 512], 3, stride=2)]
        stage2 += [IdentityBlock(512, [128, 128, 512], 3) for _ in range(4)]

        self.features = nn.Sequential(*stem, *stage1, *stage2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return logits of shape (bs, num_classes)."""
        feature_map = self.features(x)      # (bs, 512, H/8, W/8), e.g. 28 x 28 for 224 x 224
        pooled = self.avgpool(feature_map)  # (bs, 512, 1, 1)
        return self.classifier(pooled.flatten(1))  # flatten to (bs, 512) first

In [None]:
# Build the two-stage ResNet with its default 1000-way output layer.
model = ResNetSimple()

In [None]:
# torchsummary defaults to device="cuda" and would feed a CUDA tensor to this
# CPU-resident model whenever a GPU is visible; pin the probe input to the CPU.
torchsummary.summary(model, (3, 224, 224), device="cpu")

# SENet

In [None]:
class SEModule(nn.Module):
    """Squeeze-and-excitation gate: rescale each channel by a learned factor in (0, 1).

    The input is averaged over its spatial dimensions, passed through a
    two-layer 1x1-conv bottleneck ending in a sigmoid, and the resulting
    per-channel factors multiply the original input.

    Args:
        cin: Channels of the input (and output) tensor.
        ratio: Reduction factor of the bottleneck; the hidden width is
            ``cin / ratio`` rounded down, but never less than 1.
    """

    def __init__(self, cin, ratio=16):
        super().__init__()
        # int(cin / ratio) is 0 whenever cin < ratio, which would build a
        # zero-channel bottleneck; keep at least one hidden channel.
        cout = max(1, int(cin / ratio))
        self.gate = nn.Sequential(
            nn.Conv2d(cin, cout, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(cout, cin, kernel_size=1),
            nn.Sigmoid(),
        )

    def forward(self, inputs):
        """Return ``inputs`` scaled channel-wise by the gate; the shape is unchanged."""
        # Squeeze: global average over the two trailing (spatial) axes -> (N, C, 1, 1).
        x = inputs.mean((2, 3), keepdim=True)
        # Excite: per-channel factors, broadcast over the spatial axes below.
        x = self.gate(x)
        return inputs * x

#### [torchvision.models](https://pytorch.org/vision/stable/models.html) : Classification

*   AlexNet
*   VGG
*   ResNet
*   SqueezeNet
*   DenseNet
*   Inception v3
*   GoogLeNet (InceptionV1)
*   ShuffleNet v2
*   MobileNetV2
*   MobileNetV3
*   ResNeXt
*   Wide ResNet
*   MNASNet


### Data Preprocess

All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel **RGB** images, each of shape **(3 x H x W)**, where H and W are expected to be **at least 224**. The images have to be loaded into a range of **[0, 1]** and then normalized using **mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225]**. You can use the following transform to normalize:

In [None]:
# No `weights` argument is given here, so this presumably builds the
# architecture without pretrained parameters (the pretrained variant is loaded
# in a later cell).
model = torchvision.models.resnet50()
# Alternative architecture, kept for reference:
# model = torchvision.models.convnext_tiny()

In [None]:
# Print the module tree to see how torchvision names the ResNet submodules.
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [None]:
from torchvision import transforms
from torchvision.models import resnet50, ResNet50_Weights

# Manual form of the ImageNet normalisation described above.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225]
)

# Select the default pretrained weights once; the same enum member supplies
# both the checkpoint and its matching preprocessing transforms.
weights = ResNet50_Weights.DEFAULT
model = resnet50(weights=weights)
preprocess = weights.transforms()

# Apply it to the input image
# img_transformed = preprocess(img)

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:02<00:00, 49.9MB/s]


In [None]:
# Print the module tree of the pretrained ResNet-50.
print(model)

#### Replace FC layer

ResNet50

In [None]:
# Inspect the existing classification head before replacing it.
print(model.fc)

Linear(in_features=2048, out_features=1000, bias=True)


In [None]:
# Replace the 1000-way head with a 5-way linear layer; 2048 matches the
# in_features of the original `fc` printed above.
model.fc = nn.Linear(2048, 5)
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [None]:
# Alternative head: a small two-layer MLP (2048 -> 128 -> 5) in place of the
# single linear layer assigned above.
model.fc = nn.Sequential(
    nn.Linear(2048, 128),
    nn.ReLU(),
    nn.Linear(128, 5)
)

In [None]:
# Smoke-test the new head on a random batch. The model is switched to eval mode
# and run without autograd so that random noise neither updates the pretrained
# BatchNorm running statistics nor builds a gradient graph; the previous
# train/eval mode is restored afterwards.
inputs = torch.randn(2, 3, 224, 224)
was_training = model.training
model.eval()
with torch.no_grad():
    outputs = model(inputs)
model.train(was_training)
print(outputs.shape)

torch.Size([2, 5])


ConvNeXt

In [None]:
# `model` is still the ResNet-50 from the cells above, whose head is `fc`;
# assigning `model.classifier` there would only attach an unused module.
# Build a ConvNeXt explicitly so the head swap targets the right architecture.
convnext_model = torchvision.models.convnext_tiny()
# Swap only the final Linear layer so the normalisation and flatten steps that
# precede it in torchvision's ConvNeXt classifier are kept.
convnext_model.classifier[-1] = nn.Linear(
    in_features=convnext_model.classifier[-1].in_features,
    out_features=6,
)

In [None]:
# Alternative head swap for a model exposing a 512-feature `fc` layer
# (kept for reference, not executed):
# model.fc = nn.Linear(in_features=512, out_features=6)
print(model)

#### Model summary

In [None]:
# torchsummary defaults to device="cuda" and would feed a CUDA tensor to this
# CPU-resident model whenever a GPU is visible; pin the probe input to the CPU.
torchsummary.summary(model, (3, 224, 224), device="cpu")

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,

#### Simple inference

In [None]:
from torchvision.io import ImageReadMode, read_image
from torchvision.models import resnet50, ResNet50_Weights

# Decode as 3-channel RGB so grayscale or RGBA files still match the
# 3-channel input the model expects.
img = read_image("test.jpg", mode=ImageReadMode.RGB)

# Step 1: Initialize model with the best available weights
weights = ResNet50_Weights.DEFAULT
model = resnet50(weights=weights)
model.eval()  # inference mode for BatchNorm layers

# Step 2: Initialize the inference transforms
preprocess = weights.transforms()

# Step 3: Apply inference preprocessing transforms
batch = preprocess(img).unsqueeze(0)  # add the batch dimension

# Step 4: Use the model and print the predicted category
# Gradients are not needed for inference; skip building the autograd graph.
with torch.no_grad():
    prediction = model(batch).squeeze(0).softmax(0)
class_id = prediction.argmax().item()
score = prediction[class_id].item()
category_name = weights.meta["categories"][class_id]
print(f"{category_name}: {100 * score:.1f}%")

In [None]:
# Submodules are reached by attribute name and Sequential index. `model` is the
# ResNet-50 built above, which has no `features` attribute (that indexing
# presumably targeted the ConvNeXt variant), so walk its own hierarchy instead:
# first bottleneck of `layer1`, then its first convolution.
print(model.layer1[0].conv1)

In [None]:
# Walk only the direct children of the model (one level deep).
for position, child in enumerate(model.children()):
    print(position, child)

In [None]:
# Walk every module recursively: the model itself comes first, followed by
# nested containers and their leaf layers.
for position, module in enumerate(model.modules()):
    print(position, module)