# PyTorch Tutorial 2 - Transfer Learning

Jinhyeok Jeong 
2024-05-28

Load packages

In [1]:
import torch
from torch import nn 

import torchvision
from torchvision import datasets 
import torchvision.transforms as transforms

from torch.utils.data import DataLoader

import numpy as np 
import matplotlib.pyplot as plt 

from torchsummary import summary

## Torchsummary 

Instead of using print() function, summary() function from torchsummary package can be used to examine the detailed information of the network.

<code>
pip install torch-summary
</code>


https://pypi.org/project/torch-summary/

In [3]:
import torch
from torch import nn 

import torchvision
from torchvision import datasets 
from torchvision.transforms import ToTensor

from torch.utils.data import DataLoader

import numpy as np 
import matplotlib.pyplot as plt 

from torchsummary import summary

In [5]:
# define network
class simpleNN(nn.Module):

    def __init__(self):

        super().__init__()

        self.flatten = nn.Flatten() 
        self.Linear1 = nn.Linear(28*28, 512)
        self.relu1 = nn.ReLU()
        self.Linear2 = nn.Linear(512, 512)
        self.relu2 = nn.ReLU() 
        self.Linear3 = nn.Linear(512, 512)
        self.relu3 = nn.ReLU()
        self.Linear4 = nn.Linear(512, 10) 

    def forward(self, x):
        x = self.flatten(x) 
        x = self.relu1(self.Linear1(x))
        x = self.relu2(self.Linear2(x))
        x = self.relu3(self.Linear3(x))
        z = self.Linear4(x)
        
        return z
    
model = simpleNN()

print('print output:')
print(model)

print('\n\nsummary output:')
_ = summary(model)

print output:
simpleNN(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (Linear1): Linear(in_features=784, out_features=512, bias=True)
  (relu1): ReLU()
  (Linear2): Linear(in_features=512, out_features=512, bias=True)
  (relu2): ReLU()
  (Linear3): Linear(in_features=512, out_features=512, bias=True)
  (relu3): ReLU()
  (Linear4): Linear(in_features=512, out_features=10, bias=True)
)


summary output:
Layer (type:depth-idx)                   Param #
├─Flatten: 1-1                           --
├─Linear: 1-2                            401,920
├─ReLU: 1-3                              --
├─Linear: 1-4                            262,656
├─ReLU: 1-5                              --
├─Linear: 1-6                            262,656
├─ReLU: 1-7                              --
├─Linear: 1-8                            5,130
Total params: 932,362
Trainable params: 932,362
Non-trainable params: 0


summary can get additional arguments such as the sahpe of input data and batch_dimension.



In [6]:
# if the shape of input data is provided, summary would show output shape information
# The default value of batch_dim is 0, and batch dimension should not be included in the input data shape.
_ = summary(model, (1, 28, 28))

Layer (type:depth-idx)                   Output Shape              Param #
├─Flatten: 1-1                           [-1, 784]                 --
├─Linear: 1-2                            [-1, 512]                 401,920
├─ReLU: 1-3                              [-1, 512]                 --
├─Linear: 1-4                            [-1, 512]                 262,656
├─ReLU: 1-5                              [-1, 512]                 --
├─Linear: 1-6                            [-1, 512]                 262,656
├─ReLU: 1-7                              [-1, 512]                 --
├─Linear: 1-8                            [-1, 10]                  5,130
Total params: 932,362
Trainable params: 932,362
Non-trainable params: 0
Total mult-adds (M): 0.93
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 3.56
Estimated Total Size (MB): 3.57


In [7]:
# if batch_dimension is None, input data can include batch size information.
_ = summary(model, (1000, 28, 28), batch_dim=None)

Layer (type:depth-idx)                   Output Shape              Param #
├─Flatten: 1-1                           [1000, 784]               --
├─Linear: 1-2                            [1000, 512]               401,920
├─ReLU: 1-3                              [1000, 512]               --
├─Linear: 1-4                            [1000, 512]               262,656
├─ReLU: 1-5                              [1000, 512]               --
├─Linear: 1-6                            [1000, 512]               262,656
├─ReLU: 1-7                              [1000, 512]               --
├─Linear: 1-8                            [1000, 10]                5,130
Total params: 932,362
Trainable params: 932,362
Non-trainable params: 0
Total mult-adds (M): 0.93
Input size (MB): 2.99
Forward/backward pass size (MB): 11.80
Params size (MB): 3.56
Estimated Total Size (MB): 18.34


## Load pre-trained network

torchvision package has several pre-trained models that can be used for transfer learning 

(e.g., AlexNet, ConvNext, DenseNet, EfficientNet, GoogLeNet, Inception V3, ResNet, VGG, VisionTransfomer, ....)

https://pytorch.org/vision/stable/models.html

save & load model and weights:

https://pytorch.org/tutorials/beginner/saving_loading_models.html


In [22]:
alexnet

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

In [17]:
# for some other models, there could be more than one pre-trinaed weights that are available
# CAUTION: default value of an argument 'weights' is None (no pre-training)

alexnet = torchvision.models.alexnet(weights='DEFAULT') # for alexnet, DEFAULT = IMAGENET1K_V1
# resnet18 = torchvision.models.resnet18(weights='DEFAULT') # for resnet18, DEFAULT = IMAGENET1K_V1

_ = summary(alexnet,(1,3,224,224),batch_dim=None)

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [1, 256, 6, 6]            --
|    └─Conv2d: 2-1                       [1, 64, 55, 55]           23,296
|    └─ReLU: 2-2                         [1, 64, 55, 55]           --
|    └─MaxPool2d: 2-3                    [1, 64, 27, 27]           --
|    └─Conv2d: 2-4                       [1, 192, 27, 27]          307,392
|    └─ReLU: 2-5                         [1, 192, 27, 27]          --
|    └─MaxPool2d: 2-6                    [1, 192, 13, 13]          --
|    └─Conv2d: 2-7                       [1, 384, 13, 13]          663,936
|    └─ReLU: 2-8                         [1, 384, 13, 13]          --
|    └─Conv2d: 2-9                       [1, 256, 13, 13]          884,992
|    └─ReLU: 2-10                        [1, 256, 13, 13]          --
|    └─Conv2d: 2-11                      [1, 256, 13, 13]          590,080
|    └─ReLU: 2-12                        [1, 256, 13, 13]    

there are various ways to see the layers of a model. children() and module() will return a generator making a list of layers in the network.

In [42]:
alexnet.children

<bound method Module.children of AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216

## Modify the network for transfer learning

During transfer learning, you may want to freeze the weights for the earlier layers.
Freezing weights can be done by making gradients not computed.

In [24]:
for param in alexnet.parameters():
    param.requires_grad = False

Once the pre-trained weights are freezed, final classification layer can be modified for transfer learning

In [47]:
alexnet.classifier

Sequential(
  (0): Dropout(p=0.5, inplace=False)
  (1): Linear(in_features=9216, out_features=4096, bias=True)
  (2): ReLU(inplace=True)
  (3): Dropout(p=0.5, inplace=False)
  (4): Linear(in_features=4096, out_features=4096, bias=True)
  (5): ReLU(inplace=True)
  (6): Linear(in_features=4096, out_features=1000, bias=True)
)

In [48]:
alexnet.classifier[-1]

Linear(in_features=4096, out_features=1000, bias=True)

In [49]:
alexnet.classifier[-1].in_features

4096

In [99]:
n_in_features = alexnet.classifier[-1].in_features
n_newclass = 10

alexnet.classifier[-1] = nn.Linear( n_in_features, n_newclass )

Frozen weights are indicated by parenthesis.

In [70]:
_ = summary(alexnet)

Layer (type:depth-idx)                   Param #
├─Sequential: 1-1                        --
|    └─Conv2d: 2-1                       (23,296)
|    └─ReLU: 2-2                         --
|    └─MaxPool2d: 2-3                    --
|    └─Conv2d: 2-4                       (307,392)
|    └─ReLU: 2-5                         --
|    └─MaxPool2d: 2-6                    --
|    └─Conv2d: 2-7                       (663,936)
|    └─ReLU: 2-8                         --
|    └─Conv2d: 2-9                       (884,992)
|    └─ReLU: 2-10                        --
|    └─Conv2d: 2-11                      (590,080)
|    └─ReLU: 2-12                        --
|    └─MaxPool2d: 2-13                   --
├─AdaptiveAvgPool2d: 1-2                 --
├─Sequential: 1-3                        --
|    └─Dropout: 2-14                     --
|    └─Linear: 2-15                      (37,752,832)
|    └─ReLU: 2-16                        --
|    └─Dropout: 2-17                     --
|    └─Linear: 2-18        

Alternative way of modifying the pre-trained model:

You may define a class and took the pre-trained network as a part of new model. If you want to build more complex model based on the pre-trained network, this might be better

In [134]:
# define network
class modified_alex(nn.Module):

    def __init__(self,n_class=10):

        super().__init__()

        alexnet = torchvision.models.alexnet(weights='DEFAULT')

        for param in alexnet.parameters():
            param.requires_grad = False

        self.features = nn.ModuleList(alexnet.children())
        n_in_features = self.features[2][-1].in_features

        self.features[2] = self.features[2][:-1] # drop out the final layer

        self.final =  nn.Linear(n_in_features,n_class)


    def forward(self, x):
        x = self.features(x) 
        z = self.final(x)
        
        return z
    
alexnet2 = modified_alex()

_ = summary(alexnet2)

Layer (type:depth-idx)                   Param #
├─ModuleList: 1-1                        --
|    └─Sequential: 2-1                   --
|    |    └─Conv2d: 3-1                  (23,296)
|    |    └─ReLU: 3-2                    --
|    |    └─MaxPool2d: 3-3               --
|    |    └─Conv2d: 3-4                  (307,392)
|    |    └─ReLU: 3-5                    --
|    |    └─MaxPool2d: 3-6               --
|    |    └─Conv2d: 3-7                  (663,936)
|    |    └─ReLU: 3-8                    --
|    |    └─Conv2d: 3-9                  (884,992)
|    |    └─ReLU: 3-10                   --
|    |    └─Conv2d: 3-11                 (590,080)
|    |    └─ReLU: 3-12                   --
|    |    └─MaxPool2d: 3-13              --
|    └─AdaptiveAvgPool2d: 2-2            --
|    └─Sequential: 2-3                   --
|    |    └─Dropout: 3-14                --
|    |    └─Linear: 3-15                 (37,752,832)
|    |    └─ReLU: 3-16                   --
|    |    └─Dropout: 3-17  

## Training & Test

Load dataset. Depending on the architecture of a network, you may want to resize the input images

In [82]:
transform = transforms.Compose(
    [transforms.Resize(size=(224,224)),
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

batch_size = 4

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


To speed up the training, I will use the GPU

In [90]:
# use GPU if it is available. If not, CPU will be used.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    print('GPU')
else:
    device = torch.device('cpu')
    print('CPU')

GPU


Either CPU or GPU could be used, but the network and the data need to be on the same device

In [100]:
import torch.optim as optim

alexnet = alexnet.to(device) # load the model to the GPU

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(alexnet.parameters(), lr=0.001, momentum=0.9)

n_epoch = 3

In [101]:
train_loss_epochs = []
train_acc_epochs = []

for epoch in range(n_epoch):  # loop over the dataset multiple times

    # running loss
    train_loss = 0.0
    train_acc = 0

    running_loss = 0.0

    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # load data to GPU
        inputs = inputs.to(device)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = alexnet(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # compute the number of correct predictions 
        train_acc += (outputs.argmax(1) == labels).sum().item()
        train_loss += loss.item() * inputs.size(0)

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')

[1,  2000] loss: 2.049
[1,  4000] loss: 1.674
[1,  6000] loss: 1.548
[1,  8000] loss: 1.485
[1, 10000] loss: 1.479
[1, 12000] loss: 1.387
[2,  2000] loss: 1.624
[2,  4000] loss: 1.297
[2,  6000] loss: 1.327
[2,  8000] loss: 1.321
[2, 10000] loss: 1.324
[2, 12000] loss: 1.391
[3,  2000] loss: 1.556
[3,  4000] loss: 1.197
[3,  6000] loss: 1.125
[3,  8000] loss: 1.210
[3, 10000] loss: 1.217
[3, 12000] loss: 1.251
Finished Training


What if you want to use custom weights saved in a file?

In [None]:
# define network
class simpleNN(nn.Module):

    def __init__(self):

        super().__init__()

        self.flatten = nn.Flatten() 
        self.Linear1 = nn.Linear(28*28, 512)
        self.relu1 = nn.ReLU()
        self.Linear2 = nn.Linear(512, 512)
        self.relu2 = nn.ReLU() 
        self.Linear3 = nn.Linear(512, 512)
        self.relu3 = nn.ReLU()
        self.Linear4 = nn.Linear(512, 10) 

    def forward(self, x):
        x = self.flatten(x) 
        x = self.relu1(self.Linear1(x))
        x = self.relu2(self.Linear2(x))
        x = self.relu3(self.Linear3(x))
        z = self.Linear4(x)
        
        return z
    
model = simpleNN()

# save model weights. you can also save the entirle model (torch.save(model,'simple_model_with_weight.pth'))
torch.save(model.state_dict(), 'simple_model_weight.pth')

torch.save(optimizer.state_dict(), 'optimizer_weight.pth')

# load model weights
model.load_state_dict(torch.load('simple_model_weight.pth'))