# Transfer Learning

- ImageNet: 1M labeled images in 1000 categories
- Transfer learning: Using a pre-trained network as feature detectors on images not in the training set
- Pre-trained networks can be downloaded with `torchvision.models`
- Most of pre-trained models require input image of shape 224 x 224, each color channel was normalized separately, with means = `[0.485, 0.456, 0.406]` and stds are `[0.229, 0.224, 0.225]`

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import datasets, transforms, models

In [2]:
data_dir = 'Cat_Dog_data'

train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(255),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

test_transforms = transforms.Compose([
    transforms.Resize(255),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])


In [3]:
train_data = datasets.ImageFolder(data_dir + '/train', transform=train_transforms)
test_data = datasets.ImageFolder(data_dir + '/test', transform=test_transforms)


In [4]:
trainloader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(test_data, batch_size=32, shuffle=True)

In [5]:
model = models.densenet121(pretrained=True)
model

DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplac

Model is built out of **2 main parts**:
- *features*: Stack of convolution layers, works as a feature detector
- *classifier*: Linear layer with 1000 out_features as 1000 classes $\leftarrow$ Replace it in our problems


In [6]:
# Freeze paramaters so we don't backprop through them
for param in model.parameters():
    param.requires_grad = False

In [7]:
from collections import OrderedDict

In [8]:
classifier = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(1024, 128)), # output shape of features is 1024,
    ('relu', nn.ReLU()),
    ('fc2', nn.Linear(128, 2)),
    ('output', nn.LogSoftmax(dim=1))
]))

model.classifier = classifier

We can train with CPU or GPU, but with a deep networks like this, GPU can be much faster than CPU. 

Move model parameters and other tensors to GPU: `model.to('cuda')`, move back to CPU: `model.to('cpu')` (when operates on model's output)

In [9]:
import time

for device in ['cpu', 'cuda']:
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)
    
    model.to(device)
    for ii, (inputs, labels) in enumerate(trainloader):
        inputs, labels = inputs.to(device), labels.to(device)
        start = time.time()
        outputs = model.forward(inputs)
        
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if ii == 10:
            break
    print(f"Device = {device}; Time per batch: {(time.time() - start)/10:.3f} seconds")

Device = cpu; Time per batch: 0.616 seconds
Device = cuda; Time per batch: 0.003 seconds


**Exercise**: Train a pretrained models to classify the cat and dog images. Continue with the DenseNet model, or try ResNet, it's also a good model to try out first. Make sure you are only training the classifier and the parameters for the features part are frozen.

In [10]:
def validation(model, testloader, criterion, device):
    accuracy = 0
    test_loss = 0
    for images, labels in testloader:
        images, labels = images.to(device), labels.to(device)
        output = model.forward(images)
        test_loss += criterion(output, labels).item()

        ps = torch.exp(output)
        equality = (labels.data == ps.max(1)[1])
        accuracy += equality.type_as(torch.FloatTensor()).mean()

    return test_loss, accuracy

In [11]:
def train(model, trainloader, testloader, criterion, optimizer, epochs=5, print_every=200, device='cuda'):
    steps = 0
    running_loss = 0
    for e in range(epochs):
        # Model in training mode, dropout is on
        model.train()
        for images, labels in trainloader:
            steps += 1
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            output = model.forward(images)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if steps % print_every == 0:
                model.eval()
                with torch.no_grad():
                    test_loss, accuracy = validation(model, testloader, criterion, device)
                
                print("Epoch: {}/{}.. ".format(e+1, epochs),
                      "Training Loss: {:.3f}.. ".format(running_loss/print_every),
                      "Test Loss: {:.3f}.. ".format(test_loss/len(testloader)),
                      "Test Accuracy: {:.3f}".format(accuracy/len(testloader)))
                
                running_loss = 0
                
                # Make sure dropout and grads are on for training
                model.train()


In [12]:
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)

In [16]:
train(model, trainloader, testloader, criterion, optimizer, epochs=1, print_every=200)

Epoch: 1/1..  Training Loss: 0.177..  Test Loss: 0.049..  Test Accuracy: 0.983


KeyboardInterrupt: 