# Transfer Learning
The idea behind transfer learning is quite simple: you take a network that performs well on task A and reuse it for task B. This works well when the inputs for both tasks are similar. If the inputs are not similar, you can still use the network from task A as a starting point, but you need to adjust some of the weights to make them fit task B.

## How does it work?

In [5]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import torchvision

# You take a pretrained model
model = torchvision.models.resnet18(pretrained=True) # all models are pretrained on the imageNet dataset

# You freeze the pretrained layers so training does not update their weights
for param in model.parameters():
    param.requires_grad = False

# You replace for example the last layer with one sized for the new task
# (100 outputs here). `torch.nn` is spelled out because the short alias `nn`
# is only imported in a later cell, and the input width is read from the layer
# being replaced instead of being hard-coded.
model.fc = torch.nn.Linear(model.fc.in_features, 100)

# That's it. Now you can train your last layer with your new training data

In [12]:
# List every parameter that is frozen, i.e. excluded from gradient updates
for name, param in model.named_parameters():
    if param.requires_grad:
        continue  # still trainable, nothing to report
    print(name, param.data)

conv1.weight 
(0 ,0 ,.,.) = 
 -1.0419e-02 -6.1356e-03 -1.8098e-03  ...   5.6615e-02  1.7083e-02 -1.2694e-02
  1.1083e-02  9.5276e-03 -1.0993e-01  ...  -2.7124e-01 -1.2907e-01  3.7424e-03
 -6.9434e-03  5.9089e-02  2.9548e-01  ...   5.1972e-01  2.5632e-01  6.3573e-02
                 ...                   ⋱                   ...                
 -2.7535e-02  1.6045e-02  7.2595e-02  ...  -3.3285e-01 -4.2058e-01 -2.5781e-01
  3.0613e-02  4.0960e-02  6.2850e-02  ...   4.1384e-01  3.9359e-01  1.6606e-01
 -1.3736e-02 -3.6746e-03 -2.4084e-02  ...  -1.5070e-01 -8.2230e-02 -5.7828e-03

(0 ,1 ,.,.) = 
 -1.1397e-02 -2.6619e-02 -3.4641e-02  ...   3.2521e-02  6.6221e-04 -2.5743e-02
  4.5687e-02  3.3603e-02 -1.0453e-01  ...  -3.1253e-01 -1.6051e-01 -1.2826e-03
 -8.3730e-04  9.8420e-02  4.0210e-01  ...   7.0789e-01  3.6887e-01  1.2455e-01
                 ...                   ⋱                   ...                
 -5.5926e-02 -5.2239e-03  2.7081e-02  ...  -4.6178e-01 -5.7080e-01 -3.6552e-01
  3.286

layer3.1.conv1.weight 
( 0 , 0 ,.,.) = 
  4.8367e-02  4.8045e-02  3.8471e-02
  4.9888e-02  5.5208e-02  5.6701e-02
  2.4192e-02  1.3436e-02  2.4655e-02

( 0 , 1 ,.,.) = 
 -3.6542e-03 -3.1100e-03  4.9227e-03
 -1.2114e-03  3.4020e-03  1.9846e-02
 -2.1704e-02 -2.1158e-02 -2.8686e-03

( 0 , 2 ,.,.) = 
 -1.2536e-02 -2.0486e-02 -2.3154e-02
 -1.3515e-02 -2.3781e-02 -2.5515e-02
  1.0584e-02  7.2999e-03 -5.2329e-03
    ... 

( 0 ,253,.,.) = 
 -4.3596e-02 -1.8328e-02 -5.0577e-02
  1.6590e-02  5.0719e-02  2.1919e-02
 -1.9203e-02 -8.8315e-03 -2.0335e-02

( 0 ,254,.,.) = 
 -7.6949e-03 -1.5848e-02  1.5841e-03
 -6.2470e-03 -1.3135e-02  6.9092e-03
 -3.3791e-03  1.7889e-03  3.7373e-03

( 0 ,255,.,.) = 
 -6.6310e-03  5.8503e-03 -5.8571e-04
 -2.4600e-02 -8.9747e-03 -7.2466e-03
 -1.7566e-02 -8.5829e-03 -7.5220e-03
      ⋮  

( 1 , 0 ,.,.) = 
 -2.3679e-02 -9.4399e-03 -1.1688e-02
 -2.4777e-02 -1.7326e-02 -3.1489e-02
 -3.3683e-03  9.7571e-03 -5.1527e-03

( 1 , 1 ,.,.) = 
 -3.0809e-02 -4.0685e-02 -2.2731e-02
 

layer4.0.conv2.weight 
( 0 , 0 ,.,.) = 
  1.6218e-04 -1.4720e-02 -1.7000e-02
 -1.2850e-02 -3.3085e-02 -3.6656e-02
  2.7812e-02  1.7691e-02 -1.8369e-02

( 0 , 1 ,.,.) = 
  1.0528e-02  3.1379e-02  2.4801e-02
 -1.2698e-02 -2.9453e-02 -1.1834e-02
 -9.4094e-03 -8.9462e-03 -3.1349e-02

( 0 , 2 ,.,.) = 
 -7.8447e-03 -2.9256e-02  5.3590e-03
 -1.3791e-02 -1.1116e-02  5.0388e-03
 -2.4919e-03  7.3514e-03  5.4013e-03
    ... 

( 0 ,509,.,.) = 
 -1.0276e-03 -1.0275e-02 -2.9986e-02
 -3.8465e-03  1.9549e-03 -1.6291e-02
 -1.8100e-03  8.3778e-03 -8.5481e-03

( 0 ,510,.,.) = 
 -1.8196e-02 -1.3533e-02 -1.7457e-02
  2.2457e-02  5.7402e-02  1.9325e-02
 -2.4977e-02 -3.2113e-02 -8.1780e-03

( 0 ,511,.,.) = 
  3.6550e-03  4.9358e-03 -5.7597e-03
 -1.6875e-02  1.3999e-04  3.7629e-04
 -2.6272e-03  1.0947e-03  1.1145e-03
      ⋮  

( 1 , 0 ,.,.) = 
  1.4018e-02  3.9198e-03 -1.7189e-03
 -1.3175e-03  4.3503e-04 -1.1798e-02
 -9.8003e-03 -1.7693e-02 -1.9910e-02

( 1 , 1 ,.,.) = 
 -1.4957e-02 -1.9796e-02 -2.8724e-02
 

## Let's use transfer learning for real data

In [14]:
import os
import torchvision.models as models
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from torchvision import datasets, transforms
from torch.autograd import Variable


NUM_WORKERS = 1
LR = 1e-3

data_folder = "data/dogscats"

# ImageFolder expects one sub-directory per class inside each of these folders
traindir = os.path.join(data_folder, 'train')
testdir = os.path.join(data_folder, 'valid')


# Per-channel mean/std applied to every image after conversion to a tensor
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Training images are randomly cropped and flipped as data augmentation
train_dataset = datasets.ImageFolder(traindir,
                         transforms.Compose([
                             transforms.RandomResizedCrop(224),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             normalize,
                         ]))

train_loader = data.DataLoader(train_dataset,batch_size=50,shuffle=True,num_workers=NUM_WORKERS)

# Evaluation must be reproducible, so the validation images get a fixed
# resize + centre crop instead of random augmentation, and the loader keeps
# the dataset order (shuffling has no benefit when only measuring accuracy).
test_dataset = datasets.ImageFolder(testdir,
                         transforms.Compose([
                             transforms.Resize(256),
                             transforms.CenterCrop(224),
                             transforms.ToTensor(),
                             normalize,
                         ]))

test_loader = data.DataLoader(test_dataset,batch_size=50,shuffle=False,num_workers=NUM_WORKERS)

FileNotFoundError: [Errno 2] No such file or directory: 'data/dogscats/train'

In [13]:
# Each dataset item is an (image_tensor, class_index) pair, so the image has
# to be unpacked before it can be plotted.
sample_img, _ = test_dataset[0]
# The tensor is channels-first and normalised; imshow wants height x width x
# channels with values in [0, 1], so move the channel axis last and undo the
# normalisation with the same mean/std that was applied when loading.
sample_img = np.transpose(sample_img.numpy(), (1, 2, 0))
sample_img = sample_img * np.array([0.229, 0.224, 0.225]) + np.array([0.485, 0.456, 0.406])
plt.imshow(np.clip(sample_img, 0, 1))

NameError: name 'test_dataset' is not defined

In [None]:
# Definition here: https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
model = models.resnet50(pretrained=True)

# Don't train the pretrained layers
for param in model.parameters():
    param.requires_grad = False


# Create a new output layer with 2 outputs. New layers have
# requires_grad = True by default, so this is the only part that gets trained.
# The input width is taken from the layer being replaced.
model.fc = nn.Linear(model.fc.in_features, 2)
# Move the complete model (including the new layer) to the GPU once.
# Input and model need to be on the same device (either CPU or GPU).
model.cuda()

In [None]:
# Hand the optimizer only the parameters that are still trainable
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_params, lr=LR)

def train(epoch):
    """Run one pass over ``train_loader`` and update the trainable weights.

    Uses the module-level ``model``, ``optimizer`` and ``train_loader``.
    Batches are moved to the GPU before the forward pass.

    Args:
        epoch: Epoch number; only used in the progress message.
    """
    model.train()
    # The batch is called `inputs` so it does not shadow the `data` module alias
    for batch_idx, (inputs, target) in enumerate(train_loader):
        inputs, target = inputs.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(inputs)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()
        # Report every 50th batch. The former `batch_idx % 50*10 == 0` parsed
        # as `(batch_idx % 50) * 10 == 0`, i.e. exactly this condition.
        if batch_idx % 50 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(inputs), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            
def test():
    """Evaluate ``model`` on ``test_loader`` and print average loss and accuracy.

    Uses the module-level ``model`` and ``test_loader``. Batches are moved to
    the GPU before the forward pass. Nothing is returned; the summary is
    printed.
    """
    model.eval()
    test_loss = 0.0
    correct = 0
    # No gradients are needed for evaluation
    with torch.no_grad():
        for inputs, target in test_loader:
            inputs, target = inputs.cuda(), target.cuda()
            output = model(inputs)
            # Sum (not average) the per-sample losses so the division below
            # yields the mean over the whole dataset.
            test_loss += F.cross_entropy(output, target, reduction='sum').item()
            pred = output.max(1)[1]  # index of the highest-scoring class
            # .item() keeps `correct` a plain Python int for the summary line
            correct += pred.eq(target.view_as(pred)).sum().item()

    num_samples = len(test_loader.dataset)
    test_loss /= num_samples
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, num_samples,
        100. * correct / num_samples))

In [None]:
# Train for the configured epochs and evaluate after each one. The evaluation
# call belongs inside the loop, next to the message that announces it.
for epoch in range(1, 2):
    train(epoch)
    print("Running test...")
    test()

In [None]:
# Shape of a single image once a leading batch dimension is added. A fixed
# index is used because no loop variable `i` exists yet at this point.
test_dataset[0][0].view(-1, 3, 224, 224).shape

In [None]:

%matplotlib inline

fig = plt.figure(figsize=(10,10))


def show(img, gt, pred, position=None):
    """Draw one image into the 2x5 grid of the module-level ``fig``.

    Args:
        img: Channels-first image tensor on the CPU.
        gt: Ground-truth label, shown after "T:" in the title.
        pred: Predicted label, shown after "P:" in the title.
        position: 1-based slot in the 2x5 grid. When omitted, the slot is
            derived from the global loop index ``i`` (the original behaviour).
    """
    if position is None:
        position = i + 1
    npimg = img.numpy()
    fig.add_subplot(2, 5, position)
    plt.title("P: " + str(pred) + "  T: " + str(gt))
    # imshow expects height x width x channels
    plt.imshow(np.transpose(npimg, (1, 2, 0)), interpolation='nearest')


# Same per-channel statistics that were used to normalise the images, shaped
# to broadcast over a channels-first image.
NORM_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
NORM_STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

model.eval()  # make sure the model is in inference mode
with torch.no_grad():  # predictions only, no gradients needed
    for i in range(10):
        img, gt = test_dataset[i]
        # Add a batch dimension and move the image to the GPU, where the model lives
        output = model(img.view(-1, 3, 224, 224).cuda())
        pred = output.max(1)[1]
        # Undo the normalisation exactly so the picture has values in [0, 1]
        img = (img * NORM_STD + NORM_MEAN).clamp(0, 1)
        show(img.cpu(), gt, pred.item(), position=i + 1)