# Image Classification using advanced CNNs
- We are going to use the CIFAR-10 [dataset](https://www.cs.toronto.edu/~kriz/cifar.html) of 32x32 color images.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.backends import cudnn
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import time
import cupy as cp
import numpy as np

import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 14})

# Hyperparameters shared by the experiments in this notebook.
model_args = {
    # mini-batch size used in Stochastic Gradient Descent (SGD) optimization of the network
    'batch_size': 64,
    # learning rate: how fast the optimizer descends
    'lr': .05,
    # SGD momentum: a moving average of gradients (it helps to keep direction)
    'momentum': .5,
    # weight decay coefficient
    'weight_decay': 5.e-4,
    # number of epochs, i.e. how many times we go through the full dataset
    'epochs': 30,
}

# report whether a CUDA device is visible to PyTorch
print(torch.cuda.is_available())

### Download CIFAR-10 dataset and define models
As with MNIST, we use torchvision to download the data.

In [None]:
!rm -r ./data
# normalize dataset
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
#mean = [.5, .5, .5]
#std = [.5, .5, .5]
transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize(mean, std)])

cifar10_train = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)

tensored_cifar10 = []

for pic in cifar10_train:
     picture = pic[0].to("cuda:0")
     true_val = torch.tensor(pic[1], device= "cuda:0")
     pic_tensored = (picture, true_val)
     tensored_cifar10.append(pic_tensored)

torch.save(tensored_cifar10, 'tensored_cifar10.pth')

# we divide this data into training and validation subsets
train_subset, validation_subset = torch.utils.data.random_split(tensored_cifar10, [40000, 10000])
torch.save(train_subset, 'train_subset.pt')
torch.save(validation_subset, 'validation_subset.pt')


train_subset = torch.load('train_subset.pt', map_location=torch.device('cuda:0'))

validation_subset = torch.load('validation_subset.pt', map_location=torch.device('cuda:0'))
test_subset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
test_cifar10 = []
for pic in test_subset:
     picture = pic[0].to("cuda:0")
     true_val = torch.tensor(pic[1], device= "cuda:0")
     pic_tensored = (picture, true_val)
     test_cifar10.append(pic_tensored)
torch.save(test_cifar10, 'test_cifar10.pth')

test_cifar10 = torch.load('test_cifar10.pth', map_location=torch.device('cuda:0'))
# subsample to speedup training (colab has notebook lifetime limit)
train_subset = torch.utils.data.Subset(train_subset, range(20000))
validation_subset = torch.utils.data.Subset(validation_subset, range(5000))
test_cifar10 = torch.utils.data.Subset(test_cifar10, range(5000))


In [None]:
# Reload the datasets that the previous cell wrote to disk, so that the
# download/preprocessing step can be skipped on later runs.
# Map the stored tensors to the GPU when one is available and to the CPU
# otherwise, instead of failing on a machine without CUDA.
data_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

train_subset = torch.load('train_subset.pt', map_location=data_device)
validation_subset = torch.load('validation_subset.pt', map_location=data_device)
test_cifar10 = torch.load('test_cifar10.pth', map_location=data_device)

# subsample to speedup training (colab has notebook lifetime limit)
train_subset = torch.utils.data.Subset(train_subset, range(20000))
validation_subset = torch.utils.data.Subset(validation_subset, range(5000))
test_subset = torch.utils.data.Subset(test_cifar10, range(5000))

In [None]:
# define dataloaders
print(type(train_subset))

# Batches live on the GPU when one is available, on the CPU otherwise.
loader_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def _make_batches(dataset, batch_size, device):
    '''
    Group `dataset` into a list of [images, labels] mini-batches on `device`.

    Every batch holds exactly `batch_size` samples. A trailing remainder that
    does not fill a whole batch is dropped, because the evaluation code in
    this notebook assumes every batch has `batch_size` samples.

    Args:
        dataset: iterable of (image_tensor, label) pairs.
        batch_size: number of samples per batch.
        device: device on which the stacked tensors are stored.

    Returns:
        List of [stacked_images, label_tensor] pairs.
    '''
    batches = []
    images, labels = [], []
    for image, label in dataset:
        # Append first, then flush: the original loop skipped the sample that
        # completed each batch, producing batches of batch_size - 1 samples.
        images.append(image)
        labels.append(label)
        if len(images) == batch_size:
            batches.append([torch.stack(images, dim=0).to(device),
                            torch.tensor(labels, device=device)])
            images, labels = [], []
    return batches


train_data = _make_batches(train_subset, model_args['batch_size'], loader_device)
# `train_loader` used to be an unused empty list; it is kept as an alias of
# the training batches so the name stays defined.
train_loader = train_data
print(len(train_data))
print(train_data[0][0].shape, train_data[0][1].shape)

validation_loader = _make_batches(validation_subset, model_args['batch_size'], loader_device)
print(len(validation_loader))
print(validation_loader[0][0].shape, validation_loader[0][1].shape)

test_loader = _make_batches(test_subset, model_args['batch_size'], loader_device)
print(len(test_loader))
print(test_loader[0][0].shape, test_loader[0][1].shape)


classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')

In [None]:
class SimpleCNN(nn.Module):
    '''
    Simple CNN model: two convolution + ReLU + max-pooling stages followed by
    three fully connected layers that produce 10 output scores per sample.

    The first fully connected layer expects 16 * 5 * 5 features per sample,
    which is what the two stages produce for 3-channel 32x32 inputs.
    '''
    def __init__(self):
        super().__init__()
        # convolutional stages (5x5 kernels, no padding); one pooling layer is reused
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # fully connected head
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=80)
        self.fc3 = nn.Linear(in_features=80, out_features=10)

    def forward(self, x):
        '''Return the 10 class scores for every sample of the batch `x`.'''
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # flatten the feature maps into one vector per sample
        flat = x.view(-1, 16 * 5 * 5)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

class VGG(nn.Module):
    '''
    VGG model: a convolutional feature extractor followed by a three-layer
    fully connected classifier that outputs 10 class scores.

    Args:
        features: module applied to the input batch first (for example the
            result of make_layers). Its output is flattened per sample and
            must contain exactly 512 values, the input width of the classifier.
    '''
    def __init__(self, features):
        super().__init__()
        self.features = features
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(512, 512),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(512, 512),
            nn.ReLU(True),
            nn.Linear(512, 10),
        )
        # Initialize the weights of every convolution with a zero-mean normal
        # distribution whose standard deviation is sqrt(2 / n), where
        # n = kernel_height * kernel_width * out_channels.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                nn.init.normal_(m.weight, mean=0., std=(2. / n) ** .5)
                # convolutions created with bias=False have no bias tensor
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, x):
        '''Return the 10 class scores for every sample of the batch `x`.'''
        x = self.features(x)
        # flatten everything except the batch dimension
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

def make_layers(cfg, batch_norm=False):
    '''
    Build the convolutional part of a network from a configuration list.

    Args:
        cfg: sequence whose entries are either 'M' (adds a 2x2 max-pooling
            layer with stride 2) or an integer (adds a 3x3 convolution with
            padding 1 and that many output channels, followed by ReLU).
        batch_norm: when True, a BatchNorm2d layer is inserted between every
            convolution and its ReLU.

    Returns:
        nn.Sequential holding the layers in order; the first convolution
        takes 3 input channels.
    '''
    modules = []
    channels = 3
    for spec in cfg:
        if spec == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue
        modules.append(nn.Conv2d(channels, spec, kernel_size=3, padding=1))
        if batch_norm:
            modules.append(nn.BatchNorm2d(spec))
        modules.append(nn.ReLU(inplace=True))
        # the next convolution consumes what this one produced
        channels = spec
    return nn.Sequential(*modules)

# Layer configurations consumed by make_layers: an integer adds a 3x3
# convolution with that many output channels, 'M' adds a 2x2 max-pooling layer.
# The number in each key equals the count of convolutions listed plus the
# three Linear layers of the VGG classifier.
vgg_cfg = {
    'vgg11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'vgg13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'vgg16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'vgg19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
    'vgg22': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 256, 'M', 512, 512, 512, 512, 512, 'M', 512, 512, 512, 512, 512, 'M']
}

In [None]:
def train(model, device, train_data, optimizer, criterion, epoch_number):
    '''
    Run one training epoch and return the average batch loss.

    Args:
        model: network to optimize; it is switched to training mode.
        device: device the batches and the running loss are placed on.
        train_data: sized iterable of (data, target) batches.
        optimizer: optimizer that updates the model parameters.
        criterion: loss function called as criterion(output, target).
        epoch_number: epoch index, used only in the printed log line.

    Returns:
        0-dim tensor on `device` holding the sum of the batch losses divided
        by the number of batches.
    '''
    model.train()
    running_loss = torch.zeros((), device=device)
    for inputs, labels in train_data:
        # send data tensors to GPU (or CPU)
        inputs = inputs.to(device)
        labels = labels.to(device)
        # clear the gradients left over from the previous batch
        optimizer.zero_grad()
        # forward pass followed by the loss computation
        batch_loss = criterion(model(inputs), labels)
        # backpropagate the loss, then update the weights
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    print('Train Epoch: {}'.format(epoch_number))
    running_loss /= torch.tensor(len(train_data), device=device)
    print('\tTrain set: Average loss: {:.4f}'.format(running_loss))
    return running_loss

def test(model, device, test_data, criterion, message=None):
    model.eval()
    test_loss = torch.tensor(0., device=device)
    correct = torch.tensor(0., device=device)
    # this is just inference, we don't need to calculate gradients
    with torch.no_grad():
        for data, target in test_data:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # calculate and sum up batch loss
            test_loss += criterion(output, target).cuda()
            # get the index of class with the max probability
            prediction = output.argmax(dim=1)
            #_, predicted = torch.max(outputs.data, axis=1)
            # item() returns value of the given tensor
            correct += prediction.eq(target).sum().item()
    test_loss /= len(test_data)
    accuracy = correct / len(test_data)
    if message is not None:
        print('\t{}: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
            message, test_loss, correct, len(test_data)*model_args['batch_size'], 100.*accuracy/model_args['batch_size']))
    return test_loss.cpu(), accuracy.cpu()

def run_training(model, criterion, optimizer, no_epochs, device=None):
    '''
    Train `model` for `no_epochs` epochs, evaluating after every epoch.

    The batches are read from the module-level `train_data`,
    `validation_loader` and `test_loader` lists.

    Args:
        model: network to train.
        criterion: loss function.
        optimizer: optimizer bound to the model parameters.
        no_epochs: number of epochs (an int or a 0-dim integer tensor).
        device: device to run on. Optional: when omitted, the device of the
            model's first parameter is used, so callers that pass only four
            arguments keep working.

    Returns:
        Tuple (train_loss, validation_loss, best_accuracy): two 1-D tensors
        with one entry per epoch, and the test accuracy (as reported by
        `test`) of the epoch with the highest validation accuracy.

    Raises:
        ValueError: if `no_epochs` is smaller than 1.
    '''
    if device is None:
        device = next(model.parameters()).device
    no_epochs = int(no_epochs)
    if no_epochs < 1:
        raise ValueError('no_epochs must be at least 1')

    train_losses, validation_losses = [], []
    validation_accuracies, test_accuracies = [], []
    for epoch_number in range(1, no_epochs + 1):
        epoch_loss = train(model, device, train_data, optimizer, criterion, epoch_number)
        train_losses.append(float(epoch_loss))

        val_loss, val_acc = test(model, device, validation_loader,
                                 criterion, 'Validation set')
        validation_losses.append(float(val_loss))
        validation_accuracies.append(float(val_acc))

        # we also collect test accuracies for every epoch
        _, test_acc = test(model, device, test_loader, criterion)
        test_accuracies.append(float(test_acc))

    train_loss = torch.tensor(train_losses, device=device)
    validation_loss = torch.tensor(validation_losses, device=device)
    validation_accuracy = torch.tensor(validation_accuracies, device=device)
    test_accuracy = torch.tensor(test_accuracies, device=device)

    # and select test accuracy for the best epoch (with the highest validation accuracy)
    index = torch.argmax(validation_accuracy)
    best_accuracy = test_accuracy[index]
    return train_loss, validation_loss, best_accuracy

def plot_loss(train_loss, validation_loss, title):
    '''
    Plot the per-epoch training and validation losses on one figure.

    Args:
        train_loss: per-epoch training losses; a sequence of numbers or a
            1-D tensor on any device.
        validation_loss: per-epoch validation losses, same accepted types.
        title: title of the figure.
    '''
    def _as_floats(values):
        # matplotlib cannot read tensors that live on the GPU, so tensors are
        # copied to the host before plotting
        if isinstance(values, torch.Tensor):
            return values.detach().cpu().tolist()
        return [float(v) for v in values]

    train_curve = _as_floats(train_loss)
    validation_curve = _as_floats(validation_loss)

    plt.grid(True)
    plt.xlabel("subsequent epochs")
    plt.ylabel('average loss')
    # epochs are numbered from 1
    plt.plot(range(1, len(train_curve)+1), train_curve, 'o-', label='training')
    plt.plot(range(1, len(validation_curve)+1), validation_curve, 'o-', label='validation')
    plt.legend()
    plt.title(title)
    plt.show()

## Tasks to do:

1. Check turning off the data normalization -- how this impacts network training.
2. What happens if we put *weight_decay* = 0. and increase *momentum* to .9 for VGG11 model -- why is that? (hint: observe interplay between *learning rate* and *momentum*)
3. Try to explain why the deeper VGG16 network trains longer than VGG11.
4. Compare with performance for deeper VGGs: 19 or 22 -- do we observe saturation in accuracy or even *degradation* problem?
5. Does ResNet18 (with similar depth) perform better? What about ResNet34? How does the training time compare in this case?
6. Does transfer learning speed up training?

1. Changing data normalization | weight decay

In [None]:
# let cuDNN benchmark and pick convolution algorithms
# (`cudnn` is `torch.backends.cudnn`, so one assignment is enough)
cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SimpleCNN().to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(model.parameters(),
                      lr=model_args['lr'],
                      momentum=model_args['momentum'],
                      weight_decay=0.)
no_epochs = model_args['epochs']

# run_training takes the device as its fifth argument
train_loss, val_loss, best_accuracy = run_training(model, criterion, optimizer, no_epochs, device)

# the reported accuracy is the number of correct predictions per batch,
# so it is divided by the batch size to obtain a percentage
print('\nTest accuracy for best epoch: {:.0f}%\n'.format(100.*best_accuracy/model_args['batch_size']))
# the loss curves are tensors on `device`; matplotlib needs them on the CPU
plot_loss(train_loss.cpu(), val_loss.cpu(), 'SimpleCNN model')

Without normalization, the result is worse by 4%.

2. VGG test

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = VGG(make_layers(vgg_cfg['vgg11'])).to(device)
criterion = nn.CrossEntropyLoss()
# task 2: no weight decay and momentum raised to .9
optimizer = optim.SGD(model.parameters(),
                      lr=model_args['lr'],
                      momentum=.9,
                      weight_decay=0.)
no_epochs = model_args['epochs']

# run_training takes the device as its fifth argument
train_loss_11, val_loss_11, best_accuracy = run_training(model, criterion, optimizer, no_epochs, device)

# the reported accuracy is the number of correct predictions per batch,
# so it is divided by the batch size to obtain a percentage
print('\nTest accuracy for best epoch: {:.0f}%\n'.format(100.*best_accuracy/model_args['batch_size']))
# the loss curves are tensors on `device`; matplotlib needs them on the CPU
plot_loss(train_loss_11.cpu(), val_loss_11.cpu(), 'VGG11 model')

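The loss diverges instead of converging because the effective learning rate becomes too high: with momentum $\mu$, consecutive gradients accumulate, so the steady-state step size is roughly $\mathrm{lr}/(1-\mu)$, i.e. $0.05/(1-0.9)=0.5$ here instead of $0.05/(1-0.5)=0.1$.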

3. A deeper model usually needs more time to train because more weights need tuning and the gradient signal weakens as it is propagated back through more layers.

4. VGG19

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = VGG(make_layers(vgg_cfg['vgg19'])).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(),
                      lr=model_args['lr'],
                      momentum=model_args['momentum'],
                      weight_decay=1.e-3)

# the deeper network is trained for twice as many epochs
no_epochs = model_args['epochs']*2

# run_training takes the device as its fifth argument
train_loss_19, val_loss_19, best_accuracy = run_training(model, criterion, optimizer, no_epochs, device)

# the reported accuracy is the number of correct predictions per batch,
# so it is divided by the batch size to obtain a percentage
print('\nTest accuracy for best epoch: {:.0f}%\n'.format(100.*best_accuracy/model_args['batch_size']))
# the loss curves are tensors on `device`; matplotlib needs them on the CPU
plot_loss(train_loss_19.cpu(), val_loss_19.cpu(), 'VGG19 model')

In [None]:
start = time.time()
# let cuDNN benchmark and pick convolution algorithms
# (`cudnn` is `torch.backends.cudnn`, so one assignment is enough)
cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = VGG(make_layers(vgg_cfg['vgg22'])).to(device)
criterion = nn.CrossEntropyLoss().to(device)
# the learning rate is reduced tenfold for this deepest VGG variant
optimizer = optim.SGD(model.parameters(),
                      lr=model_args['lr'] * 0.1,
                      momentum=model_args['momentum'],
                      weight_decay=1.e-3)

# a plain int is all the epoch loop needs; there is no reason to keep the
# epoch count in a tensor on the device
no_epochs = model_args['epochs']*2

train_loss_22, val_loss_22, best_accuracy = run_training(model, criterion, optimizer, no_epochs, device)

# the reported accuracy is the number of correct predictions per batch,
# so it is divided by the batch size to obtain a percentage
print('\nTest accuracy for best epoch: {:.0f}%\n'.format(100.*best_accuracy/model_args['batch_size']))
end = time.time()

print("Time needed: {:.0f}s".format(end - start))
plot_loss(train_loss_22.cpu(), val_loss_22.cpu(), 'VGG22 model')

5. ResNet

In [None]:
start = time.time()
# let cuDNN benchmark and pick convolution algorithms
# (`cudnn` is `torch.backends.cudnn`, so one assignment is enough)
cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): `ResNet34` is not defined anywhere in the visible part of this
# notebook; presumably it is defined in a cell that is not shown. If it is
# not, `models.resnet34(num_classes=10)` from torchvision is a candidate
# replacement — confirm the intended architecture before changing it.
model = ResNet34().to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(model.parameters(),
                      lr=model_args['lr'],
                      momentum=model_args['momentum'],
                      weight_decay=1.e-3)

# a plain int is all the epoch loop needs; there is no reason to keep the
# epoch count in a tensor on the device
no_epochs = model_args['epochs']

train_loss_resnet34, validation_loss_resnet34, best_accuracy_resnet34 = run_training(model, criterion, optimizer, no_epochs, device)

# the reported accuracy is the number of correct predictions per batch,
# so it is divided by the batch size to obtain a percentage
print('\nTest accuracy for best epoch: {:.0f}%\n'.format(100.*best_accuracy_resnet34/model_args['batch_size']))
end = time.time()

print("Time needed: {:.0f}s".format(end - start))
plot_loss(train_loss_resnet34.cpu(), validation_loss_resnet34.cpu(), 'ResNet34 model')

ResNet with changes in its structure

In [None]:
start = time.time()
# let cuDNN benchmark and pick convolution algorithms
# (`cudnn` is `torch.backends.cudnn`, so one assignment is enough)
cudnn.benchmark = True
# torchvision's model zoo is imported in this notebook as `models`;
# the name `resnet` used here before is never imported
model = models.resnet34(pretrained=True)
print(model)
# Replace the input layers: a 3x3, stride-1 convolution with a fresh batch
# norm, and no ReLU / max-pooling at the input.
model.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
model.bn1 = nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
model.relu = nn.Identity()  # only input layers
model.maxpool = nn.Identity()  # only in input layers
# new classification layer with 10 outputs
model.fc = nn.Linear(512, 10)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(model.parameters(),
                      lr=model_args['lr'],
                      momentum=model_args['momentum'],
                      weight_decay=1.e-3)

# a plain int is all the epoch loop needs; there is no reason to keep the
# epoch count in a tensor on the device
no_epochs = model_args['epochs']

train_loss_resnet43_pre, validation_loss_resnet43_pre, best_accuracy_resnet43_pre = run_training(model, criterion, optimizer, no_epochs, device)

# the reported accuracy is the number of correct predictions per batch,
# so it is divided by the batch size to obtain a percentage
print('\nTest accuracy for best epoch: {:.0f}%\n'.format(100.*best_accuracy_resnet43_pre/model_args['batch_size']))
end = time.time()

print("Time needed: {:.0f}s".format(end - start))
# the model trained in this cell is a ResNet34, so the title says so
plot_loss(train_loss_resnet43_pre.cpu(), validation_loss_resnet43_pre.cpu(), 'ResNet34 pretrained model')