# Training AlexNet with tips and checks on how to train CNNs: Practical CNNs in PyTorch(1)

https://medium.com/@kushajreal/training-alexnet-with-tips-and-checks-on-how-to-train-cnns-practical-cnns-in-pytorch-1-61daa679c74a

In [None]:
# Root directory of the ImageNet data. The cells below append '/train',
# '/test' and '/ILSVRC2012_map_clsloc.txt' to this path.
imagesFolder = '/root/Visualiz_Zeiler/ImageNet/'

import copy
import time
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision.utils import make_grid
from torchvision.datasets import ImageFolder
import torchvision.transforms as transforms

%load_ext autoreload
%autoreload 2
%matplotlib inline

# Notebook-wide matplotlib defaults: figure size, no smoothing between
# pixels when showing images, and a gray colormap.
plt.rcParams['figure.figsize'] = (10.0, 8.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Create QtLogs/src (and QtLogs itself, where the progress log is written)
# and copy the files of the current directory into it.
# NOTE(review): presumably a snapshot of the sources next to the logs - confirm.
%mkdir -p QtLogs/src
%cp -f * QtLogs/src/

## Step 1 Create Data Pipeline

My data dirs explained. For my train dataset I use the validation dataset provided by ImageNet, i.e. 50000 images. All these images are stored under the train folder. One preprocessing step that I did is to rescale all these images so that their shorter side is 256. To do this you can either add a transforms.Resize(256) or run this command from the terminal, and all the images will be rescaled to 256 and stored on disk:
```bash
    find . -name "*.JPEG" | xargs -I {} convert {} -resize "256^>" {}
```

For my validation data, I use 10 images from each class from the train dataset of Imagenet. Below I give the script I used to do so. These images are also rescaled to 256 on the shorter side using the above command. 

### 1.1 Create the dataloaders

For the transforms we first need to crop our images, as the input size for our model is 224. For the training dataset I use RandomHorizontalFlip as a data augmentation technique. Another technique that is useful is the FiveCrop transform. But I did not use that here. The images are normalized using the standard values of mean and std computed over the entire ImageNet.

For the validation I did not use data augmentation.

In [None]:
# Training images live under <imagesFolder>/train, validation images under
# <imagesFolder>/test.
train_dir = imagesFolder + '/train'
val_dir = imagesFolder + '/test'

size = 224          # side length of the square crop fed to the network
batch_size = 128
num_workers = 20    # loader worker processes per DataLoader


def _make_transform(augment):
    """Build the preprocessing pipeline for one phase.

    Resize(256) -> CenterCrop(size) -> optional RandomHorizontalFlip ->
    ToTensor -> Normalize with the per-channel mean/std listed below.
    """
    steps = [
        transforms.Resize(256),
        transforms.CenterCrop(size),
    ]
    if augment:
        # The random flip is the only augmentation and is used for training only
        steps.append(transforms.RandomHorizontalFlip())
    steps.append(transforms.ToTensor())
    steps.append(
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    )
    return transforms.Compose(steps)


data_transforms = {
    'train': _make_transform(augment=True),
    'val': _make_transform(augment=False),
}

# One ImageFolder dataset per phase, each with its own transform
image_datasets = {
    phase_name: ImageFolder(root_dir, transform=data_transforms[phase_name])
    for phase_name, root_dir in (('train', train_dir), ('val', val_dir))
}

# Both loaders shuffle and share the same batch size and worker count
data_loader = {
    phase_name: torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
    )
    for phase_name, dataset in image_datasets.items()
}

### 1.2 Plot some images to test everything is working

In [None]:
# For imagenet we need to do some preprocessing for the labels.
# Every non-empty line of the mapping file is split at its first space:
# the first token becomes the key of labels_dict, the rest of the line is
# the label. labels_list keeps the labels in file order.
# (A different mapping file was used before: "../../../Data/LOC_synset_mapping.txt")
# NOTE(review): ImageFolder numbers classes by sorted folder name; indexing
# labels_list with those class indices is only right if the file is in the
# same order - confirm.
labels_dict = {}
labels_list = []
# The context manager closes the file even if a line fails to parse
with open(imagesFolder + '/ILSVRC2012_map_clsloc.txt', 'r') as f:
    for line in f:
        # Strip only the line terminator, so a last line without a trailing
        # newline keeps its final character
        line = line.rstrip('\r\n')
        if not line:
            # tolerate blank lines (e.g. at the end of the file)
            continue
        label_id, label = line.split(' ', maxsplit=1)
        labels_dict[label_id] = label
        labels_list.append(label)

# Show the first few entries of both structures as a sanity check
print('Labels dict:-')
for idx, (key, value) in enumerate(labels_dict.items()):
    print(key, value)
    if (idx > 3):
        break

print('\nLabels list (just 0 indexed instead of file names)')
labels_list[:4]

In [None]:
# Our images are normalized, so denormalize them and convert them to numpy
def imshow(img, title=None):
    """Display a normalized image tensor with matplotlib.

    The tensor is converted to numpy, its first axis (the three channels)
    is moved last, the normalization is undone with the per-channel
    mean/std below, and the result is clipped to [0, 1] before plotting.

    Args:
        img: tensor with the channel axis first (3 channels, 3 dimensions).
        title: optional figure title; nothing is set when it is None.
    """
    channel_mean = np.array([0.485, 0.456, 0.406])
    channel_std = np.array([0.229, 0.224, 0.225])

    # channels-first -> channels-last, the layout plt.imshow is given here
    pixels = img.numpy().transpose((1, 2, 0))
    pixels = np.clip(channel_std*pixels + channel_mean, 0, 1)

    plt.imshow(pixels)
    if title is not None:
        plt.title(title)
    # short pause after drawing
    plt.pause(0.001)

# Disabled sanity check (change the 0 to 1 to run it): collect training
# images whose class index is 0 and show the first ten in a 5-per-row grid.
if 0:
    selectedImages = []
    selectedLabels = []
    # Iterating the loader directly ends cleanly when the data runs out,
    # whereas repeated next() calls would raise StopIteration.
    for images, labels in data_loader['train']:
        for image, label in zip(images, labels):
            if label == 0:
                selectedImages.append(image)
                selectedLabels.append(label)
        # stop fetching batches once enough images were found
        if len(selectedImages) >= 10:
            break

    grid_img = make_grid(selectedImages[:10], nrow=5)
    plt.figure(figsize=(15, 8), dpi=150)
    imshow(grid_img)
    print ([labels_list[x] for x in selectedLabels[:10]])

## Step 2 Model Construction

1. Activation function:- ReLU is the default choice. But LeakyReLU is also good. Use LeakyReLU in GANs always.
2. Weight Initialization:- Use He initialization as default with ReLU. 
3. Preprocess data:- There are two choices: normalizing to [-1,1] or using the (x-mean)/std approach. The choice is yours.
4. Batch Normalization:- Apply before the non-linearity, i.e. ReLU. At test time, use the running averages of the mean and variance collected during training. PyTorch automatically maintains these for you. Note: In a recent ICLR 2019 paper, FixUp initialization was introduced. Using it, you don't need batchnorm layers in your model.
5. Pooling layers:- Apply after the non-linearity, i.e. ReLU. Different tasks may require different pooling methods; for classification, max-pool is the default.
6. Optimizer:- Adam is a good choice; SGD+momentum+nesterov is also good. fast.ai recently announced a new optimizer, AdamW. The choice of optimizer comes down to experimentation and the task at hand. Look for benchmarks of different optimizers to guide your choice.

In [None]:
class AlexNet(nn.Module):
    def __init__(self, num_classes=1000):
        super(AlexNet, self).__init__()
        self.conv_base = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=2, bias=False),
            nn.BatchNorm2d(96),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            
            nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2, bias=False),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            
            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            
            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            
            nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.fc_base = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256*6*6, 4096),
            nn.ReLU(inplace=True),
            
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            
            nn.Linear(4096, num_classes),
        )
        
    def forward(self, x):
        x = self.conv_base(x)
        x = x.view(x.size(0), 256*6*6)
        x = self.fc_base(x)
        return x
    
model = AlexNet()
model

### 2.1 Weight Initialization

In [None]:
# He (Kaiming) normal initialization of the weights of the layers listed
# below. Layers are addressed by their position inside the two Sequential
# containers; in future cases we would name the layers instead.
# Only `.weight` is touched - biases and other layers keep their defaults.
conv_list = [0, 4, 8, 10, 12]
fc_list = [1, 4, 6]
for container, positions in ((model.conv_base, conv_list), (model.fc_base, fc_list)):
    for i in positions:
        torch.nn.init.kaiming_normal_(container[i].weight)

### 2.2 Create optimizer, learning_rate scheduler, loss function

In [None]:
# We train everything on the GPU when CUDA is available, else on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# Cross entropy loss takes the logits directly, so the CNN applies no softmax
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters, with weight decay
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=0.0005,
)
# Scheduler in 'min' mode: it is meant to be stepped with a value that
# should decrease (a loss)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    verbose=True,
)

model = model.to(device)

In [None]:
# Reinitialize weights: draw fresh He (Kaiming) normal values for the
# weight tensors of the layers at these Sequential positions.
conv_list = [0, 4, 8, 10, 12]
fc_list = [1, 4, 6]
layers_to_reset = [model.conv_base[i] for i in conv_list]
layers_to_reset += [model.fc_base[i] for i in fc_list]
for layer in layers_to_reset:
    torch.nn.init.kaiming_normal_(layer.weight)

## Create the train loop

In [None]:
def printProgress(str):
    """Append `str` followed by a newline to QtLogs/progress.log.

    The file is opened in append mode on every call, so earlier lines are
    kept. The parameter name shadows the builtin `str`; it is left as is
    because it is part of the function's signature.
    """
    with open('QtLogs/progress.log', 'a') as log_file:
        log_file.writelines((str, '\n'))
        
# A simple train loop that you can use. You can separate different train and val functions also.
def train(model, data_loader, criterion, optimizer, scheduler, num_epochs=25):
    """Train `model` and run a validation pass after every epoch.

    Args:
        model: network to optimise; put into train() or eval() mode
            depending on the phase.
        data_loader: mapping with a 'train' and a 'val' entry. Each entry
            is iterated for (inputs, labels) batches and must expose a
            `dataset` attribute whose len() is the number of samples; the
            'train' entry must also support len() (batches per epoch).
        criterion: called as criterion(outputs, labels); returns the loss.
        optimizer: stepped after every training batch.
        scheduler: optional. When not None, scheduler.step() is called
            after each validation phase with the mean validation loss of
            that epoch.
        num_epochs: number of train+val passes.

    Returns:
        The `model` object that was passed in.

    Batches are moved to the module-level `device`. During the training
    phase, every `reportFreq` batches the mean loss/accuracy of that block
    is printed and appended to the progress log via printProgress(). In
    the log line the number after 'Epoch' is the running block counter;
    the real epoch is given in the trailing '(actual epoch: N)'.
    """
    reportFreq = 100

    since = time.time()

    blockNum = 0
    # Latest validation result, repeated on every progress-log line
    valLossInfo = 'val. loss 0, val. acc 0'
    for epoch in range(num_epochs):
        # You perform validation test after every epoch
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()
            else:
                model.eval()

            blockLoss = 0
            blockCorrects = 0
            running_loss = 0.0
            running_corrects = 0
            for idx, (inputs, labels) in enumerate(data_loader[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero accumulated gradients
                optimizer.zero_grad()

                # During train phase we want to remember history for grads
                # and during val we do not want history of grads
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                _, preds = torch.max(outputs, 1)

                # Convert to Python numbers once per batch and reuse them
                batch_loss = loss.item()
                batch_corrects = int(torch.sum(preds == labels))

                blockLoss += batch_loss
                blockCorrects += batch_corrects
                if idx % reportFreq == reportFreq - 1:
                    if is_train:
                        blockNum += 1
                        block_loss = blockLoss / reportFreq
                        # Uses the size of the current batch as the size of
                        # every batch in the block
                        block_acc = blockCorrects / reportFreq / inputs.size(0)
                        printProgress('Epoch %d: loss %.7g, acc %.6f, %s ' \
                                      '(actual epoch: %d)' % \
                                      (blockNum, block_loss, block_acc,
                                       valLossInfo, epoch + 1))
                        print('Epoch %d: %d/%d steps, loss %.7g, acc %.5f' % \
                              (epoch + 1, idx + 1, len(data_loader[phase]),
                               block_loss, block_acc))
                    blockLoss = 0
                    blockCorrects = 0

                # criterion returned a per-batch value; weight it by the
                # batch size before averaging over the dataset
                running_loss += batch_loss * inputs.size(0)
                running_corrects += batch_corrects

            num_samples = len(data_loader[phase].dataset)
            epoch_loss = running_loss / num_samples
            epoch_acc = running_corrects / num_samples

            print('Epoch {} {} Loss: {:.5f} Acc: {:.5f}'.format(
                    epoch + 1, phase, epoch_loss, epoch_acc))

            if phase == 'val':
                valLossInfo = 'val. loss %.7g, val. acc %.6f' % (epoch_loss, epoch_acc)
                if scheduler is not None:
                    # Step on the mean validation loss of the whole epoch,
                    # not on the loss of whichever batch happened to be last
                    scheduler.step(epoch_loss)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

    return model


# Actual training

In [None]:
class MyData(list):
    """A list of pre-fetched batches that can also carry a `dataset` attribute.

    train() reads len(data_loader[phase].dataset); a plain list cannot be
    given extra attributes, a subclass can.
    """


# Set to True to train on a fixed in-memory subset (at most
# `num_cached_batches` batches per phase) instead of the full loaders.
# The subset is only built when requested, so the default run no longer
# holds hundreds of batches in memory that nothing reads.
use_cached_subset = False
num_cached_batches = 201

if use_cached_subset:
    data = {'train': MyData(), 'val': MyData()}
    for phase in ['train', 'val']:
        # zip() with a range stops after num_cached_batches batches, or
        # earlier if the loader runs out, without raising StopIteration
        for _, batch in zip(range(num_cached_batches), data_loader[phase]):
            data[phase].append(batch)
        # Count the samples actually cached so that the epoch averages in
        # train() are exact even when a batch is not full
        num_cached_samples = sum(len(inputs) for inputs, _ in data[phase])
        data[phase].dataset = range(num_cached_samples)

optimizer = optim.Adam(model.parameters(), lr=5e-5, weight_decay=0.0005)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True,
#                                                  factor=0.2, threshold=1e-6, patience=30)
scheduler = None
# 200 epochs on the full loaders, 2000 on the cached subset
loaders = data if use_cached_subset else data_loader
train(model, loaders, criterion, optimizer, scheduler,
      num_epochs=2000 if use_cached_subset else 200)

In [None]:
# Continue training the same model for 10 more epochs with a newly created
# Adam optimizer at lr=1e-4. `scheduler` is passed with whatever value it
# currently holds.
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.0005)
train(model, data_loader, criterion, optimizer, scheduler, num_epochs=10)

In [None]:
# Next stage: 10 more epochs with a newly created Adam optimizer at lr=1e-5.
optimizer = optim.Adam(model.parameters(), lr=1e-5, weight_decay=0.0005)
train(model, data_loader, criterion, optimizer, scheduler, num_epochs=10)

In [None]:
# Next stage: 10 more epochs with a newly created Adam optimizer at lr=1e-6.
optimizer = optim.Adam(model.parameters(), lr=1e-6, weight_decay=0.0005)
train(model, data_loader, criterion, optimizer, scheduler, num_epochs=10)

In [None]:
# Last stage: 20 more epochs with a newly created Adam optimizer at lr=1e-7.
optimizer = optim.Adam(model.parameters(), lr=1e-7, weight_decay=0.0005)
train(model, data_loader, criterion, optimizer, scheduler, num_epochs=20)