# Deep Learning Mini-project
### Analysis of ResNet-style Architectures on CIFAR-10 Using lesser than Five-million Parameters

<br>
Anudeep Tubati, Ashwin Guptha, Aditya Shyamsundar

<a href="https://colab.research.google.com/github/NeuralFlux/dl-mini-project/blob/main/resnet_method5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Useful Imports

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision.datasets import CIFAR10

import copy
import time


### Data augmentation and batching

In [None]:
sample_data = CIFAR10('./', download=True)


means = sample_data.data.mean(axis=(0, 1, 2)) / 255
stds = sample_data.data.std(axis=(0, 1, 2)) / 255

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:03<00:00, 44624347.41it/s]


Extracting ./cifar-10-python.tar.gz to ./


In [None]:
# Creating the transforms to augment the dataset
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.RandomRotation((-90,90)),
    transforms.RandomApply(  
        [
            transforms.ColorJitter(brightness=(0.5, 1.5), contrast=(0.5, 1.5),
                                   saturation=(0.5, 1.5), hue=(-0.5, 0.5)),
        ],
        p=0.5
    ),
    transforms.ToTensor(),
    transforms.Normalize(means, stds)
  ])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(means, stds)
])

In [None]:
train_data = CIFAR10('./', download=True, transform=transform_train)
test_data = CIFAR10('./', train=False, download=True,
                             transform=transform_test)
     

Files already downloaded and verified
Files already downloaded and verified


In [None]:
TRAIN_RATIO = 0.9
num_train_samples = int(len(train_data) * TRAIN_RATIO)
num_valid_samples = len(train_data) - num_train_samples
split = [num_train_samples, num_valid_samples]

train_data, valid_data = data.random_split(train_data, lengths=split)

In [None]:
valid_data = copy.deepcopy(valid_data)
valid_data.dataset.transforms = transform_train
     

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 256

train_iterator = data.DataLoader(train_data, BATCH_SIZE, shuffle=True)
valid_iterator = data.DataLoader(valid_data, BATCH_SIZE)
test_iterator = data.DataLoader(test_data, BATCH_SIZE)
     

In [None]:
dataloaders = {"train": train_iterator, "val": valid_iterator}
dataset_sizes = {"train": len(train_data.indices),
                 'val': len(valid_data.indices)}

### Modified ResNet

In [None]:
import torch
import torch.nn as nn


def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=dilation, groups=groups, bias=False, dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(BasicBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


In [None]:
class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        #og kernel_size=7,stride=2 change to 3, stride = 1
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=5, stride=1, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        #og stride=2 change to stride = 1 for MaXPoolLayer and all make_layers
        self.maxpool = nn.MaxPool2d(kernel_size=5, stride=1)
        self.layer1 = self._make_layer(block, 32, layers[0])
        self.layer2 = self._make_layer(block, 64, layers[1], stride=1)
        self.layer3 = self._make_layer(block, 128, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 256, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        # See note [TorchScript super()]
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

    def forward(self, x):
        return self._forward_impl(x)

In [None]:
def _resnet(block, layers, **kwargs):
    model = ResNet(block, layers, **kwargs)
    return model

### Training loop

In [None]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(DEVICE)
                labels = labels.to(DEVICE)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item()
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            if phase == 'train':  # take scheduler step on train acc
                scheduler.step(epoch_acc)

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
num_epochs = 25
learning_rate = 0.1

model = _resnet(BasicBlock, [2, 3, 4, 2]).to(DEVICE)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.001, momentum = 0.9) 
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       factor=0.1,
                                                       patience=10,
                                                       verbose=True) 

'\nscheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,milestones=[100, 150])\nscheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,\n                                                       factor=0.1,\n                                                       patience=5,\n                                                       verbose=True)\n                                                    \n#scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)\n'

In [None]:
train_model(model, criterion, optimizer, scheduler, num_epochs=num_epochs)

Epoch 1/25
----------
train Loss: 0.0080 Acc: 0.2855
val Loss: 0.0072 Acc: 0.3514

Epoch 2/25
----------
train Loss: 0.0060 Acc: 0.4430
val Loss: 0.0057 Acc: 0.4884

Epoch 3/25
----------
train Loss: 0.0050 Acc: 0.5430
val Loss: 0.0065 Acc: 0.4706

Epoch 4/25
----------
train Loss: 0.0045 Acc: 0.5976
val Loss: 0.0054 Acc: 0.5416

Epoch 5/25
----------
train Loss: 0.0042 Acc: 0.6256
val Loss: 0.0058 Acc: 0.4914

Epoch 6/25
----------
train Loss: 0.0038 Acc: 0.6626
val Loss: 0.0049 Acc: 0.5858

Epoch 7/25
----------
train Loss: 0.0035 Acc: 0.6876
val Loss: 0.0042 Acc: 0.6510

Epoch 8/25
----------
train Loss: 0.0032 Acc: 0.7159
val Loss: 0.0039 Acc: 0.6656

Epoch 9/25
----------
train Loss: 0.0031 Acc: 0.7300
val Loss: 0.0054 Acc: 0.5494

Epoch 10/25
----------
train Loss: 0.0030 Acc: 0.7397
val Loss: 0.0046 Acc: 0.6160

Epoch 11/25
----------
train Loss: 0.0029 Acc: 0.7443
val Loss: 0.0042 Acc: 0.6496

Epoch 12/25
----------
Epoch 00012: reducing learning rate of group 0 to 1.0000e-02.


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(5, 5), stride=(1, 1), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=5, stride=1, padding=0, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): BasicBlock(
      (conv1)

### Evaluation

In [None]:
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_iterator:
        images = images.to(DEVICE)
        labels = labels.to(DEVICE)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

    print('Accuracy on the test images: {} %'.format(100 * correct / total)) 

Accuracy on the test images: 90.05 %


### Number of Parameters

In [None]:
sum(p.numel() for p in model.parameters())

3732200