In [None]:
import random

import numpy as np
import torch

# Seed every random number generator the notebook relies on. Seeding only
# Python's `random` leaves torch-driven randomness (weight initialisation,
# DataLoader shuffling) and any NumPy randomness different on every run.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime as dt

import torch
from torch import optim, nn
from torch.utils.data import DataLoader, TensorDataset, Dataset, random_split
from torchvision.utils import make_grid
from torchvision import transforms as T
from torchvision import models, datasets
from random import randint

from tqdm import tqdm
import os

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")
device

In [None]:

# Training split without any transform, used only to measure channel statistics.
train_data = datasets.CIFAR100('./', train=True, download=True)

# Stack all images along the row axis into one 1600000 x 32 x 3 array.
x = np.concatenate(
    [np.asarray(train_data[idx][0]) for idx in range(len(train_data))]
)

# Per-channel mean and standard deviation over axes (0, 1), rescaled from the
# 0-255 pixel range to 0-1 and stored as plain Python lists.
mean = (np.mean(x, axis=(0, 1)) / 255).tolist()
std = (np.std(x, axis=(0, 1)) / 255).tolist()

In [None]:
def imshow(img):
    """Display an image tensor with matplotlib.

    Axis 0 of ``img`` is moved to the end before plotting (presumably a
    channels-first image being converted to channels-last for ``plt.imshow``).
    """
    channels_last = np.transpose(img.numpy(), (1, 2, 0))
    plt.imshow(channels_last)
    plt.show()
    
def show_batch(dataloader):
    """Plot the first batch yielded by ``dataloader`` as one image grid."""
    images, _ = next(iter(dataloader))
    # torchvision.utils.make_grid tiles the batch into a single image tensor.
    imshow(make_grid(images))
    
def show_image(dataloader):
    """Show one randomly chosen image from the first batch and print its label and shape."""
    images, labels = next(iter(dataloader))
    pick = randint(0, len(images) - 1)  # randint is inclusive on both ends
    chosen = images[pick]
    imshow(chosen)
    print(f'Label: {labels[pick]}, Shape: {chosen.shape}')

In [None]:
# Image pre-processing pipelines.
# Both splits are normalised with the per-channel `mean` / `std` computed from
# the training images. (For an ImageNet-pretrained backbone the usual statistics
# would be mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225].)

# Training: random crop / flip / rotation, then tensor conversion and normalisation.
train_transform = T.Compose(
    [
        T.RandomCrop(32, padding=4),
        T.RandomHorizontalFlip(),
        T.RandomRotation(15),
        T.ToTensor(),
        T.Normalize(mean, std),
    ]
)

# Evaluation: no augmentation, only tensor conversion and normalisation.
test_transform = T.Compose([T.ToTensor(), T.Normalize(mean, std)])



In [None]:
batch_size = 128

# Training split: augmented by train_transform and reshuffled by the loader.
trainset = datasets.CIFAR100("./", train=True, download=True, transform=train_transform)
train_loader = DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)

# train=False split: no shuffling, and twice the batch size.
testset = datasets.CIFAR100("./", train=False, download=True, transform=test_transform)
test_loader = DataLoader(
    testset,
    batch_size=batch_size * 2,
    num_workers=2,
    pin_memory=True,
)

In [None]:
# Sanity check: display one random image from the first training batch
# (after train_transform, i.e. augmented and normalised).
show_image(train_loader)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions plus an additive shortcut.

    ``expansion`` is the ratio between the block's output channels and
    ``planes``; it is 1 for this block.
    """

    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        # Main path: only the first convolution applies ``stride``.
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Shortcut path: an empty Sequential (identity) when the input already
        # has the output's shape, otherwise a strided 1x1 conv + batch norm.
        if stride == 1 and in_planes == out_planes:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        """Return relu(main_path(x) + shortcut(x))."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(hidden))
        out += self.shortcut(x)
        return F.relu(out)


class Bottleneck(nn.Module):
    """Residual block made of 1x1 -> 3x3 -> 1x1 convolutions plus an additive shortcut.

    The block outputs ``expansion * planes`` channels (``expansion`` is 4).
    """

    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        # Main path: the 3x3 convolution in the middle is the one that applies ``stride``.
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        # Shortcut path: an empty Sequential (identity) when the input already
        # has the output's shape, otherwise a strided 1x1 conv + batch norm.
        if stride == 1 and in_planes == out_planes:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        """Return relu(main_path(x) + shortcut(x))."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = F.relu(self.bn2(self.conv2(hidden)))
        out = self.bn3(self.conv3(hidden))
        out += self.shortcut(x)
        return F.relu(out)


class ResNet(nn.Module):
    """ResNet feature extractor followed by a linear classifier.

    The stem is a single stride-1 3x3 convolution, followed by four stages of
    residual blocks with 64, 128, 256 and 512 base channels; stages 2-4 start
    with a stride-2 block.

    Args:
        block: residual block class. It must expose an integer ``expansion``
            attribute and accept ``(in_planes, planes, stride)``.
        num_blocks: sequence of four ints, the number of blocks in each stage.
        num_classes: size of the output layer.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        # Running channel count; _make_layer updates it after every block.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage: the first block uses ``stride``, the remaining ones stride 1."""
        layers = []
        for block_stride in [stride] + [1]*(num_blocks-1):
            layers.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the class scores (one row of ``num_classes`` values per input image)."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # Global average pooling. For 32x32 inputs the final feature map is
        # 4x4, so this equals the former fixed `avg_pool2d(out, 4)`; for any
        # other input size it still yields exactly one value per channel, so
        # the linear layer always receives 512 * expansion features.
        out = F.adaptive_avg_pool2d(out, 1)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def ResNet18(num_classes=10):
    """Build a ResNet from BasicBlock with [2, 2, 2, 2] blocks per stage."""
    stage_depths = [2, 2, 2, 2]
    return ResNet(BasicBlock, stage_depths, num_classes=num_classes)

def ResNet50(num_classes=10):
    """Build a ResNet from Bottleneck with [3, 4, 6, 3] blocks per stage."""
    stage_depths = [3, 4, 6, 3]
    return ResNet(Bottleneck, stage_depths, num_classes=num_classes)

def ResNet101(num_classes=10):
    """Build a ResNet from Bottleneck with [3, 4, 23, 3] blocks per stage."""
    stage_depths = [3, 4, 23, 3]
    return ResNet(Bottleneck, stage_depths, num_classes=num_classes)


def test():
    """Smoke test: run one random 1 x 3 x 32 x 32 input through a 100-class ResNet18 and print the output size."""
    net = ResNet18(100)
    scores = net(torch.randn(1, 3, 32, 32))
    print(scores.size())

In [None]:
# Number of samples in the train=False split behind test_loader.
len(test_loader.dataset)

In [None]:
# Loader registry used by the training and evaluation code.
# NOTE(review): 'val' and 'test' point at the same loader, so a checkpoint
# selected on 'val' accuracy is selected on the data later reported as 'test'.
dataloaders = dict(train=train_loader, val=test_loader, test=test_loader)

# Sample count per split, used to turn running sums into per-sample averages.
dataset_sizes = {split: len(loader.dataset) for split, loader in dataloaders.items()}

In [None]:
def loss_fn_kd(outputs, labels, teacher_outputs, **kwargs):
    """
    Knowledge-distillation (KD) loss: a weighted sum of a soft-target term and
    a hard-label term, both computed with F.cross_entropy.

    outputs: student scores, passed to F.cross_entropy as its input.
    labels: targets of the hard-label term, passed to F.cross_entropy as is.
    teacher_outputs: targets of the soft-target term; cast to float64 and used
        as the cross-entropy target of ``outputs / temperature``.

    Required keyword arguments ("hyperparameters"):
        alpha: weight of the soft-target term; the hard-label term gets 1 - alpha.
        temperature: divisor applied to ``outputs`` in the soft-target term,
            which is additionally scaled by temperature ** 2.

    Raises KeyError when ``alpha`` or ``temperature`` is not supplied.
    """
    soft_targets = teacher_outputs.double()
    alpha = kwargs['alpha']
    temperature = kwargs['temperature']

    soft_term = F.cross_entropy(outputs / temperature, soft_targets)
    hard_term = F.cross_entropy(outputs, labels)
    return soft_term * (alpha * temperature * temperature) + hard_term * (1. - alpha)

In [None]:
# Teacher scores saved by an earlier run. `map_location` lets a file written on
# a GPU machine load on a CPU-only one.
# NOTE: torch.load unpickles the file; only load files from a trusted source.
teacher_scores = torch.load('./resnet_5_layers_ae.pt', map_location=device)
# as_tensor accepts a tensor, array or nested list without the extra copy (and
# UserWarning) that torch.tensor(<tensor>) produces; detach() drops any
# autograd history, as torch.tensor() did.
teacher_scores = torch.as_tensor(teacher_scores, device=device).detach()
# Softmax over dim 1 turns every row of scores into a probability distribution.
soft_probabilities = F.softmax(teacher_scores, dim=1)
print(soft_probabilities.shape)
print(soft_probabilities.type())

# print(soft_probabilities[0])

In [None]:
# Inspect row 0 of the soft-target table.
print(soft_probabilities[0])

In [None]:
# Smoke-test the loss on random data shaped like a real batch: float scores,
# integer class indices and a teacher distribution that sums to 1 per row.
# Labels must be integer class indices: a float tensor with the same shape as
# the scores would be silently interpreted by F.cross_entropy as per-class
# probabilities instead.
loss_fn_kd(
    torch.randn(128, 200),
    torch.randint(0, 200, (128,)),
    F.softmax(torch.randn(128, 200), dim=1),
    alpha=0.65,
    temperature=1,
)

In [None]:
import time
import copy

def train_model(model, criterion_kd, optimizer, scheduler, alpha, num_epochs=25, temp=1,
                checkpoint_path='./resnet_18_ckpt_distilled_autoenc.pth'):
    """Train ``model`` with a distillation criterion and return it with its best weights loaded.

    Every epoch runs a 'train' pass followed by a 'val' pass. The function
    reads ``dataloaders``, ``dataset_sizes``, ``soft_probabilities`` and
    ``device`` from module scope. Whenever the validation accuracy improves,
    the model state is written to ``checkpoint_path`` and remembered in memory.

    Args:
        model: network to optimise.
        criterion_kd: callable invoked as
            ``criterion_kd(outputs, labels, teacher_outputs, alpha=..., temperature=...)``
            that returns a scalar loss tensor.
        optimizer: optimizer over ``model``'s parameters.
        scheduler: learning-rate scheduler, stepped once per epoch after the
            training pass.
        alpha: forwarded to ``criterion_kd`` as ``alpha``.
        num_epochs: number of epochs to run.
        temp: forwarded to ``criterion_kd`` as ``temperature``.
        checkpoint_path: file that receives the best checkpoint, a dict with
            keys 'model', 'acc' (float) and 'epoch'.

    Returns:
        ``model`` with the weights of the best validation epoch loaded.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(1, num_epochs+1):
        # Epochs are numbered from 1, so the final one is num_epochs itself.
        print(f'Epoch {epoch}/{num_epochs}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in tqdm(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)
                # One row of soft_probabilities is looked up per label.
                teacher_outputs = soft_probabilities[labels]

                # zero the parameter gradients
                optimizer.zero_grad()

                # Track gradients only while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    # The loss is computed on float64 scores.
                    outputs = outputs.double()
                    _, preds = torch.max(outputs, 1)
                    loss = criterion_kd(outputs, labels, teacher_outputs, alpha=alpha, temperature=temp)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Statistics are kept as plain Python numbers so the reported
                # and checkpointed accuracy is a float, not a device tensor.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += (preds == labels).sum().item()
            if is_train:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # Keep the best model seen so far, on disk and in memory.
            if phase == 'val' and epoch_acc > best_acc:
                print('Saving..')
                state = {
                    'model': model.state_dict(),
                    'acc': epoch_acc,
                    'epoch': epoch,
                }
                torch.save(state, checkpoint_path)
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:.4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
import time
import copy

def accuracyAtk(output, target, topk=(1,5)):
    """
    Count the top-k hits in a batch for each k in ``topk``.

    A sample is a top-k hit when its target class is among the k
    highest-scoring classes of its row in ``output``.

    Args:
        output: 2-D score tensor with one row per sample and one column per class.
        target: 1-D tensor of class indices, one per sample.
        topk: iterable of k values. Values larger than the number of classes
            are treated as "all classes".

    Returns:
        A list with one shape-(1,) float tensor per k holding the NUMBER of
        hits in the batch (not a ratio); divide by the sample count to obtain
        an accuracy.
    """
    with torch.no_grad():
        # Never request more predictions than there are classes, so the default
        # top-5 also works for outputs with fewer than five columns.
        maxk = min(max(topk), output.size(1))

        _, pred = output.topk(maxk, 1, True, True)  # (batch, maxk), best class first
        pred = pred.t()                             # (maxk, batch)
        # Row r marks the samples whose (r+1)-th best prediction is the target.
        correct = (pred == target.unsqueeze(dim=0)).expand_as(pred)

        return [
            correct[:k].reshape(-1).float().sum(0, keepdim=True)
            for k in topk
        ]

def eval_model(model):
    """Evaluate ``model`` on ``dataloaders['test']`` and report top-1 / top-5 accuracy.

    Reads ``dataloaders``, ``dataset_sizes`` and ``device`` from module scope
    and uses ``accuracyAtk`` to count the hits of each batch. The model is put
    in eval mode and is left in that mode.

    Returns:
        ``(top1_acc, top5_acc)`` as Python floats (hits divided by
        ``dataset_sizes['test']``).
    """
    model.eval()

    top1_hits, top5_hits = 0.0, 0.0

    for inputs, labels in tqdm(dataloaders['test']):
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(inputs)
            top1, top5 = accuracyAtk(outputs, labels)
            # .item() converts the shape-(1,) hit counts into plain floats so
            # the report shows numbers rather than tensor reprs.
            top1_hits += top1.item()
            top5_hits += top5.item()

    top1_acc = top1_hits / dataset_sizes['test']
    top5_acc = top5_hits / dataset_sizes['test']
    print(f'Top 1 test accuracy = {top1_acc:.4f}, Top 5 test accuracy = {top5_acc:.4f}')
    return top1_acc, top5_acc

In [None]:
# eval_model(model_ft)

In [None]:
from torch.optim import lr_scheduler
# Hyper-parameter sweep: train and evaluate one fresh ResNet18 student for every
# combination of temperature t in {20, 10, 5, 1} and alpha = i/10 in {0.1, ..., 0.9}.
# NOTE(review): train_model is called without a per-run checkpoint location, so
# every run writes its best checkpoint to the same file and overwrites the
# previous run's checkpoint.
for t in [20, 10, 5, 1]:
    for i in range(1, 10, 1):
        # New, randomly initialised 100-class student for this configuration.
        model_ft = ResNet18(num_classes=100)
        model_ft = model_ft.to(device)

        criterion = loss_fn_kd
        # SGD with momentum and weight decay; the learning rate is multiplied
        # by 0.2 at epochs 60, 120 and 160.
        optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
        exp_lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer_ft, milestones=[60, 120, 160], gamma=0.2)
        model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, i/10, num_epochs=200, temp=t)
        eval_model(model_ft)
        # Drop the reference before the next configuration builds a new model.
        del model_ft