In [12]:
from datasets import dog_dataset, cub_dataset, food_dataset, cub_and_dogs
from models.models_to_finetune import deit_small_patch16_224, myresnetv2_task1, myresnetv2_task2, myresnetv2_for_c_loss
import PIL
import numpy as np
from tqdm import tqdm
import torch
import torch.optim as optim
from torchvision import transforms
import config
import sys
import math
from loss import CenterLoss
from run_center_loss import train_model_with_closs
from vit.vit_pytorch.nest import NesT
import timm
from torch.optim.lr_scheduler import ReduceLROnPlateau
from models import bilinear_model
from run import train_model


All of the models can be trained using the two **main.py** files in the submission folder. For the sake of clarity, this notebook contains only the code needed to run inference on the selected models.

In [2]:
# Shared hyper-parameters for the experiments in this notebook.
epochs = 50
batch_size = 32

# Per-channel normalisation statistics (the standard ImageNet values).
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)

# Evaluation pipeline: resize to 448x448, convert to tensor, normalise.
test_transform = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

# Augmented training pipeline: same as above plus a random rotation
# (RandomRotation(20)) and a Gaussian blur applied before tensor conversion.
data_transform4 = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.RandomRotation(20),
    transforms.GaussianBlur(3, sigma=(0.1, 2.0)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

*In task 1, resnetv2-448 gave the best results among the CNN-based models, while cait_xxs_24_384 gave the best results among the transformer-based models on the CUB dataset. In this task we use these models as baselines and run different experiments by changing the loss function as well as the architecture.* Training longer increased accuracy in every experimental setting; for a fair comparison, we compare the accuracy achieved within 30 epochs. \
**Kindly check the excel sheet provided in the submission to look through all the experiments done for task 2.**

## Center Loss
In addition to the cross-entropy loss used for classification, we also tried to incorporate center loss into the objective function. The reasoning behind this was that, because there is little inter-class variation among the classes, center loss would help separate the classes in the feature space and make classification easier. **The implementation of the loss function is in the loss.py file.**
Here we show the results of center loss on resnetv2-448 using the CUB and dogs datasets. For the sake of clarity, we only show models trained without any data augmentation (just resize and normalisation).

In [15]:
# The dataloaders are implemented in the project's `datasets` module
# (see the imports at the top of the notebook).

# Training pipeline without augmentation: resize to 448x448 and normalise only.
data_transform = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

# Combined CUB + dogs loaders; train data uses `data_transform`,
# evaluation data uses `test_transform`.
train_loader, val_loader, test_loader = cub_and_dogs(
    bs=batch_size,
    data_transform=data_transform,
    test_transform=test_transform,
)

In [16]:
# Evaluate the ResNetV2 model that was fine-tuned with cross-entropy + center loss.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# 320 output classes (presumably 200 CUB + 120 dog classes; confirm against the dataset).
model = myresnetv2_for_c_loss(num_classes=320)
model = model.to(device)

# Every parameter is left trainable; nothing is frozen here.
for param in model.parameters():
    param.requires_grad = True

# Split the parameters into the `head.*` layers listed below and everything else,
# so that each group can be given its own learning rate in the optimizer.
my_list = ['head.1.weight', 'head.1.bias','head.3.weight', 'head.3.bias']
params = [kv for kv in model.named_parameters() if kv[0] in my_list]
base_params = [kv for kv in model.named_parameters() if kv[0] not in my_list]

crit_entr = torch.nn.CrossEntropyLoss()
crit_closs = CenterLoss(num_classes=320, feat_dim=512)

# NOTE(review): absolute, machine-specific path; consider a configurable checkpoint directory.
path = "/home/hashmat.malik/Fall 2021/CV703 Lab/Week5/datasets/Task2:cub_and_dogs/Exp1/model_best_resnet_v2_cubs_dogs_0.pth.tar"
# map_location keeps the CPU fallback chosen for `device` working: without it, a
# checkpoint saved from a CUDA run cannot be deserialised on a CPU-only machine.
checkpoint = torch.load(path, map_location=device)
model.load_state_dict(checkpoint['state_dict'])

# Three parameter groups: head layers (lr 1e-4), remaining model parameters (lr 1e-5)
# and the center-loss parameters (lr 1e-2).
optimizer = optim.Adam([
    {'params': [named[1] for named in params], 'lr': 0.0001, 'betas': (0.5, 0.999)},
    {'params': [named[1] for named in base_params], 'lr': 0.00001, 'betas': (0.5, 0.999)},
    {'params': crit_closs.parameters(), 'lr': 0.01, 'betas': (0.5, 0.999)},
])

# 'max' mode: the scheduler expects a metric that should increase
# (presumably validation accuracy; confirm in run_center_loss.py).
scheduler = ReduceLROnPlateau(optimizer, 'max')

# is_train=False: the logged output below contains only "Test:" lines, so this
# presumably runs evaluation of the loaded checkpoint only; confirm in run_center_loss.py.
train_model_with_closs(
    30,
    train_loader,
    val_loader,
    test_loader,
    optimizer,
    scheduler,
    crit_entr,
    crit_closs,
    model,
    f'resnet_v2_closs_new_lr_{0.01}',
    is_train=False,
)


Test: [ 0/57]	Time  0.670 ( 0.670)	ent_Loss 4.6718e-01 (4.6718e-01)	center_loss 2.7889e+03 (2.7889e+03)	loss 8.8340e+00 (8.8340e+00)	Acc@1  90.62 ( 90.62)	Acc@5  96.88 ( 96.88)
Test: [ 5/57]	Time  0.307 ( 0.392)	ent_Loss 6.0692e-01 (5.8485e-01)	center_loss 2.6108e+03 (2.6260e+03)	loss 8.4393e+00 (8.4629e+00)	Acc@1  87.50 ( 85.94)	Acc@5  96.88 ( 96.35)
Test: [10/57]	Time  0.344 ( 0.371)	ent_Loss 8.7803e-01 (6.5627e-01)	center_loss 2.6980e+03 (2.6443e+03)	loss 8.9719e+00 (8.5892e+00)	Acc@1  71.88 ( 83.81)	Acc@5 100.00 ( 97.16)
Test: [15/57]	Time  0.351 ( 0.369)	ent_Loss 5.8840e-01 (6.5883e-01)	center_loss 2.7748e+03 (2.6576e+03)	loss 8.9129e+00 (8.6316e+00)	Acc@1  84.38 ( 83.01)	Acc@5  93.75 ( 97.27)
Test: [20/57]	Time  0.341 ( 0.362)	ent_Loss 7.4801e-01 (6.5629e-01)	center_loss 2.5609e+03 (2.6474e+03)	loss 8.4308e+00 (8.5986e+00)	Acc@1  68.75 ( 82.59)	Acc@5  96.88 ( 97.32)
Test: [25/57]	Time  0.318 ( 0.355)	ent_Loss 7.6398e-01 (6.8024e-01)	center_loss 2.5907e+03 (2.6607e+03)	loss 8.5360

Test: [170/450]	Time  0.248 ( 0.263)	ent_Loss 4.7232e-01 (7.3439e-01)	center_loss 2.3768e+03 (2.3916e+03)	loss 7.6028e+00 (7.9092e+00)	Acc@1  81.25 ( 79.61)	Acc@5 100.00 ( 96.45)
Test: [175/450]	Time  0.260 ( 0.263)	ent_Loss 2.1642e-01 (7.3039e-01)	center_loss 2.0155e+03 (2.3846e+03)	loss 6.2629e+00 (7.8841e+00)	Acc@1  93.75 ( 79.72)	Acc@5 100.00 ( 96.47)
Test: [180/450]	Time  0.253 ( 0.264)	ent_Loss 1.1116e+00 (7.3373e-01)	center_loss 2.3581e+03 (2.3814e+03)	loss 8.1859e+00 (7.8779e+00)	Acc@1  65.62 ( 79.49)	Acc@5  93.75 ( 96.50)
Test: [185/450]	Time  0.261 ( 0.264)	ent_Loss 6.2038e-01 (7.3389e-01)	center_loss 2.3651e+03 (2.3823e+03)	loss 7.7157e+00 (7.8808e+00)	Acc@1  75.00 ( 79.42)	Acc@5 100.00 ( 96.52)
Test: [190/450]	Time  0.249 ( 0.264)	ent_Loss 8.7020e-01 (7.3582e-01)	center_loss 2.2315e+03 (2.3791e+03)	loss 7.5648e+00 (7.8731e+00)	Acc@1  68.75 ( 79.32)	Acc@5 100.00 ( 96.55)
Test: [195/450]	Time  0.254 ( 0.264)	ent_Loss 2.4061e-01 (7.3138e-01)	center_loss 2.5247e+03 (2.3819e+03)

Test: [400/450]	Time  0.284 ( 0.274)	ent_Loss 2.6336e-01 (7.1310e-01)	center_loss 3.6607e+03 (2.7380e+03)	loss 1.1245e+01 (8.9270e+00)	Acc@1  90.62 ( 79.90)	Acc@5 100.00 ( 96.76)
Test: [405/450]	Time  0.295 ( 0.275)	ent_Loss 6.1409e-01 (7.0981e-01)	center_loss 3.8956e+03 (2.7496e+03)	loss 1.2301e+01 (8.9585e+00)	Acc@1  81.25 ( 80.02)	Acc@5 100.00 ( 96.77)
Test: [410/450]	Time  0.271 ( 0.275)	ent_Loss 7.1481e-02 (7.0846e-01)	center_loss 3.3494e+03 (2.7605e+03)	loss 1.0120e+01 (8.9900e+00)	Acc@1 100.00 ( 80.05)	Acc@5 100.00 ( 96.79)
Test: [415/450]	Time  0.250 ( 0.275)	ent_Loss 4.5042e-01 (7.0464e-01)	center_loss 3.3429e+03 (2.7712e+03)	loss 1.0479e+01 (9.0182e+00)	Acc@1  84.38 ( 80.21)	Acc@5  96.88 ( 96.81)
Test: [420/450]	Time  0.306 ( 0.275)	ent_Loss 3.4262e-01 (7.0257e-01)	center_loss 3.7493e+03 (2.7848e+03)	loss 1.1590e+01 (9.0570e+00)	Acc@1  90.62 ( 80.31)	Acc@5 100.00 ( 96.79)
Test: [425/450]	Time  0.312 ( 0.276)	ent_Loss 9.1621e-01 (7.0454e-01)	center_loss 3.9368e+03 (2.7957e+03)

Reached top1 validation accuracy of $82.06\%$ and top1 test accuracy of $81.15\%$. This is still lower than the test accuracy of resnetv2-448 ($82.09\%$) trained using only the cross-entropy loss.

## Fusion Model
In this experiment we tried to combine a transformer model (in our case cait_xxs_24_384) with a CNN model (in our case resnetv2). **The implementation of the model is in the models folder (bilinear_model.py)**. We extract features from the third block of the resnet backbone. For the transformer, we combine all the patch tokens coming out of the encoder to form a feature map. We then combine both feature maps using a transfusion module block, in the hope that it will integrate the learned features from both backbones and give a more powerful feature representation.

For the sake of clarity, we only show models trained without any data augmentation (just resize and normalisation). Training on larger-resolution images gives a boost to the accuracy, as can be seen when using resnetv2 with $448\times 448$ images. However, to reduce computation and for a fair comparison with the Fusion model, which uses $384 \times 384$ inputs for both backbones, we compare it with the accuracy of resnetv2_384 (top1 test accuracy $81.61 \%$) and cait_384 (top1 test accuracy $79.89 \%$).

In [8]:
# Data setup for the fusion experiment: both pipelines resize to 384x384 and
# normalise, with no augmentation on the training side.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

test_transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

data_transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

# Rebuild the combined CUB + dogs loaders with the 384x384 pipelines.
train_loader, val_loader, test_loader = cub_and_dogs(
    bs=batch_size,
    data_transform=data_transform,
    test_transform=test_transform,
)

Here both backbones are pretrained on ImageNet. The backbones are frozen; only the transfusion block and the final layers are trained.

In [10]:
# Build the fusion model and restore the best checkpoint of the fusion experiment.
model = bilinear_model.TransFuse_S(num_classes=320, pretrained=True)
model = model.to(device)
criterion = torch.nn.CrossEntropyLoss()

# NOTE(review): absolute, machine-specific path; consider a configurable checkpoint directory.
path = "/home/hashmat.malik/Fall 2021/CV703 Lab/Week5/datasets/modelresnetv2_fusion_4_best.pth.tar"
# map_location keeps the CPU fallback chosen for `device` working: without it, a
# checkpoint saved from a CUDA run cannot be deserialised on a CPU-only machine.
checkpoint = torch.load(path, map_location=device)
model.load_state_dict(checkpoint['state_dict'])

# The optimizer is passed on to train_model in a later cell.
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.5, 0.999))


In [14]:
# is_train=False: the logged output below contains only "Test:" lines, so this
# presumably evaluates the loaded checkpoint without training; confirm in run.py.
train_model(
    30,
    train_loader,
    val_loader,
    test_loader,
    optimizer,
    criterion,
    model,
    f'resnetv2_fusion_{1}',
    is_train=False,
)

Test: [ 0/57]	Time  1.020 ( 1.020)	Loss 7.4345e-01 (7.4345e-01)	Acc@1  75.00 ( 75.00)	Acc@5  96.88 ( 96.88)
Test: [ 5/57]	Time  0.452 ( 0.559)	Loss 5.1658e-01 (6.4955e-01)	Acc@1  81.25 ( 78.12)	Acc@5 100.00 ( 97.92)
Test: [10/57]	Time  0.452 ( 0.513)	Loss 8.8987e-01 (7.5851e-01)	Acc@1  71.88 ( 76.42)	Acc@5 100.00 ( 97.44)
Test: [15/57]	Time  0.451 ( 0.496)	Loss 9.9967e-01 (8.0272e-01)	Acc@1  68.75 ( 75.98)	Acc@5  90.62 ( 96.48)
Test: [20/57]	Time  0.454 ( 0.487)	Loss 6.0313e-01 (7.6953e-01)	Acc@1  84.38 ( 77.23)	Acc@5  93.75 ( 96.28)
Test: [25/57]	Time  0.453 ( 0.481)	Loss 5.9471e-01 (7.4037e-01)	Acc@1  75.00 ( 78.12)	Acc@5 100.00 ( 96.63)
Test: [30/57]	Time  0.454 ( 0.478)	Loss 1.1169e+00 (7.4522e-01)	Acc@1  71.88 ( 77.92)	Acc@5  96.88 ( 96.67)
Test: [35/57]	Time  0.455 ( 0.476)	Loss 8.5409e-01 (7.1897e-01)	Acc@1  75.00 ( 78.99)	Acc@5  93.75 ( 96.88)
Test: [40/57]	Time  0.453 ( 0.474)	Loss 9.0299e-01 (7.1328e-01)	Acc@1  71.88 ( 78.81)	Acc@5 100.00 ( 97.18)
Test: [45/57]	Time  0.455 ( 

Test: [315/450]	Time  0.465 ( 0.471)	Loss 6.9104e-01 (6.5879e-01)	Acc@1  78.12 ( 80.96)	Acc@5  93.75 ( 97.48)
Test: [320/450]	Time  0.467 ( 0.471)	Loss 1.8913e+00 (6.7221e-01)	Acc@1  40.62 ( 80.56)	Acc@5  93.75 ( 97.34)
Test: [325/450]	Time  0.466 ( 0.471)	Loss 1.1613e+00 (6.8110e-01)	Acc@1  65.62 ( 80.24)	Acc@5  93.75 ( 97.25)
Test: [330/450]	Time  0.464 ( 0.471)	Loss 1.7752e+00 (6.8543e-01)	Acc@1  40.62 ( 80.07)	Acc@5  87.50 ( 97.22)
Test: [335/450]	Time  0.468 ( 0.471)	Loss 9.4993e-01 (6.8660e-01)	Acc@1  75.00 ( 80.01)	Acc@5  93.75 ( 97.19)
Test: [340/450]	Time  0.465 ( 0.471)	Loss 8.6385e-01 (6.8553e-01)	Acc@1  71.88 ( 80.09)	Acc@5  93.75 ( 97.19)
Test: [345/450]	Time  0.466 ( 0.471)	Loss 1.0148e+00 (6.8698e-01)	Acc@1  78.12 ( 80.05)	Acc@5  87.50 ( 97.14)
Test: [350/450]	Time  0.470 ( 0.471)	Loss 1.8649e-01 (6.8754e-01)	Acc@1  93.75 ( 80.07)	Acc@5 100.00 ( 97.08)
Test: [355/450]	Time  0.469 ( 0.471)	Loss 1.1940e+00 (6.9152e-01)	Acc@1  78.12 ( 80.00)	Acc@5  90.62 ( 97.02)
Test: [360

Reached top1 validation accuracy of $78.28\%$ and top1 test accuracy of $77.153\%$. This is lower than the individual accuracies of both resnetv2-384 and cait_384. We also tried to fine-tune the whole model by unfreezing the backbones. However, this led to a decrease in accuracy, which we believe is due to the loss of the pretrained representations. Results can be seen in the Excel sheet. While data augmentation increased the accuracy, it was still lower than the accuracy achieved by resnetv2 and cait separately.