In [65]:
# Imports
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from spacy.tokenizer import Tokenizer
from sklearn.model_selection import train_test_split
import spacy
import pandas as pd
import numpy as np
import os
import re
from nltk.corpus import stopwords 
import random
from tqdm import tqdm
from PIL import Image
import math
import json
from collections import defaultdict

In [2]:
# Tokenizer using spacy
nlp = spacy.load("en_core_web_sm")
tokenizer = Tokenizer(nlp.vocab)

In [80]:
# Check accuracy function
def check_accuracy(output,labels):
    _ , predpos = output.max(1)
    num_samples=len(labels)
    num_correct=(predpos==labels).sum()
    return (num_correct/num_samples)*100

# Save checkpoint
def save_checkpoint(state,filename='weights.pth.tar'):
    print('Saving weights-->')
    torch.save(state,filename)

# Load checkpoint
def load_checkpoint(filename,model,optim):
    print('Loading weights-->')
    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint['state_dict'])
    optim.load_state_dict(checkpoint['optimizer'])

In [4]:
def create_dataframe(BASE_PATH,json_file,image_folder):
    path = os.path.join(BASE_PATH,"annotations/"+json_file)
    with open(path) as f:
        data = json.load(f)
        data = data['annotations']

    img_cap_pairs = []

    for sample in data:
        img_name = '%012d.jpg' % sample['image_id']
        img_cap_pairs.append([img_name, sample['caption']])

    captions = pd.DataFrame(img_cap_pairs, columns=['image', 'caption'])
    captions['image'] = captions['image'].apply(
        lambda x: f'{BASE_PATH}/{image_folder}/{x}'
    )
    captions = captions.reset_index(drop=True)
    return captions

In [5]:
train_df = create_dataframe("/kaggle/input/coco-2017-dataset/coco2017","captions_train2017.json","train2017")
val_df = create_dataframe("/kaggle/input/coco-2017-dataset/coco2017","captions_val2017.json","val2017")

In [6]:
train_df = train_df.sample(40000)
train_df = train_df.reset_index(drop=True)

In [7]:
val_df  =val_df.sample(10000)
val_df = val_df.reset_index(drop=True)

In [8]:
print(train_df.shape, val_df.shape)

(40000, 2) (10000, 2)


In [9]:
train_df.head()

Unnamed: 0,image,caption
0,/kaggle/input/coco-2017-dataset/coco2017/train...,A close up of a person holding a battered onio...
1,/kaggle/input/coco-2017-dataset/coco2017/train...,A cookie-ice cream sandwich is served with a s...
2,/kaggle/input/coco-2017-dataset/coco2017/train...,A bare bathroom with a sink and toilet.
3,/kaggle/input/coco-2017-dataset/coco2017/train...,A man in sunglasses and a blue shirt holds a m...
4,/kaggle/input/coco-2017-dataset/coco2017/train...,Two double decker buses pull away from a build...


In [10]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}


stop_words = set(stopwords.words('english'))

In [11]:
def text_cleaner(text):
    newString = text.lower()
    newString = newString.replace('"', "'")
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub('"','', newString)
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])    
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString) 
    tokens = [w for w in newString.split()]
    return " ".join(tokens)

In [12]:
# Clean text in train and val dataframe
train_df['caption'] = train_df['caption'].apply(lambda x: [token.text.lower() for token in tokenizer(text_cleaner(x))])
val_df['caption'] = val_df['caption'].apply(lambda x: [token.text.lower() for token in tokenizer(text_cleaner(x))])

In [13]:
# Add START AND END tokens to summary
train_df['caption'] = train_df['caption'].apply(lambda x : ['_START_']+ x + ['_END_'])
val_df['caption'] = val_df['caption'].apply(lambda x : ['_START_']+ x + ['_END_'])

In [14]:
train_df.head()

Unnamed: 0,image,caption
0,/kaggle/input/coco-2017-dataset/coco2017/train...,"[_START_, a, close, up, of, a, person, holding..."
1,/kaggle/input/coco-2017-dataset/coco2017/train...,"[_START_, a, cookie, ice, cream, sandwich, is,..."
2,/kaggle/input/coco-2017-dataset/coco2017/train...,"[_START_, a, bare, bathroom, with, a, sink, an..."
3,/kaggle/input/coco-2017-dataset/coco2017/train...,"[_START_, a, man, in, sunglasses, and, a, blue..."
4,/kaggle/input/coco-2017-dataset/coco2017/train...,"[_START_, two, double, decker, buses, pull, aw..."


In [15]:
def get_max_seqlen():
    max_length = 0
    for index, row in train_df.iterrows():
        # Calculate the length of the current row
        row_length = len(row['caption'])
        # Update the maximum length if the current row length is greater
        max_length = max(max_length, row_length)
    for index, row in val_df.iterrows():
        # Calculate the length of the current row
        row_length = len(row['caption'])
        # Update the maximum length if the current row length is greater
        max_length = max(max_length, row_length)
    print("Max length in dataset ",max_length)
    return max_length

In [16]:
# Build vocabularies - each word has an index, note : words sorted in ascending order
all_tokens = train_df['caption'].tolist() + val_df['caption'].tolist()
target_vocab = {actual_word: idx for idx, (word_num, actual_word) in enumerate(sorted(enumerate(set(token for tokens in all_tokens for token in tokens)), key=lambda x: x[1]))}

In [17]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using",device)

Using cuda


In [18]:
temp = list(sorted(target_vocab.items()))
for word, idx in temp[-5:]:
    print(word,idx)

zone 9913
zoo 9914
zookeeper 9915
zooming 9916
zucchini 9917


In [19]:
# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, dataframe, target_vocab, image_transform=None):
        self.dataframe = dataframe
        self.target_vocab = target_vocab
        self.image_transform = image_transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path=self.dataframe.loc[idx]['image']
        image=Image.open(img_path)
        caption = [self.target_vocab[word] for word in self.dataframe.loc[idx]['caption']]
        if self.image_transform:
            image = self.image_transform(image)
        if image.shape[0] != 3:
            return torch.randn((3,512,512)),torch.tensor(caption)
        return image,torch.tensor(caption)

In [20]:
transform =transforms.Compose([
    transforms.Resize((512,512)),
    transforms.ToTensor()
])

In [21]:
# Create custom datasets
train_dataset = CustomDataset(train_df, target_vocab, transform)
val_dataset = CustomDataset(val_df, target_vocab, transform)

In [22]:
print(len(train_dataset))

40000


In [23]:
# Define collate function for DataLoader
def collate_fn(batch):
    images, captions = zip(*batch)
    images = torch.stack(images,dim=0)
    padded_captions = pad_sequence(captions, batch_first=True)
    return images, padded_captions

In [24]:
# Use pretrained resnet101 as feature extractor
resnet_test = torchvision.models.resnet101(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnet101-63fe2227.pth" to /root/.cache/torch/hub/checkpoints/resnet101-63fe2227.pth
100%|██████████| 171M/171M [00:01<00:00, 153MB/s]  


In [25]:
resnet_test.fc = nn.Linear(in_features = 2048, out_features = 512)

In [26]:
print(resnet_test)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [27]:
# Use pretrained resnet101 as feature extractor
def get_resnet_encoder(out_features,pretrained=True):
    resnet_encoder = torchvision.models.resnet101(pretrained=pretrained)
    # Modify this model to encode feature to embedding_dim = 512, so that the 
    # image feature encoding can be used for Cross Attention in Decoder only Transformer
    resnet_encoder.fc = nn.Linear(in_features = 2048, out_features = out_features)
    return resnet_encoder

In [28]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(MultiHeadAttention,self).__init__()
        assert embedding_dim % num_heads == 0, "embedding_dim must be divisible by num_heads"

        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.dim_perhead = embedding_dim // num_heads

        self.W_q = nn.Linear(embedding_dim, embedding_dim)
        self.W_k = nn.Linear(embedding_dim, embedding_dim)
        self.W_v = nn.Linear(embedding_dim, embedding_dim)
        self.W_o = nn.Linear(embedding_dim, embedding_dim)

    def scaled_dot_product_attention(self,Q,K,V,mask=None):
        # Q,K,V Shape : [Batch_Size X Num_Heads X Seq_len X Dim Per Head]
        
        K = K.transpose(-2,-1) # K = K.permute(0,1,3,2) also works
        # K Shape(after permute) : [Batch_Size X Num_Heads X Dim Per Head X Seq_len]
        attn_scores = torch.matmul(Q,K) / math.sqrt(self.dim_perhead)
        # attn_scores Shape : [Batch_Size X Num_Heads X Seq_len X Seq_len]
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        # attn_probs Shape : [Batch_Size X Num_Heads X Seq_len X Seq_len]
        output = torch.matmul(attn_probs, V)
        # output Shape : [Batch_Size X Num_Heads X Seq_len X Dim Per Head]
        return output

    def split_heads(self, x):
        # X shape : [Batch_Size X Seq_len X Embedding Dim]
        batch_size, seq_length, d_model = x.size()
        x = x.view(batch_size, seq_length,self.num_heads,self.dim_perhead)
        # X shape : [Batch_Size X Seq_len X Num_Heads X Dim Per Head]
        x = x.transpose(1,2)
        # X shape : [Batch_Size X Num_Heads X Seq_len X Dim Per Head]
        return x

    def combine_heads(self, x):
        # x Shape : [Batch_Size X Num_Heads X Seq_len X Dim Per Head]
        batch_size, _, seq_length, dim_perhead = x.size()
        x = x.transpose(1,2).contiguous()
        # x Shape : [Batch_Size X Seq_len X Num_Heads X Dim Per Head]
        x = x.view(batch_size, seq_length,self.embedding_dim)
        # x Shape : [Batch_Size X Seq_len X Embedding Dim]
        return x

    def forward(self, Q, K, V, mask=None):
        # Q,K,V Shape : [Batch_Size X Seq_len X Embedding Dim]
        Q = self.split_heads(self.W_q(Q)) 
        K = self.split_heads(self.W_k(K)) 
        V = self.split_heads(self.W_v(V)) 
        # Q,K,V Shape : [Batch_Size X Num_Heads X Seq_len X Dim Per Head]
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        # attn_output Shape : [Batch_Size X Num_Heads X Seq_len X Dim Per Head]
        output = self.W_o(self.combine_heads(attn_output))
        # output Shape :  # x Shape : [Batch_Size X Seq_len X Embedding Dim]
        return output

In [29]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # shape does not change here
        return self.fc2(F.relu(self.fc1(x)))

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length,dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model,2).float() * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe',pe.unsqueeze(0))

    def forward(self, x):
        # shape does not change here, adding positional encoding information
        return x + self.pe[:, :x.size(1)]

In [30]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x,tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x,enc_output,enc_output,src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [103]:
class Encoder_Decoder(nn.Module):
    def __init__(self, resnet_encoder, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, device):
        super(Encoder_Decoder, self).__init__()
        self.encoder = resnet_encoder
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        self.device = device

    def generate_mask(self, src, tgt):
        src_mask = None
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, img, caption):
        # img shape : [batch_size X 3 X 512 X 512]  ,  caption shape : [batch_size X seq_len]
        src_mask, caption_mask = self.generate_mask(img, caption)
        caption_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(caption)))
        # caption_embedded : [batch_size X seq_len X embedding_dim]
        
        enc_output = self.encoder(img) 
        # enc_output shape : [batch_size X 512]
        enc_output = enc_output.unsqueeze(1)
        # enc_output shape : [batch_size X 1 X 512]
        dec_output = caption_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, caption_mask)
        output = self.fc(dec_output)
        return output

In [32]:
tgt_vocab_size = len(target_vocab)
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = get_max_seqlen()
dropout = 0.1
num_workers = 2
num_epochs = 5

Max length in dataset  51


In [91]:
resnet_encoder = get_resnet_encoder(d_model,pretrained=True)
model = Encoder_Decoder(resnet_encoder, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout,device)
print(model)

Encoder_Decoder(
  (encoder): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
  

In [34]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(trainable_params)

78939390


In [35]:
# Specify optimizer and loss function
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0)

In [36]:
# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn, num_workers=num_workers)

In [37]:
source_dummy,target_dummy = next(iter(train_loader))

In [38]:
print(source_dummy.shape,target_dummy.shape)

torch.Size([8, 3, 512, 512]) torch.Size([8, 15])


In [39]:
print(target_dummy[3].dtype)

torch.int64


In [40]:
print(torch.min(target_dummy),torch.max(target_dummy))

tensor(0) tensor(9783)


In [41]:
model.to(device)
source_dummy = source_dummy.to(device)
target_dummy = target_dummy.to(device)
print()




In [42]:
y_pred = model(source_dummy,target_dummy)
print(y_pred.shape,target_dummy.shape)

torch.Size([8, 15, 9918]) torch.Size([8, 15])


In [43]:
y_pred = y_pred.reshape(-1,len(target_vocab))
target_dummy = target_dummy.reshape(-1)
print(y_pred.shape,target_dummy.shape)

torch.Size([120, 9918]) torch.Size([120])


In [44]:
def train_loop(model,dataloader,loss_fun,optimizer,device):
    model.train()
    model.to(device)
    min_loss = None
    for epoch in range(num_epochs):
        losses = []
        accuracies = []
        loop = tqdm(enumerate(dataloader), total=len(dataloader), leave=True)
        for batch,(x,y) in loop:
            # put on cuda
            x = x.to(device)
            y = y.to(device)
    
            # forward pass
            y_pred = model(x,y)
            
            # calculate loss & accuracy
            loss = loss_fun(y_pred.reshape(-1,len(target_vocab)),y.reshape(-1))
            losses.append(loss.detach().item())
            
            accuracy = check_accuracy(y_pred.reshape(-1,len(target_vocab)),y.reshape(-1))
            accuracies.append(accuracy.detach().item())
            
            # zero out prior gradients
            optimizer.zero_grad()
            
            # backprop
            loss.backward()
            
            # update weights
            optimizer.step()
            scheduler.step()
            
            # Update TQDM progress bar
            loop.set_description(f"Epoch [{epoch}/{num_epochs}] ")
            loop.set_postfix(loss=loss.detach().item(), accuracy=accuracy.detach().item())

        moving_loss = sum(losses) / len(losses)
        moving_accuracy = sum(accuracies) / len(accuracies)
        checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
        # Save check point
        if min_loss == None:
            min_loss = moving_loss
            save_checkpoint(checkpoint)
        elif moving_loss < min_loss:
            min_loss = moving_loss
            save_checkpoint(checkpoint)
        print('Epoch {0} : Loss = {1} , Training Accuracy={2}'.format(epoch, moving_loss, moving_accuracy))

In [None]:
train_loop(model,train_loader,criterion,optimizer,device)

Epoch [0/5] : 100%|██████████| 5000/5000 [30:59<00:00,  2.69it/s, accuracy=64.7, loss=0.525]  


Saving weights-->
Epoch 0 : Loss = 0.7552891996013 , Training Accuracy=66.54491707992554


Epoch [1/5] : 100%|██████████| 5000/5000 [30:55<00:00,  2.70it/s, accuracy=75.8, loss=0.118]   


Saving weights-->
Epoch 1 : Loss = 0.26876360664280946 , Training Accuracy=69.9179668586731


Epoch [2/5] :  89%|████████▉ | 4460/5000 [27:47<03:22,  2.66it/s, accuracy=79.5, loss=0.181]   

In [48]:
def test_loop(model,dataloader,loss_fun,device):
    model.eval()
    model.to(device)
    losses = []
    samples,correct = 0,0
    loop = tqdm(enumerate(dataloader), total=len(dataloader), leave=True)
    with torch.no_grad():
        for batch,(x,y) in loop:
            # put on cuda
            x = x.to(device)
            y = y.to(device)

            # forward pass
            y_pred = model(x,y)
            
            # caclulate test loss
            loss = loss_fun(y_pred.reshape(-1,len(target_vocab)),y.reshape(-1))
            losses.append(loss.detach().item())

            # accuracy over entire dataset
            _,predpos=y_pred.reshape(-1,len(target_vocab)).max(1)
            samples+=len(y.reshape(-1))
            correct+=(predpos==y.reshape(-1)).sum().item()
            
            # Update TQDM progress bar
            loop.set_postfix(loss=loss.item())

    print("Final Test Accuracy = ",100 * (correct/samples))

In [60]:
def print_size_of_model(model):
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p")/1e6)
    os.remove('temp.p')

In [61]:
print_size_of_model(model)

Size (MB): 316.497228


In [93]:
import time

start = time.time()
test_loop(model,val_loader,criterion,device=torch.device("cuda"))
end = time.time()
print("Time Taken without Quantizing on CUDA",end - start)

100%|██████████| 1250/1250 [02:35<00:00,  8.04it/s, loss=0.228]   


Final Test Accuracy =  69.67381612839311
Time Taken without Quantizing on CUDA 155.81649684906006


## Exploring Quantization on CPU, create model on CPU and load saved weights

In [99]:
# Create small val set for CPU
val_df = val_df.sample(500)
val_df = val_df.reset_index(drop=True)
val_dataset = CustomDataset(val_df, target_vocab, transform)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn, num_workers=num_workers)

In [104]:
resnet_encoder = get_resnet_encoder(d_model,pretrained=True)
model = Encoder_Decoder(resnet_encoder, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout,torch.device("cpu"))

In [105]:
load_checkpoint("/kaggle/working/weights.pth.tar",model,optimizer)

Loading weights-->


In [107]:
# Check time for 500 samples on CPU without quantiztion
start = time.time()
test_loop(model,val_loader,criterion,device=torch.device("cpu"))
end = time.time()
print("Time Taken without Quantizing on CPU for 500 samples",end - start)

100%|██████████| 63/63 [04:16<00:00,  4.06s/it, loss=0.0192]  

Final Test Accuracy =  69.92303872889771
Time Taken without Quantizing on CPU for 500 samples 256.63870787620544





### Dynamic Quantization or Post Training Quantization

In [113]:
# Exploring quantization
from copy import deepcopy
quantized_model = deepcopy(model).to("cpu")
quantized_model = torch.quantization.quantize_dynamic(quantized_model, {torch.nn.Linear, torch.nn.Conv2d, torch.nn.ReLU, torch.nn.BatchNorm2d}, dtype=torch.qint8)

In [114]:
print_size_of_model(quantized_model) # Reduced size of the model

Size (MB): 222.667296


In [115]:
# Check time for 500 samples on CPU with quantiztion
start = time.time()
test_loop(quantized_model,val_loader,criterion,device=torch.device("cpu"))
end = time.time()
print("Time Taken with Quantizing ",end - start)

100%|██████████| 63/63 [04:01<00:00,  3.83s/it, loss=0.0189]  

Final Test Accuracy =  69.92303872889771
Time Taken with Quantizing  241.28281617164612



