In [None]:
!rm -Rf sample_data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import Data

In [None]:
!cp /content/drive/MyDrive/Wipro/Final/Data/Flickr30k/*words* /content/drive/MyDrive/Wipro/Final/Data/Flickr30k/images.zip .

In [None]:
!unzip -q images.zip

In [None]:
!rm images.zip

In [None]:
!pip install timm
!pip install transformers

Collecting timm
  Downloading timm-0.4.12-py3-none-any.whl (376 kB)
[?25l[K     |▉                               | 10 kB 29.4 MB/s eta 0:00:01[K     |█▊                              | 20 kB 28.7 MB/s eta 0:00:01[K     |██▋                             | 30 kB 17.9 MB/s eta 0:00:01[K     |███▌                            | 40 kB 15.4 MB/s eta 0:00:01[K     |████▍                           | 51 kB 8.1 MB/s eta 0:00:01[K     |█████▏                          | 61 kB 8.2 MB/s eta 0:00:01[K     |██████                          | 71 kB 8.6 MB/s eta 0:00:01[K     |███████                         | 81 kB 9.0 MB/s eta 0:00:01[K     |███████▉                        | 92 kB 8.5 MB/s eta 0:00:01[K     |████████▊                       | 102 kB 9.4 MB/s eta 0:00:01[K     |█████████▋                      | 112 kB 9.4 MB/s eta 0:00:01[K     |██████████▍                     | 122 kB 9.4 MB/s eta 0:00:01[K     |███████████▎                    | 133 kB 9.4 MB/s eta 0:00:01[K    

# Model

In [None]:
import shutil
import torch
from torch.utils.data import DataLoader,Dataset
import transformers
from transformers import CLIPModel, CLIPConfig
import numpy as np
import matplotlib.pyplot as plt
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
import os 
import pandas as pd
import cv2 as cv
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import WeightedRandomSampler
from sklearn.metrics import  f1_score
from tqdm.autonotebook import tqdm

import tensorflow as tf
import tensorboard as tb
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile


from tensorboard.plugins import projector

In [None]:
import os
len(os.listdir('flickr30k_images'))

31784

In [None]:
import numpy as np
import pandas as pd

flickr_df = pd.read_csv('flickr30k_words.csv')
flickr_df.head()

Unnamed: 0,image_name,word
0,1000092795.jpg,two
1,1000092795.jpg,young
2,1000092795.jpg,guys
3,1000092795.jpg,shaggy
4,1000092795.jpg,hair


In [None]:
flickr_df.dropna(inplace=True)
# rem_images = ['image_1567.jpg','image_4924.jpg','image_5119.png','image_6357.jpg']
# flickr_df.drop(flickr_df[flickr_df['image_name'].isin(rem_images)].index,inplace=True)

In [None]:
flickr_df.shape

(1085685, 2)

In [None]:
len(flickr_df.image_name.unique())

31783

In [None]:
# CLIPModel(CLIPConfig())

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
device

device(type='cuda', index=0)

In [None]:
# CLIP Tokeniser
tokeniser = transformers.CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

Downloading:   0%|          | 0.00/862k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/568 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.49M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [None]:
class AvgMeter:
    """Running, count-weighted average of a scalar metric such as a loss.

    The statistics are plain attributes: ``avg`` (current mean), ``sum``
    (weighted total so far) and ``count`` (total weight so far).
    """

    def __init__(self, name="Metric"):
        self.name = name
        self.reset()

    def reset(self):
        """Set ``avg``, ``sum`` and ``count`` back to zero."""
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, count=1):
        """Fold in ``val`` weighted by ``count`` and refresh the mean."""
        self.count += count
        self.sum += val * count
        self.avg = self.sum / self.count

    def __repr__(self):
        # Rendered as "<name>: <avg to 4 decimals>".
        return "{}: {:.4f}".format(self.name, self.avg)

def get_lr(optimizer):
    """Return the ``"lr"`` of the optimizer's first parameter group.

    Returns ``None`` when ``optimizer.param_groups`` is empty.
    """
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group["lr"]

In [None]:
'''
  Dataset class
'''

class FlickrData(Dataset):
    """Image/text pair dataset backed by image files on disk.

    Each item pairs one image (looked up by file name under ``root_dir``) with
    one text string, tokenised to a fixed length.

    Args:
        root_dir: Directory containing the image files.
        img_names: Sequence of image file names, one per sample.
        ocr_text: Sequence of text strings aligned index-for-index with
            ``img_names``.
        tokeniser: Object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        transforms: Optional callable applied to the RGB image array.
        max_len: Length every text is padded/truncated to (default 76, the
            value that used to be hard-coded).
    """

    def __init__(self, root_dir, img_names, ocr_text, tokeniser, transforms=None, max_len=76):
        self.img_names = img_names
        self.ocr_text = ocr_text
        self.tokeniser = tokeniser
        self.root_dir = root_dir
        self.transforms = transforms
        self.max_len = max_len

    def __len__(self):
        return len(self.img_names)

    def __getitem__(self, idx):
        """Load sample ``idx`` as a dict with ``img``, ``input_ids`` and ``att_mask``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        img_path = os.path.join(self.root_dir, self.img_names[idx])
        img = cv.imread(img_path)
        if img is None:
            # cv.imread reports a missing/undecodable file by returning None;
            # fail here with the path instead of with an opaque error inside
            # cvtColor.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        # OpenCV loads BGR; convert to RGB before any transform sees it.
        img = cv.cvtColor(img, cv.COLOR_BGR2RGB)

        if self.transforms:
            img = self.transforms(img)

        output_token_ids = self.tokeniser.encode_plus(
            self.ocr_text[idx],
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',
            return_tensors='pt',
            return_attention_mask=True,
            truncation=True,
        )

        return {
            # as_tensor replaces the legacy FloatTensor constructor: it reuses an
            # existing float32 tensor and converts anything else explicitly.
            'img': torch.as_tensor(img, dtype=torch.float32),
            'input_ids': output_token_ids['input_ids'],
            'att_mask': output_token_ids['attention_mask'],
        }


def create_train_dataset(df, tokeniser, max_len=76):
    """Build a ``FlickrData`` dataset from a frame of (image, word) rows.

    Args:
        df: DataFrame with ``image_name`` and ``word`` columns.
        tokeniser: Tokenizer forwarded to ``FlickrData``.
        max_len: Token length forwarded to ``FlickrData`` (previously accepted
            but ignored).

    Returns:
        A ``FlickrData`` reading from ``./flickr30k_images/`` whose images are
        resized to 224x224 and converted to tensors.
    """
    image_pipeline = torchvision.transforms.Compose(
        [
            torchvision.transforms.ToPILImage(),
            torchvision.transforms.Resize((224, 224)),
            torchvision.transforms.ToTensor(),
        ]
    )
    return FlickrData(
        root_dir='./flickr30k_images/',
        img_names=df['image_name'].to_list(),
        ocr_text=df['word'].to_list(),
        tokeniser=tokeniser,
        transforms=image_pipeline,
        max_len=max_len,
    )

In [None]:
'''
  To create model based on CLIP 
'''
class MemotionModel(nn.Module):
    """Wrapper around a ``CLIPModel`` built from a default ``CLIPConfig``.

    Args:
        scratch: When truthy every CLIP parameter is trainable; otherwise every
            parameter is frozen.
    """

    def __init__(self, scratch=True):
        super().__init__()
        self.pre_model = CLIPModel(CLIPConfig()).to(device)
        self.scratch = scratch

        # NOTE(review): the model is created from a config only (no
        # from_pretrained call here), so scratch=False appears to freeze
        # weights that were never trained -- confirm that is intended.
        trainable = bool(scratch)
        for param in self.pre_model.parameters():
            param.requires_grad = trainable

    def forward(self, x, input_ids, att_mask):
        """Return ``(image_embedding, text_embedding)`` for a batch.

        ``input_ids`` and ``att_mask`` have dimension 1 squeezed before being
        passed to the text encoder (presumably the extra axis produced by the
        per-sample tokenisation -- TODO confirm).
        """
        img_embed = self.pre_model.get_image_features(x)
        flat_ids = input_ids.squeeze(1)
        flat_mask = att_mask.squeeze(1)
        text_embed = self.pre_model.get_text_features(flat_ids, attention_mask=flat_mask)
        return img_embed, text_embed
        

def calc_loss(image_embeddings, text_embeddings, temperature=1.0):
    """Symmetric soft-target contrastive loss between image and text embeddings.

    Args:
        image_embeddings: Image embedding matrix (one row per sample).
        text_embeddings: Text embedding matrix (one row per sample).
        temperature: Divides the logits and multiplies the target similarities.

    Returns:
        Tuple ``(logits, targets, loss)`` where ``loss`` is the mean over the
        batch of the averaged text-side and image-side cross-entropies.
    """
    # Text-vs-image scores.
    logits = (text_embeddings @ image_embeddings.T) / temperature

    # Soft targets: softmax over the average of the two within-modality
    # similarity matrices.
    img_sim = image_embeddings @ image_embeddings.T
    txt_sim = text_embeddings @ text_embeddings.T
    targets = F.softmax((img_sim + txt_sim) / 2 * temperature, dim=-1)

    # Score each direction separately, then average per sample.
    per_text = cross_entropy(logits, targets, reduction='none')
    per_image = cross_entropy(logits.T, targets.T, reduction='none')
    per_sample = (per_image + per_text) / 2.0  # shape: (batch_size)

    return logits, targets, per_sample.mean()

def cross_entropy(preds, targets, reduction='none'):
    """Cross-entropy between logits and soft (weighted) targets.

    Args:
        preds: Logits; log-softmax is taken over the last dimension.
        targets: Target weights multiplied element-wise with the
            log-probabilities (expected to match ``preds`` in shape).
        reduction: ``"none"`` returns one loss per row, ``"mean"`` returns
            their average.

    Returns:
        The per-row losses, or their mean when ``reduction == "mean"``.

    Raises:
        ValueError: If ``reduction`` is neither ``"none"`` nor ``"mean"``
            (previously this case silently returned ``None``).
    """
    if reduction not in ("none", "mean"):
        raise ValueError(
            f"reduction must be 'none' or 'mean', got {reduction!r}"
        )
    loss = (-targets * preds.log_softmax(dim=-1)).sum(1)
    if reduction == "mean":
        return loss.mean()
    return loss

# Training

In [None]:
# Train / validation split (80 / 20), made per image rather than per row.
# Each image contributes many (image, word) rows, so splitting the rows
# directly would place the same image in both partitions.
SPLIT_SEED = 42  # fixed so the partition is reproducible across runs

train_images, val_images = train_test_split(
    flickr_df['image_name'].unique(), test_size=0.2, random_state=SPLIT_SEED
)

# Boolean filtering keeps the frame's original row order, so reshuffle the rows
# (train_test_split on the rows used to do this) to keep batches mixed.
fli_df_train = flickr_df[flickr_df['image_name'].isin(train_images)].sample(
    frac=1, random_state=SPLIT_SEED
)
fli_df_val = flickr_df[flickr_df['image_name'].isin(val_images)].sample(
    frac=1, random_state=SPLIT_SEED
)

In [None]:
fli_df_train.shape, fli_df_val.shape

((868548, 2), (217137, 2))

In [None]:
# Create dataset

fli_train_dataset = create_train_dataset(fli_df_train,tokeniser)
fli_val_dataset = create_train_dataset(fli_df_val,tokeniser)

In [None]:
'''
  Loaders for training 
'''
# Batch size shared by the training and validation loaders.
BATCH_SIZE = 150

# The loss is computed across the samples of a batch, so the training pairs
# are reshuffled every epoch; otherwise each epoch replays identical batches.
fli_train_loader = DataLoader(fli_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Validation keeps a fixed order so its loss is comparable between epochs.
fli_val_loader = DataLoader(fli_val_dataset, batch_size=BATCH_SIZE)

In [None]:
# Sanity check: pull a single batch and show which fields the dataset yields.
first_batch = next(iter(fli_train_loader), None)
if first_batch is not None:
    print(first_batch.keys())

dict_keys(['img', 'input_ids', 'att_mask'])


In [None]:
def train_epoch(model, train_loader, optimizer, lr_scheduler, step):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Called as ``model(img, input_ids, att_mask)``; returns the
            image and text embeddings.
        train_loader: Iterable of dict batches with ``img``, ``input_ids`` and
            ``att_mask`` entries.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Stepped once per batch only when ``step == "batch"``.
        step: Scheduler granularity flag.

    Returns:
        An ``AvgMeter`` holding the sample-weighted mean training loss.
    """
    meter = AvgMeter()
    progress = tqdm(train_loader, total=len(train_loader))
    step_every_batch = step == "batch"

    for batch in progress:
        # Move every tensor of the batch onto the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()

        img_emb, txt_emb = model(batch['img'], batch['input_ids'], batch['att_mask'])
        _logits, _targets, loss = calc_loss(img_emb, txt_emb)

        loss.backward()
        optimizer.step()
        if step_every_batch:
            lr_scheduler.step()

        # Weight the running mean by the number of samples in this batch.
        meter.update(loss.item(), batch["img"].size(0))

        progress.set_postfix(train_loss=meter.avg, lr=get_lr(optimizer))
    return meter


def valid_epoch(model, valid_loader):
    """Compute the sample-weighted mean loss over ``valid_loader``.

    Gradients are disabled inside the function, so it is safe to call with or
    without an outer ``torch.no_grad()`` block. The model's train/eval mode is
    not changed here; the caller is responsible for ``model.eval()``.

    Args:
        model: Called as ``model(img, input_ids, att_mask)``; returns the
            image and text embeddings.
        valid_loader: Iterable of dict batches with ``img``, ``input_ids`` and
            ``att_mask`` entries.

    Returns:
        An ``AvgMeter`` holding the mean validation loss.
    """
    loss_meter = AvgMeter()
    tqdm_object = tqdm(valid_loader, total=len(valid_loader))
    # Evaluation never backpropagates; switching autograd off here avoids
    # building graphs when a caller forgets its own no_grad() wrapper.
    with torch.no_grad():
        for batch in tqdm_object:
            batch = {k: v.to(device) for k, v in batch.items()}
            image_embeddings, text_embeddings = model(batch['img'], batch['input_ids'], batch['att_mask'])
            logits, targets, loss = calc_loss(image_embeddings, text_embeddings)

            count = batch["img"].size(0)
            loss_meter.update(loss.item(), count)

            tqdm_object.set_postfix(valid_loss=loss_meter.avg)

    return loss_meter

In [None]:
# Training hyper-parameters.
head_lr = 1e-3  # not referenced by the cells visible in this notebook section
image_encoder_lr = 1e-4  # learning rate applied to the single optimizer parameter group
text_encoder_lr = 1e-5  # not referenced by the cells visible in this notebook section
weight_decay = 1e-3  # AdamW weight decay
patience = 1  # ReduceLROnPlateau: epochs without improvement before the LR is reduced
factor = 0.8  # ReduceLROnPlateau: multiplier applied to the LR on each reduction
epochs = 5  # number of passes of the training loop

In [None]:
import itertools

model = MemotionModel().to(device)
# A single parameter group: every model parameter trains at image_encoder_lr.
params = [
    {"params": model.parameters(), "lr": image_encoder_lr},
]
optimizer = torch.optim.AdamW(params, weight_decay=weight_decay)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", patience=patience, factor=factor
)
# "epoch": train_epoch leaves the scheduler alone and the training loop steps
# it once per epoch on the validation loss.
step = "epoch"

# Start with no best score so the first completed epoch always writes a
# checkpoint. A hard-coded threshold (2.0959 here before) only makes sense when
# resuming from a checkpoint that achieved it; for a fresh model it can stay
# below every validation loss and prevent any checkpoint from being saved.
best_loss = float("inf")

In [None]:
# model.load_state_dict(torch.load('/content/drive/MyDrive/Wipro/Implementation/models/clip_scratch_flickr_words_basic_model.pt'))

In [None]:
# Per-epoch loss history (one entry per completed epoch), e.g. for plotting.
training_loss = []
validation_loss = []
for epoch in range(epochs):
    print(f"Epoch: {epoch + 1}")
    model.train()

    train_loss = train_epoch(model, fli_train_loader, optimizer, lr_scheduler, step)
    training_loss.append(train_loss.avg)
    print(f'train_loss: {train_loss}')

    model.eval()
    with torch.no_grad():
        valid_loss = valid_epoch(model, fli_val_loader)
    validation_loss.append(valid_loss.avg)
    print(f'valid_loss: {valid_loss}')

    # Keep only the weights with the lowest validation loss seen so far.
    if valid_loss.avg < best_loss:
        best_loss = valid_loss.avg
        torch.save(model.state_dict(), "/content/drive/MyDrive/Wipro/Final/Base Models/models/clip_scratch_flickr_words_basic_model.pt")
        print("Saved Best Model!")

    # ReduceLROnPlateau is driven by the epoch's validation loss.
    lr_scheduler.step(valid_loss.avg)

Epoch: 1


  0%|          | 0/5791 [00:00<?, ?it/s]

train_loss: Metric: 2.6754


  0%|          | 0/1448 [00:00<?, ?it/s]

valid_loss: Metric: 2.5514
Epoch: 2


  0%|          | 0/5791 [00:00<?, ?it/s]

train_loss: Metric: 2.5406


  0%|          | 0/1448 [00:00<?, ?it/s]

valid_loss: Metric: 2.5179
Epoch: 3


  0%|          | 0/5791 [00:00<?, ?it/s]

train_loss: Metric: 2.5177


  0%|          | 0/1448 [00:00<?, ?it/s]

valid_loss: Metric: 2.5130
Epoch: 4


  0%|          | 0/5791 [00:00<?, ?it/s]

train_loss: Metric: 2.5408


  0%|          | 0/1448 [00:00<?, ?it/s]