In [None]:
!pip install pycm

Collecting pycm
[?25l  Downloading https://files.pythonhosted.org/packages/76/c4/a05b90819dcf7e50bdddb030238a771bf87b0695e6c47a247f66ab7ed5d0/pycm-3.1-py2.py3-none-any.whl (63kB)
[K     |████████████████████████████████| 71kB 4.0MB/s 
[?25hCollecting art>=1.8
[?25l  Downloading https://files.pythonhosted.org/packages/83/37/c28d69ba1005889e8ff9633b8e7eccaaddc03549058a51ef5f51853cf183/art-5.1-py2.py3-none-any.whl (567kB)
[K     |████████████████████████████████| 573kB 17.3MB/s 
Installing collected packages: art, pycm
Successfully installed art-5.1 pycm-3.1


In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |▏                               | 10kB 22.4MB/s eta 0:00:01[K     |▎                               | 20kB 32.2MB/s eta 0:00:01[K     |▌                               | 30kB 39.7MB/s eta 0:00:01[K     |▋                               | 40kB 34.9MB/s eta 0:00:01[K     |▉                               | 51kB 13.7MB/s eta 0:00:01[K     |█                               | 61kB 12.8MB/s eta 0:00:01[K     |█▏                              | 71kB 13.0MB/s eta 0:00:01[K     |█▎                              | 81kB 14.0MB/s eta 0:00:01[K     |█▌                              | 92kB 14.5MB/s eta 0:00:01[K     |█▋                              | 102kB 10.7MB/s eta 0:00:01[K     |█▉                              | 112kB 10.7MB/s eta 0:00:01[K     |██                              | 

In [None]:
from torchvision import models
from pycm import *
from transformers import BertTokenizer, BertModel
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
import pickle
import sys
from glob import glob  
import math
import shutil
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data.dataset
import torch.utils.data.dataloader
import torchvision.transforms as visionTransforms
import PIL.Image as Image
from torchvision.transforms import ToTensor,ToPILImage

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Root folder of the project; it prefixes both the CSV location and every image path.
PROJECT_ROOT="path_to_add/MIDAS_Task3/"

# The first CSV column is used as the dataframe index.
df=pd.read_csv(PROJECT_ROOT+"Datasets/Task3_Cleaned_Multimodal.csv",index_col=0)
# Prepend the project root so that the stored image paths can be opened directly.
df['Image Path']=PROJECT_ROOT+df['Image Path']

In [None]:
# Shuffle once with a fixed seed, then cut the rows 80% / 10% / 10% into
# train / validation / test (same partition order as before).
# Positional slicing replaces np.split(DataFrame, ...), which routes through
# DataFrame.swapaxes and is deprecated in recent pandas releases.
shuffledDf=df.sample(frac=1, random_state=42)
trainEnd=int(.8 * len(df))
valEnd=int(.9 * len(df))
dfTrain=shuffledDf.iloc[:trainEnd].reset_index(drop=True)
dfVal=shuffledDf.iloc[trainEnd:valEnd].reset_index(drop=True)
dfTest=shuffledDf.iloc[valEnd:].reset_index(drop=True)

As visualized in MIDAS_Task3_Preprocessing_Visualization.ipynb, the dataset is noticeably imbalanced. To compensate, we use PyTorch's WeightedRandomSampler, which draws samples in the dataloader according to per-sample weights instead of uniformly at random. The weight assigned to each class is inversely proportional to the frequency of that class, so samples from high-frequency classes are less likely to be drawn, whereas samples from low-frequency classes are more likely to be drawn.

In [None]:
from torch.utils.data import WeightedRandomSampler

# Number of training rows per class, ordered by label value.
labelCounts=dfTrain['Label'].value_counts().sort_index()
freqLabels=torch.tensor(labelCounts.to_numpy(),dtype=torch.double)

# Inverse relative frequency: rare classes receive proportionally larger weights.
weightClass=freqLabels/freqLabels.sum()
weightClass= 1/weightClass
weightClass=(weightClass).tolist()

# Look weights up by the actual label value rather than by list position, so the
# mapping stays correct even if the labels present in the train split are not
# exactly 0..K-1 (e.g. a class is missing from the split or labels start at 1).
labelToWeight=dict(zip(labelCounts.index,weightClass))
sampleWeights=[labelToWeight[label] for label in dfTrain['Label']]

# One epoch draws len(dfTrain) samples (with replacement, the sampler's default).
trainSampler=WeightedRandomSampler(sampleWeights,len(dfTrain))

We use a combination of PreTrained BERT, CNNs and Transformer Encoder Blocks for the Text Modality and PreTrained VGG-13(With 3 tunable layers) for the Vision Modality as mentioned in the MIDAS_Task3_Detailed_Analysis.pdf

---


We override the PyTorch Dataset class. The FlipkartDataset class reads the product description, the product image and the respective label from the dataframe. The encode_plus function from the Transformers library is used to encode the given text sequence, truncating it if it exceeds the max_length argument and padding it if it is shorter than max_length. The encode_plus function also provides the attention mask for each sequence, which tells BERT which tokens are eligible for attention (i.e. real tokens rather than padding).

The thing that we tried over here was getting the embedding directly from the DataLoader i.e. having pre-trained BERT in the Dataset class itself. But training performance was extremely slow and would often lead to CUDA OOM errors.

In [None]:
from torch.utils.data import Dataset, DataLoader

class FlipkartDataset(Dataset):
  """Dataset that yields (image, BERT embedding, label) for one dataframe row.

  Args:
    dataframe: table holding one product per row. Columns are read by position:
      0 -> product description, 3 -> label, 4 -> image path
      (NOTE(review): positional schema - confirm against the CSV layout).
    preTrainedBert: model called with input_ids / attention_mask / token_type_ids
      whose output exposes `last_hidden_state`.
    bertTokenizer: tokenizer exposing `encode_plus`.
    maxLength: sequence length every description is padded / truncated to.
    vision_transform: callable applied to the loaded RGB image.
  """

  def __init__(self,dataframe,preTrainedBert,bertTokenizer,maxLength,vision_transform):
    self.data=dataframe
    self.bertTokenizer=bertTokenizer
    self.model=preTrainedBert 
    self.maxLength=maxLength
    self.vision_transform=vision_transform
  
  def __len__(self):
    """Return the number of rows in the underlying dataframe."""
    return len(self.data)

  def __getitem__(self,idx):
    """Load the idx-th sample.

    Returns:
      (image, embedding, label) where `image` is the output of `vision_transform`,
      `embedding` is the encoder's `last_hidden_state` for the tokenized
      description (leading dimension of 1 because of return_tensors='pt'),
      and `label` is the raw value stored in the dataframe.
    """
    # Per-sample values are kept in locals: storing them on `self` would share
    # mutable state between samples and between DataLoader workers.
    imgPath=str(self.data.iloc[idx,4])
    productDescription=str(self.data.iloc[idx,0])
    label=self.data.iloc[idx,3]

    # Force 3 channels so grayscale / RGBA / palette files do not break a
    # 3-channel Normalize; the context manager closes the file handle promptly.
    with Image.open(imgPath) as rawImage:
      image=rawImage.convert('RGB')
    image=self.vision_transform(image)

    encodedInput=self.bertTokenizer.encode_plus(text=productDescription,padding='max_length',truncation="longest_first",max_length=self.maxLength,return_tensors='pt',return_attention_mask=True,return_token_type_ids=True)
    # The encoder is only used as a fixed feature extractor here, so skip
    # autograd bookkeeping: this avoids holding a graph for every sample.
    with torch.no_grad():
      embedding=self.model(input_ids=encodedInput['input_ids'],attention_mask=encodedInput['attention_mask'],token_type_ids=encodedInput['token_type_ids']).last_hidden_state

    return image,embedding,label

In [None]:
from transformers import BertTokenizer, BertModel

# Tokenizer and encoder are loaded from the same checkpoint name so that the
# vocabulary and the weights belong together.
BERT_CHECKPOINT='bert-base-uncased'
tokenizer=BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model=BertModel.from_pretrained(BERT_CHECKPOINT)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [None]:
# Image pipeline: fixed 224x224 resize, conversion to a tensor, then per-channel
# normalisation with the standard ImageNet statistics used by torchvision's
# pretrained models.
IMAGE_SIZE=(224,224)
NORMALIZE_MEAN=[0.485, 0.456, 0.406]
NORMALIZE_STD=[0.229, 0.224, 0.225]

preprocess=visionTransforms.Compose([
  visionTransforms.Resize(IMAGE_SIZE),
  visionTransforms.ToTensor(),
  visionTransforms.Normalize(mean=NORMALIZE_MEAN,std=NORMALIZE_STD),
])

We experimented with a couple of maxLength values(128,256,512). But with 256 and 512, there was a considerable increase in the training time. One other reason to choose maxLength=128 was ~70% of the data has seq length of 0-50. Hampering the training performance just to accommodate the complete seq for 30% of the data seemed like an expensive trade-off. Hence the maxLength was taken as 128

In [None]:
# Sequence length every product description is padded / truncated to.
# NOTE(review): the accompanying narrative states that 128 was chosen, while the
# value actually passed here is 512 - confirm which one is intended.
MAX_SEQ_LENGTH=512

def _buildFlipkartDataset(frame):
  """Wrap one dataframe split in a FlipkartDataset sharing BERT, tokenizer and image transform."""
  return FlipkartDataset(dataframe=frame,preTrainedBert=model,bertTokenizer=tokenizer,maxLength=MAX_SEQ_LENGTH,vision_transform=preprocess)

flipkartTrainDataset=_buildFlipkartDataset(dfTrain)
flipkartTestDataset=_buildFlipkartDataset(dfTest)
flipkartValDataset=_buildFlipkartDataset(dfVal)

The Weighted Sampler is just used with the Train Set and not the Test and Validation sets because the model should be capable of recognizing any class at test time irrespective of the frequency of the class

In [None]:
# Mini-batch size shared by all three loaders.
BATCH_SIZE=8

# Training batches are drawn by the class-balancing weighted sampler.
trainLoader=torch.utils.data.DataLoader(flipkartTrainDataset,batch_size=BATCH_SIZE,sampler=trainSampler)
# Evaluation loaders iterate in dataframe order: shuffling does not change the
# metrics, but a fixed order keeps runs reproducible and lets predictions be
# lined up with the rows of dfTest / dfVal.
testLoader=torch.utils.data.DataLoader(flipkartTestDataset,batch_size=BATCH_SIZE,shuffle=False)
valLoader=torch.utils.data.DataLoader(flipkartValDataset,batch_size=BATCH_SIZE,shuffle=False)

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device=torch.device('cuda')
else:
  device=torch.device('cpu')
device

device(type='cpu')

In [None]:
class MultiModalNetwork(nn.Module):
  """Text + image classifier.

  Text branch: a Transformer encoder followed by four parallel convolutions
  (kernel heights 2, 3, 4 and 5 spanning the full embedding width), each
  max-pooled over the sequence axis. Image branch: a partially frozen VGG whose
  classifier is cut down to emit `imgFeatureDim` features. Both feature vectors
  are concatenated, passed through dropout and a single linear layer.

  Args:
    preTrainedVGG: VGG-style module exposing `features` and `classifier`;
      it is modified in place (see `freezeVGG`).
    textInChannels: input channels of the text convolutions.
    embeddingDimension: width of each token embedding (Transformer d_model).
    numHeads: attention heads per encoder layer.
    numEncoderLayers: number of stacked encoder layers.
    numClasses: size of the output logits.
    numKimFilters: output channels of each of the four text convolutions.
    imgFeatureDim: size of the image feature vector produced by the VGG head.
    numFrozenVGGParams: how many leading parameter tensors of
      `preTrainedVGG.features` are frozen.
  """

  def __init__(self,preTrainedVGG,textInChannels=1,embeddingDimension=768,numHeads=8,numEncoderLayers=3,numClasses=27,numKimFilters=100,imgFeatureDim=400,numFrozenVGGParams=14):
    super(MultiModalNetwork,self).__init__()
    self.textInChannels=textInChannels
    self.embDim=embeddingDimension
    self.numHeads=numHeads
    self.numEncoderLayers=numEncoderLayers
    self.numClasses=numClasses
    self.numKimFilters=numKimFilters
    self.imgFeatureDim=imgFeatureDim
    self.numFrozenVGGParams=numFrozenVGGParams

    self.vgg13=self.freezeVGG(originalPreTrainedVGG=preTrainedVGG)

    self.encoderLayer=nn.TransformerEncoderLayer(d_model=self.embDim,nhead=self.numHeads)
    self.encoderBlock=nn.TransformerEncoder(self.encoderLayer,num_layers=self.numEncoderLayers)
    # Each kernel covers the whole embedding width, so it slides over tokens only.
    self.kimConv0=nn.Conv2d(in_channels=self.textInChannels,out_channels=self.numKimFilters,kernel_size=(2,self.embDim))
    self.kimConv1=nn.Conv2d(in_channels=self.textInChannels,out_channels=self.numKimFilters,kernel_size=(3,self.embDim))
    self.kimConv2=nn.Conv2d(in_channels=self.textInChannels,out_channels=self.numKimFilters,kernel_size=(4,self.embDim))
    self.kimConv3=nn.Conv2d(in_channels=self.textInChannels,out_channels=self.numKimFilters,kernel_size=(5,self.embDim))
    self.dropoutLayer=nn.Dropout(p=0.5)
    # Four pooled conv outputs plus the image features (4*100+400=800 with the defaults).
    self.fc=nn.Linear(4*self.numKimFilters+self.imgFeatureDim,self.numClasses)

  def forward(self,textInput,imgInput):
    """Compute class logits for a batch.

    Args:
      textInput: token embeddings; assumed to be (batch, 1, seqLen, embDim) as
        produced by batching per-sample encoder outputs - TODO confirm. seqLen
        must be at least 5 for the largest convolution kernel.
      imgInput: image batch accepted by the wrapped VGG.

    Returns:
      Tensor of shape (batch, numClasses) with unnormalised logits.
    """
    textInput=textInput.squeeze(dim=1)
    # The encoder is built with the default batch_first=False, i.e. it expects
    # (seqLen, batch, embDim); swap the axes going in and coming out.
    textInput=textInput.transpose(1,0)
    encoderOutput=self.encoderBlock(textInput)
    encoderOutput=encoderOutput.transpose(1,0)
    
    # Re-insert a channel axis for the 2-D convolutions.
    kimInput=encoderOutput.unsqueeze(1)
    
    pooledFeatures=[]
    for conv in (self.kimConv0,self.kimConv1,self.kimConv2,self.kimConv3):
      # The kernel spans the full embedding width, so the last axis collapses to 1.
      featureMap=F.relu(conv(kimInput)).squeeze(3)
      # Max over every remaining sequence position -> one value per filter.
      pooled=F.max_pool1d(featureMap,featureMap.size(2))
      pooledFeatures.append(pooled.squeeze(dim=2))

    kimOutput=torch.cat(pooledFeatures,dim=1)

    imgOutput=self.vgg13(imgInput)

    combinedOutput=torch.cat((kimOutput,imgOutput),1)

    output=self.fc(self.dropoutLayer(combinedOutput))

    return output

  def freezeVGG(self,originalPreTrainedVGG):
    """Freeze the leading feature parameters of the VGG and shrink its classifier.

    The first `numFrozenVGGParams` parameter tensors of `features` stop receiving
    gradients. The classifier loses its last three children and its remaining
    final layer is replaced by a fresh Linear(4096, imgFeatureDim).

    Note: the passed-in module is modified in place and returned.
    """
    for position,param in enumerate(originalPreTrainedVGG.features.parameters()):
      if position<self.numFrozenVGGParams:
        param.requires_grad=False
    head=list(originalPreTrainedVGG.classifier.children())[:-3]
    head[3]=nn.Linear(4096,self.imgFeatureDim)
    originalPreTrainedVGG.classifier = nn.Sequential(*head)

    return originalPreTrainedVGG

Adam was used as the optimizer with a learning rate of $10^{-4}$, and Cross Entropy Loss was used as the loss function.

In [None]:
# Step size handed to Adam.
LEARNING_RATE=0.0001

# Pretrained VGG-13 backbone; MultiModalNetwork adapts it in place.
originalPreTrainedVGG=models.vgg13(pretrained=True)
# nn.Module.to moves the parameters and returns the same module, so the
# construction and the device transfer can be chained.
multiModalModel=MultiModalNetwork(preTrainedVGG=originalPreTrainedVGG).to(device)

# Cross-entropy on raw logits as the training loss.
softmaxLoss=nn.CrossEntropyLoss()
optimizer=optim.Adam(multiModalModel.parameters(),lr=LEARNING_RATE)

Downloading: "https://download.pytorch.org/models/vgg13-c768596a.pth" to /root/.cache/torch/hub/checkpoints/vgg13-c768596a.pth


HBox(children=(FloatProgress(value=0.0, max=532194478.0), HTML(value='')))


