In [1]:
!pip install easy-vqa

Collecting easy-vqa
  Downloading easy_vqa-1.0-py3-none-any.whl (3.2 MB)
Installing collected packages: easy-vqa
Successfully installed easy-vqa-1.0


### Preparing the Data

In [8]:
from easy_vqa import get_train_questions, get_test_questions
import pandas as pd
import numpy as np

In [9]:
# Each loader returns three parallel sequences: questions, answers and image ids.
train_questions, train_answers, train_image_ids = get_train_questions()
test_questions, test_answers, test_image_ids = get_test_questions()

In [11]:
def make_dataframes(questions,answers,image_names,type):
    """Assemble one split of the dataset into a DataFrame.

    The three input sequences are walked in lockstep (stopping at the shortest
    one) and every triple becomes one row.

    Args:
        questions: Iterable of question texts.
        answers: Iterable of answers, aligned with ``questions``.
        image_names: Iterable of image identifiers, aligned with ``questions``.
        type: Split name inserted into the image path (e.g. ``"train"``).
            Note that it shadows the builtin ``type`` inside this function.

    Returns:
        pd.DataFrame with the columns ``question``, ``answer`` and
        ``image_path`` (``./data/<type>/images/<image_name>.png``).
    """
    # Materialise once so the inputs are consumed a single time even if they are iterators.
    triples = list(zip(questions, answers, image_names))

    return pd.DataFrame({
        "question": [question for question, _, _ in triples],
        "answer": [answer for _, answer, _ in triples],
        "image_path": [f"./data/{type}/images/{image_name}.png" for _, _, image_name in triples],
    })

In [12]:
# Flatten both splits into tables; the last argument picks the ./data/<split>/images folder.
train_df = make_dataframes(train_questions, train_answers, train_image_ids,"train")
test_df = make_dataframes(test_questions, test_answers, test_image_ids,"test")

In [13]:
train_df.head()

Unnamed: 0,question,answer,image_path
0,what is the blue shape?,rectangle,./data/train/images/0.png
1,what color is the shape?,blue,./data/train/images/0.png
2,does the image contain a rectangle?,yes,./data/train/images/0.png
3,is there a triangle in the image?,no,./data/train/images/0.png
4,is there a black shape?,no,./data/train/images/0.png


In [14]:
# The row index is written as the first CSV column; the training section reads
# these files back with index_col=0 to restore it.
train_df.to_csv("train.csv")
test_df.to_csv("test.csv")

#### Importing Libraries

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModel

#### Config

In [2]:
# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(device)

cuda


In [3]:
### Loading Transformers

## Language Model
# Tokenizer and encoder are loaded from the same checkpoint name.
LANGUAGE_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(LANGUAGE_CHECKPOINT)
language_model = AutoModel.from_pretrained(LANGUAGE_CHECKPOINT)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
## Vision Model
# Image preprocessor and encoder are loaded from the same checkpoint name.
VISION_CHECKPOINT = "google/vit-base-patch16-224-in21k"
feature_extractor = AutoFeatureExtractor.from_pretrained(VISION_CHECKPOINT)
vision_model = AutoModel.from_pretrained(VISION_CHECKPOINT)

In [5]:
# Both pretrained encoders are frozen: none of their weights will receive gradients.
for backbone in (language_model, vision_model):
    for weight in backbone.parameters():
        weight.requires_grad = False

In [6]:
# Place both encoders on the selected device (nn.Module.to works in place).
for backbone in (language_model, vision_model):
    backbone.to(device)
print(f"Model move to {device}")

Model move to cuda


In [7]:
# Answer vocabulary: every answer string maps to a unique class index.
# The classifier's output units are indexed by these ids, so keep them stable.
label2idx = {'black': 6,
 'blue': 11,
 'brown': 10,
 'circle': 0,
 'gray': 3,
 'green': 1,
 'no': 12,
 'rectangle': 7,
 'red': 2,
 'teal': 5,
 'triangle': 9,
 'yellow': 8,
 'yes': 4}

# Derived from the mapping so the class count cannot drift out of sync with it.
num_labels = len(label2idx)

In [8]:
# Samples per mini-batch; used by both the train and the test DataLoader.
batch_size = 32

### Preparing Dataloaders

In [9]:
from torch.utils.data import Dataset,DataLoader,RandomSampler
from PIL import Image

In [10]:
# index_col=0 turns the first CSV column (the saved row index) back into the index.
train_df = pd.read_csv("train.csv",index_col=0)
test_df = pd.read_csv("test.csv",index_col=0)

In [11]:
train_df.head()

Unnamed: 0,question,answer,image_path
0,what is the blue shape?,rectangle,./data/train/images/0.png
1,what color is the shape?,blue,./data/train/images/0.png
2,does the image contain a rectangle?,yes,./data/train/images/0.png
3,is there a triangle in the image?,no,./data/train/images/0.png
4,is there a black shape?,no,./data/train/images/0.png


In [12]:
class VQADataset(Dataset):
    """Dataset that turns (question, image) rows into encoder embeddings.

    Every item is computed on the fly: the question is tokenized and passed
    through the language model, the image is preprocessed and passed through
    the vision model, and the two pooled outputs are returned as flat CPU
    tensors together with the encoded answer label. No gradients are tracked
    for the encoder passes.

    Args:
        df: Frame with ``question``, ``answer`` and ``image_path`` columns.
        tokenizer: Callable turning a question string into model inputs
            (invoked with ``return_tensors="pt"``).
        feature_extractor: Callable turning a PIL image into model inputs
            (invoked with ``return_tensors="pt"``).
        lm: Language model whose output exposes ``pooler_output``.
        vm: Vision model whose output exposes ``pooler_output``.
        label2idx: Mapping from answer string to integer class id.
    """

    def __init__(self,df,tokenizer,feature_extractor,lm,vm,label2idx):

        self.data = df
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.lm = lm
        self.vm = vm
        self.label2idx= label2idx

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    @staticmethod
    def _device_of(module):
        """Return the device of ``module``'s first parameter (CPU if it has none)."""
        first_param = next(module.parameters(), None)
        return first_param.device if first_param is not None else torch.device("cpu")

    def __getitem__(self,idx):
        """Return the embeddings and label for row ``idx``.

        Returns:
            dict with ``image_embeddings`` and ``text_embeddings`` (flattened
            pooled outputs on the CPU) and ``label`` (scalar ``torch.long``).

        Raises:
            KeyError: If the row's answer is not present in ``label2idx``.
        """
        df_row = self.data.iloc[idx]

        # Resolve the label first so an unknown answer fails fast with a clear
        # message instead of surfacing later as an opaque tensor-construction error.
        answer = df_row['answer'].strip()
        if answer not in self.label2idx:
            raise KeyError(f"answer {answer!r} (row {idx}) is missing from label2idx")
        label = torch.tensor(self.label2idx[answer],dtype =torch.long)

        # convert() returns an independent in-memory image, so the file handle
        # can be closed as soon as the conversion is done.
        with Image.open(df_row['image_path']) as raw_image:
            image = raw_image.convert("RGB")

        text_input = self.tokenizer(df_row['question'],return_tensors="pt")
        image_input = self.feature_extractor(image,return_tensors = "pt")

        # Send each input to wherever its encoder lives rather than relying on
        # a notebook-level global.
        lm_device = self._device_of(self.lm)
        vm_device = self._device_of(self.vm)
        text_input = {k:v.to(lm_device) for k,v in text_input.items()}
        image_input = {k:v.to(vm_device) for k,v in image_input.items()}

        # The encoders only produce features here; skip autograd bookkeeping.
        with torch.no_grad():
            text_output = self.lm(**text_input)
            image_output = self.vm(**image_input)

        text_tensors = text_output.pooler_output.view(-1).cpu()
        image_tensors = image_output.pooler_output.view(-1).cpu()

        return {"image_embeddings":image_tensors,"text_embeddings":text_tensors,"label":label}
                
                

In [13]:
# Both splits share the same tokenizer, preprocessor, encoders and label mapping.
shared_components = (tokenizer, feature_extractor, language_model, vision_model, label2idx)
train_dataset = VQADataset(train_df, *shared_components)
test_dataset = VQADataset(test_df, *shared_components)

In [14]:
# Smoke test: fetch a single sample (runs both encoders once) before wiring up the loaders.
out = train_dataset[0]

In [15]:
out['image_embeddings'].size(),out['text_embeddings'].size(),out['label'].size()

(torch.Size([768]), torch.Size([768]), torch.Size([]))

### Developing the Model

In [16]:
import math

In [17]:
class VQAModel(nn.Module):
    """Answer classifier over fused image/text embeddings.

    Both embeddings are L2-normalised per sample and multiplied element-wise.
    The product is projected by a learned square matrix followed by ReLU,
    reduced by a linear layer with batch normalisation and dropout, and
    finally mapped to ``num_labels`` logits.

    Args:
        num_labels: Number of answer classes (width of the output layer).
        embedding_dim: Width of each input embedding. Defaults to 768.
        hidden_dim: Width of the bottleneck layer. Defaults to 256.
        dropout: Dropout probability applied before the output layer.
            Defaults to 0.3.
    """

    def __init__(self,num_labels,embedding_dim=768,hidden_dim=256,dropout=0.3):

        super().__init__()

        self.num_labels = num_labels
        self.fc1 = nn.Linear(embedding_dim,hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)

        self.relu = nn.ReLU()
        self.dropout=nn.Dropout(dropout)

        self.final_layer = nn.Linear(hidden_dim,num_labels)

        # The attribute name `parameter` is part of the state_dict, so it is kept
        # unchanged to stay loadable from existing checkpoints. torch.empty only
        # allocates; the values are set by the initialiser on the next line.
        self.parameter = nn.Parameter(torch.empty(embedding_dim,embedding_dim))
        nn.init.kaiming_uniform_(self.parameter, a=math.sqrt(5))

    def forward(self,image_embeddings,text_embeddings):
        """Compute class logits for a batch.

        Args:
            image_embeddings: 2-D tensor ``(batch, embedding_dim)``.
            text_embeddings: 2-D tensor ``(batch, embedding_dim)``.

        Returns:
            Tensor of shape ``(batch, num_labels)`` holding unnormalised logits.
        """
        # Unit-normalise each sample's embedding (dim=1 is the feature axis).
        image_unit = torch.nn.functional.normalize(image_embeddings, dim=1)
        text_unit = torch.nn.functional.normalize(text_embeddings, dim=1)

        # Element-wise product fuses the two modalities feature by feature.
        fused = image_unit * text_unit

        weighted = self.relu(torch.mm(fused,self.parameter.t()))

        # NOTE(review): there is no non-linearity between fc1/bn1 and final_layer,
        # so in eval mode these layers compose into one affine map; confirm this
        # is intended.
        hidden = self.bn1(self.fc1(weighted))
        hidden = self.dropout(hidden)

        return self.final_layer(hidden)
        
        
        
        

In [18]:
# Build the classifier with one output unit per answer label.
model = VQAModel(num_labels=num_labels)

In [19]:
# nn.Module.to moves the parameters in place, so the result need not be reassigned.
model.to(device)
print(f"Model moved to {device}")

Model moved to cuda


In [20]:
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [21]:

def training_step(model,epochs,training_dataloader,testing_dataloader):
    """Train ``model`` and evaluate it on the test loader after every epoch.

    Uses cross-entropy loss and Adam (lr=5e-5). After each epoch the mean
    train loss, mean test loss and test accuracy are printed, and the model's
    ``state_dict`` is written to ``./best.pt`` whenever the test accuracy
    exceeds the best value seen so far.

    Args:
        model: Module called as ``model(image_embeddings=..., text_embeddings=...)``
            that returns logits with the class axis last.
        epochs: Number of passes over ``training_dataloader``.
        training_dataloader: Iterable of dict batches with the tensor entries
            ``image_embeddings``, ``text_embeddings`` and ``label``.
        testing_dataloader: Iterable of batches in the same format.

    Returns:
        dict with ``"train_loss"`` and ``"test_loss"``: lists holding the mean
        batch loss of every epoch.
    """
    criterion =  nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),lr=5e-5)

    # Follow the model's own placement instead of a notebook-level global.
    model_device = next(model.parameters()).device

    best_test_accuracy = 0
    train_loss = []
    testing_loss = []

    for epoch in range(epochs):

        # ---- training pass ----
        model.train()
        loss_per_batch = []
        for batch in tqdm(training_dataloader,desc = "training"):
            batch = {k:v.to(model_device) for k,v in batch.items()}
            output = model(image_embeddings=batch['image_embeddings'],
                           text_embeddings=batch['text_embeddings'])

            # The class count is read from the logits rather than from a global.
            loss = criterion(output.view(-1,output.size(-1)),batch['label'].view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_per_batch.append(loss.item())

        train_loss.append(np.mean(loss_per_batch))

        # ---- evaluation pass ----
        model.eval()
        loss_per_batch = []
        testing_output = []
        true_output = []
        # No parameter updates happen here, so autograd bookkeeping is skipped.
        with torch.no_grad():
            for batch in tqdm(testing_dataloader,desc = "testing"):
                batch = {k:v.to(model_device) for k,v in batch.items()}
                output = model(image_embeddings=batch['image_embeddings'],
                               text_embeddings=batch['text_embeddings'])

                loss = criterion(output.view(-1,output.size(-1)),batch['label'].view(-1))
                loss_per_batch.append(loss.item())

                testing_output.extend(output.argmax(-1).cpu().tolist())
                true_output.extend(batch['label'].view(-1).cpu().tolist())

        testing_loss.append(np.mean(loss_per_batch))

        accuracy = accuracy_score(true_output,testing_output)

        print(f"Epoch {epoch}:- Training Loss : {train_loss[-1]}  Testing Loss: {testing_loss[-1]}  Testing Accuracy: {accuracy}")

        # Keep only the weights of the best-scoring epoch on disk.
        if accuracy > best_test_accuracy:
            best_test_accuracy = accuracy
            torch.save(model.state_dict(),"./best.pt")

    return {"train_loss":train_loss,"test_loss":testing_loss}
    

### Preparing for Training

In [22]:
# shuffle=True makes the DataLoader draw training samples through a RandomSampler.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation iterates the test set in its stored order.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [23]:
# Train for 10 epochs; `history` holds the per-epoch mean train and test losses.
history = training_step(model,10,train_dataloader,test_dataloader)

training: 100%|██████████| 1206/1206 [09:02<00:00,  2.22it/s]
testing: 100%|██████████| 303/303 [02:18<00:00,  2.19it/s]


Epoch 0:- Training Loss : 1.4915718832122746  Testing Loss: 0.7940468382914074  Testing Accuracy: 0.7384472242323995


training: 100%|██████████| 1206/1206 [09:13<00:00,  2.18it/s]
testing: 100%|██████████| 303/303 [02:18<00:00,  2.19it/s]


Epoch 1:- Training Loss : 0.739931606682972  Testing Loss: 0.5852659429260606  Testing Accuracy: 0.7909645404734829


training: 100%|██████████| 1206/1206 [08:58<00:00,  2.24it/s]
testing: 100%|██████████| 303/303 [02:05<00:00,  2.42it/s]


Epoch 2:- Training Loss : 0.5843897338579741  Testing Loss: 0.47143954323856746  Testing Accuracy: 0.8510286364106275


training: 100%|██████████| 1206/1206 [09:04<00:00,  2.21it/s]
testing: 100%|██████████| 303/303 [02:19<00:00,  2.18it/s]


Epoch 3:- Training Loss : 0.48729665969092256  Testing Loss: 0.4128781789206829  Testing Accuracy: 0.864571487646025


training: 100%|██████████| 1206/1206 [09:17<00:00,  2.16it/s]
testing: 100%|██████████| 303/303 [02:19<00:00,  2.17it/s]


Epoch 4:- Training Loss : 0.42435762575313224  Testing Loss: 0.37885108260628414  Testing Accuracy: 0.8671560012405666


training: 100%|██████████| 1206/1206 [09:25<00:00,  2.13it/s]
testing: 100%|██████████| 303/303 [02:22<00:00,  2.13it/s]


Epoch 5:- Training Loss : 0.39401178197876535  Testing Loss: 0.35253101577459783  Testing Accuracy: 0.8714979840793963


training: 100%|██████████| 1206/1206 [09:28<00:00,  2.12it/s]
testing: 100%|██████████| 303/303 [02:21<00:00,  2.14it/s]


Epoch 6:- Training Loss : 0.3698178409331572  Testing Loss: 0.3461252653067655  Testing Accuracy: 0.8733588338674662


training: 100%|██████████| 1206/1206 [09:29<00:00,  2.12it/s]
testing: 100%|██████████| 303/303 [02:22<00:00,  2.12it/s]


Epoch 7:- Training Loss : 0.3553437001657229  Testing Loss: 0.3386729648797819  Testing Accuracy: 0.8712912229918329


training: 100%|██████████| 1206/1206 [09:12<00:00,  2.18it/s]
testing: 100%|██████████| 303/303 [02:04<00:00,  2.43it/s]


Epoch 8:- Training Loss : 0.34332823967113224  Testing Loss: 0.325169440641655  Testing Accuracy: 0.8727385506047762


training: 100%|██████████| 1206/1206 [08:18<00:00,  2.42it/s]
testing: 100%|██████████| 303/303 [02:05<00:00,  2.41it/s]

Epoch 9:- Training Loss : 0.3336777048151489  Testing Loss: 0.3196681807733605  Testing Accuracy: 0.8781143388814225



