In [0]:
#! git clone https://github.com/HarishMashetty/VisualQuestionAnswering.git

In [0]:
#!chmod 777 /content/VisualQuestionAnswering/vqa-winner-cvprw-2017/scripts/download_extract.sh

In [0]:
#!/content/VisualQuestionAnswering/vqa-winner-cvprw-2017/scripts/download_extract.sh

In [0]:
#!python /content/VisualQuestionAnswering/vqa-winner-cvprw-2017/scripts/preproc.py

In [0]:

#!/content/VisualQuestionAnswering/vqa-winner-cvprw-2017/scripts/download_extract.sh

**Check resources available on Colab GPU**

In [0]:
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Gen RAM Free: 12.9 GB  | Proc size: 142.8 MB
GPU RAM Free: 11441MB | Used: 0MB | Util   0% | Total 11441MB


In [0]:
#from google.colab import drive
#drive.mount('/content/drive')

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import numpy as np
import pickle
from sklearn.externals import joblib

device = torch.device('cuda')

if torch.cuda.is_available():
  print ("CUDA is available")
else:
  raise Exception('CUDA not found')

torch.cuda.manual_seed(0)
glove_embedding_dimension=300


# Data loader settings
pin_memory =True
n_workers = 4
train_batch_size=256

#Model architecture settings
hidden_dimension=512

# Training settings
max_epochs =10


globe_pretrained_file="/content/data/glove_pretrained_300.npy"


CUDA is available


# Run Model

In [0]:
# load dictionary files

# dictionary of all question word tokens, both training and validation. example idx2word = ['does', 'the', 'guy', 'have', 'a', 'tattoo' ...]
idx2word, word2idx = pickle.load(open('/content/data/dict_q.pkl', 'rb') )

# dictionary of all answers, both training and validation. example idx2ans = ['blue','cell phone','blue and white'....]
idx2ans, ans2idx = pickle.load(open('/content/data/dict_ans.pkl', 'rb') )

vocab_size=len(idx2word)
n_classes=len(idx2ans)


# load preprocessed training files
question_answers = pickle.load(open('/content/data/train_qa.pkl', 'rb') )
visual_features = joblib.load(open('/content/data/train_vfeats.pkl', 'rb') ) 



In [0]:
n_classes


    

475

**Create Dataset class , which feeds data to Data loader**

In [0]:
import torch
from torch.utils import data
from torch.utils.data import Dataset

class vqa_dataset(data.Dataset):

  def __init__(self, question_answers, visual_features,vocab_size,word2idx,n_classes):
    self.question_answers = question_answers
    self.visual_features = visual_features
    self.data_list = []
    
    max_words=14
    
    for q in question_answers:

      # build question vector
      # question tokened = ['does', 'the', 'guy', 'have', 'a', 'tattoo']
      # question vector, dictionaty index, which is used to get glove embeddings using nn.Embedding as a lookup table
      # question vector = [ 7,  8, 38, 39,  16, 23,  0,  0,  0,  0,  0,  0,  0,  0] , indices corresponding to
      #                   the words in dictionary, using these indices we do a lookup in pretrained golve dict
  
      question_vector = np.zeros(max_words, dtype=np.int64)

      for i, question_token in enumerate(q['question_toked']):
          if question_token in word2idx and i < 14:
            question_vector[i] = word2idx[question_token]

      # build answer vector
      # answer answer tuple input = ('yes', 1)
      # answer vector = 

      answer_vector = np.zeros(n_classes, dtype=np.float32)
      for answer, score in q['answer']:
        answer_vector[ans2idx[answer]] = score

      if q['image_id'] in visual_features:
        self.data_list.append({ 'image': visual_features[q['image_id']], 'question': question_vector,'answer': answer_vector})
        

  def __len__(self):
    'Denotes the total number of samples'
    return len(self.question_answers)

  def __getitem__(self, index):
    'Generates one sample of data'
    # Select sample
    selected = self.data_list[index]
    
    return selected['image'], selected['question'], selected['answer']

In [0]:
training_set = vqa_dataset(question_answers, visual_features,vocab_size,word2idx,n_classes)
v,q,a=training_set.__getitem__(0)


a.shape

(475,)

In [0]:
class VQA_Model(nn.Module):


  # constructor
  def __init__(self, vocab_size, globe_pretrained_file, hidden_dimension, num_classes,embedding_dimension):
    
    super(VQA_Model, self).__init__()    
    
    self.hidden_dimension=hidden_dimension
    
    # Question embedding
    self.nn_emedding = nn.Embedding(vocab_size + 1,embedding_dimension )
    self.gru = nn.GRU(embedding_dimension, hidden_dimension)

    
    # treat nn.Embedding as a lookup table where the key is the word index and # the value is the corresponding word vector.
    #Specify the #size of the lookup table, and initialize the word vectors yourself.


    # vocab size + 1
    pretrained_glove_weights = np.zeros((vocab_size + 1, embedding_dimension), dtype=np.float32)

    # last row of the ndarray is kept 0
    pretrained_glove_weights[:vocab_size] = np.load(globe_pretrained_file)

    # copy weights to nn_embedding, last row of the weights nn_emedding.weight is 0 vector
    self.nn_emedding.weight.data.copy_(torch.from_numpy(pretrained_glove_weights))
    

    self.linear_layer_scalar_attention = nn.Linear(hidden_dimension, 1)

    # classifier

    self.classifier_linear = nn.Linear(hidden_dimension, num_classes)
    #self.clf_dropout = nn.Dropout(0.5, inplace=True)

    # Linear layers for, non linear activation

    # gated tanh for top down attention
    # to calculate fa([vi; q])
    self.linear_layer_fa_s = nn.Linear(2048 + hidden_dimension, hidden_dimension)
    self.linear_layer_fa_t = nn.Linear(2048 + hidden_dimension, hidden_dimension)
    

    
    # to calculate fq(q)
    self.linear_layer_q_s = nn.Linear(hidden_dimension, hidden_dimension)
    self.linear_layer_q_t = nn.Linear(hidden_dimension, hidden_dimension)
    
    # to calculate fv(^v)
    self.linear_layer_v_hat_s = nn.Linear(2048, hidden_dimension)
    self.linear_layer_v_hat_t = nn.Linear(2048, hidden_dimension)
    
    # to calculate fo(h)
    self.linear_layer_o_s = nn.Linear(hidden_dimension, hidden_dimension)
    self.linear_layer_o_t = nn.Linear(hidden_dimension, hidden_dimension)
    

  def forward(self, image, question):
    
    # Shape (batch_size, 14 question words, 300 size embedding for each word)
    question_embedding = self.nn_emedding(question)  
    
    # gru encoding shape (14 , batch_size, hidden dimension)
    gru_encoding, gru_hidden = self.gru(question_embedding.permute(1, 0, 2))
    
    # question encoding (batch_size, hidden dimension)
    question_encoding = gru_encoding[-1]
    question_reshape = question_encoding.repeat(1, 36).view(-1, 36, self.hidden_dimension)

    
    # Implement a-i (scalar attention)  = wa * fa ( concatenation of [v-i; q] )
    image = F.normalize(image, -1)
    
    #torch.cat(tensors, dim) , dim = dimension over which the tensors are concatenated
    concatenation= torch.cat((image,question_reshape),-1)
    
    concatenation = torch.mul(torch.tanh(self.linear_layer_fa_s(concatenation)), torch.sigmoid(self.linear_layer_fa_s(concatenation)))
    
    
    scalar_attention = self.linear_layer_scalar_attention(concatenation)    
    
    # alpha = softmax (a)
    
    scalar_attention_softmax = F.softmax(scalar_attention.squeeze(), dim=1)
        
    # v hat (attended image) = comvex combination of (alpha-i * v-i)

    attended_image = torch.bmm(scalar_attention_softmax.unsqueeze(1), image).squeeze()

    # Multimodal fusion h = fq(q) * fv(v hat)
    
    f_v_hat  = torch.mul(torch.tanh(self.linear_layer_v_hat_t(attended_image)), torch.sigmoid(self.linear_layer_v_hat_s(attended_image)))
    f_q  = torch.mul(torch.tanh(self.linear_layer_q_t(question_encoding)), torch.sigmoid(self.linear_layer_q_s(question_encoding)))

    multimodal_fusion =  torch.mul(f_q, f_v_hat)
    
    # s = sigmoid (wo * fo(h) )
    s = self.classifier_linear(torch.mul(torch.tanh(self.linear_layer_o_t(multimodal_fusion)), torch.sigmoid(self.linear_layer_o_s(multimodal_fusion))))
    
    return s

    
    



In [0]:
print ("Question Vocabulary size:" + str(vocab_size))
print ("hidden layer dimension:" + str(hidden_dimension))
print ("number of classes for classification:" + str(n_classes))

model = VQA_Model(vocab_size, globe_pretrained_file, hidden_dimension, n_classes,glove_embedding_dimension)
model = nn.DataParallel(model).to(device)

Question Vocabulary size:7069
hidden layer dimension:512
number of classes for classification:475


In [0]:
input = torch.randn(3, requires_grad=True)
target = torch.empty(3).random_(2)
loss = F.binary_cross_entropy_with_logits(input, target)
print (loss)

tensor(1.0193, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)


tensor([3, 1, 3])


In [0]:
# Generators
training_set = vqa_dataset(question_answers, visual_features,vocab_size,word2idx,n_classes)
training_generator = data.DataLoader(training_set,train_batch_size, shuffle=True, num_workers =n_workers, pin_memory=pin_memory)



Question Vocabulary size:7069
hidden layer dimension:512
number of classes for classification:475


In [0]:
def compute_score(logits, labels):
    logits = torch.max(logits, 1)[1].data
    if torch.cuda.is_available():
        one_hots = torch.zeros(*labels.size()).cuda()
    else:
        one_hots = torch.zeros(*labels.size())
    one_hots.scatter_(1, logits.view(-1, 1), 1)
    score = (one_hots * labels)
    return score.cpu().numpy().sum() / logits.shape[0]

In [0]:
optim = torch.optim.Adamax(model.parameters())

# Loop over epochs
for epoch in range(max_epochs):
  for local_image, local_question,local_answer in training_generator:
        # Transfer to GPU
        local_image, local_question,local_answer = local_image.to(device), local_question.to(device),local_answer.to(device)
        
        logits = model(local_image, local_question)
        
        #print (model)
        #print (epoch,len(local_answer))
        
        #print (logits)
        
        loss = F.binary_cross_entropy_with_logits(logits, local_answer) * local_answer.size(1)


        optim.zero_grad()
        loss.backward()
        #nn.utils.clip_grad_norm_(model.parameters(), 0.25)
        optim.step()

        score = compute_score(logits, local_answer)
  print ("Epoch : " + str(1) + ", score :" + str (score) )
        
        


Epoch : epoch, score :0.31286550823010895
Epoch : epoch, score :0.35526312443248015
Epoch : epoch, score :0.40935670283802766
Epoch : epoch, score :0.42543859649122806
Epoch : epoch, score :0.4356725257739686
Epoch : epoch, score :0.42543859649122806
Epoch : epoch, score :0.45175445288942573
Epoch : epoch, score :0.5087719298245614
Epoch : epoch, score :0.47222221106813667
Epoch : epoch, score :0.49269010309587447


In [0]:
device

device(type='cuda')

**Build Model**

In [0]:
 for local_image, local_question,local_answer in training_generator:
    print (len(local_answer))

256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256
256


KeyboardInterrupt: ignored

In [0]:
# treat nn.Embedding as a lookup table where the key is the word index and # the value is the corresponding word vector.
#Specify the #size of the lookup table, and initialize the word vectors yourself.

pretrained_glove_weights = np.zeros((vocab_size + 1, glove_embedding_dimension), dtype=np.float32)

# last row of the ndarray is kept 0
pretrained_glove_weights[:vocab_size] = np.load(globe_pretrained_file)

# copy weights to nn_embedding, last row of the weights nn_emedding.weight is 0 vector
nn_emedding.weight.data.copy_(torch.from_numpy(pretrained_glove_weights))

tensor([[-0.1570,  0.2190, -0.1663,  ..., -0.1872,  0.2802,  0.2936],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        [-0.3004, -0.0474, -0.1218,  ...,  0.5475, -0.0809,  0.3682],
        ...,
        [-0.1160, -0.1870, -0.4214,  ...,  0.5555,  0.2341, -0.0101],
        [-0.4783, -0.4500, -0.0590,  ...,  0.4060,  0.5669,  0.3657],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

Parameter containing:
tensor([[-0.1570,  0.2190, -0.1663,  ..., -0.1872,  0.2802,  0.2936],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        [-0.3004, -0.0474, -0.1218,  ...,  0.5475, -0.0809,  0.3682],
        ...,
        [-0.1160, -0.1870, -0.4214,  ...,  0.5555,  0.2341, -0.0101],
        [-0.4783, -0.4500, -0.0590,  ...,  0.4060,  0.5669,  0.3657],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       requires_grad=True)

In [0]:
question_answers[0]['question_toked']

['does', 'the', 'guy', 'have', 'a', 'tattoo']

In [0]:
idx2word 

['does',
 'the',
 'guy',
 'have',
 'a',
 'tattoo',
 'what',
 'is',
 'this',
 'man',
 'riding',
 'on',
 'how',
 'many',
 'tattoos',
 'can',
 'be',
 'seen',
 "'s",
 'body',
 'color',
 'his',
 'hat',
 'visor',
 'providing',
 'face',
 'enough',
 'protection',
 'plane',
 'landing',
 'shape',
 'are',
 'windows',
 'side',
 'of',
 'taking',
 'off',
 'water',
 'photo',
 'outdoors',
 'birds',
 'in',
 'sink',
 'planes',
 'tail',
 '4',
 'colors',
 'last',
 'letter',
 'over',
 'these',
 'wings',
 'strong',
 'sign',
 'would',
 'you',
 'able',
 'to',
 'find',
 'mosque',
 'city',
 'language',
 'written',
 'printed',
 'orange',
 'name',
 'hotel',
 'vehicles',
 'shown',
 'mode',
 'transportation',
 'pictured',
 'small',
 'town',
 'front',
 'bus',
 'say',
 'at',
 'top',
 'featured',
 'picture',
 'typical',
 'school',
 'latest',
 'make',
 'and',
 'model',
 'there',
 'any',
 'building',
 'area',
 'church',
 'sunny',
 'or',
 'overcast',
 'snow',
 'roof',
 'shingles',
 'symbol',
 'sits',
 'atop',
 'tower',
 

# New Section

# New Section