In [None]:
import pickle
import json
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gensim
import gensim.downloader as api
import torchtext
import torch
from torch import nn
import math

In [None]:
# Directory prefix for every dataset file read below (pickle, JSON and CSV splits).
# It is concatenated directly with file names, so the trailing slash is required.
path = '../input/minivqaiust/'

In [None]:
# Load the pickled image features; `img` is later indexed by image-id strings.
# NOTE(security): pickle.load can execute arbitrary code - only use a trusted file.
with open(path + "image_features.pickle", 'rb') as f:
    img = pickle.load(f)
# Load the image -> questions mapping. Downstream code iterates it as
# {image_id: [entry, ...]} where entry[0] is the question id and entry[1] the text.
with open(path + "image_question.json") as json_file:
      img_q = json.load(json_file)

In [None]:
# Read each split's CSV and pull out the columns used later.
# `df` is deliberately re-bound per split; after this cell it holds the test frame.
df = pd.read_csv(path + "train.csv")
q_train_idx, label_train = list(df['question_id']), list(df['label'])

df = pd.read_csv(path + "val.csv")
q_val_idx, label_val = list(df['question_id']), list(df['label'])

# The test split only contributes question ids (no 'label' column is read).
df = pd.read_csv(path + "test.csv")
q_test_idx = list(df['question_id'])

In [None]:
# Accumulators filled by the next cell from the training question ids.
questions_train, image_features_train = [], []

# Flatten the {image_id: [entry, ...]} mapping into a lookup keyed by question id
# (entry[0]); each value keeps the question text (entry[1]) and the owning image id
# as a string.
all_qs = {
    ques[0]: {'question': ques[1], 'image_id': str(idx)}
    for idx, imq in img_q.items()
    for ques in imq
}
# Show one entry as a sanity check.
all_qs[131087000]

In [None]:
# Gather the question text and image feature for every training question id.
# The lists are rebuilt from scratch here (instead of appending to lists created in
# another cell) so that re-running this cell does not duplicate rows and stays
# aligned with `label_train`.
questions_train = [all_qs[idx]['question'] for idx in q_train_idx]
image_features_train = [img[all_qs[idx]['image_id']] for idx in q_train_idx]

In [None]:
# Gather the question text and image feature for every validation question id,
# in the same order as `q_val_idx`.
questions_val = [all_qs[idx]['question'] for idx in q_val_idx]
image_features_val = [img[all_qs[idx]['image_id']] for idx in q_val_idx]

In [None]:
# Gather the question text and image feature for every test question id,
# in the same order as `q_test_idx`.
questions_test = [all_qs[idx]['question'] for idx in q_test_idx]
image_features_test = [img[all_qs[idx]['image_id']] for idx in q_test_idx]

In [None]:
class TextEmbedding(nn.Module):
    """Thin wrapper around a single ``nn.Embedding`` lookup table.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: width of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

    def forward(self, q):
        """Return the embedding vectors for the integer indices in ``q``."""
        return self.embedding(q)

In [None]:
# Fetch the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
# Its vocabulary and vector matrix are read in the cells below.
pre_model = api.load('word2vec-google-news-300')

In [None]:
# torchtext's 'basic_english' tokenizer; encode() uses it to split question strings into tokens.
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [None]:
# Word list of the pretrained model. gensim >= 4.0 removed `KeyedVectors.vocab`
# (accessing it raises AttributeError) and exposes the words, in vector-row order,
# as `index_to_key`; fall back to the legacy attribute on gensim 3.x.
if hasattr(pre_model, 'index_to_key'):
    vocab = list(pre_model.index_to_key)
else:
    # NOTE(review): assumes the legacy dict iterates in vector-row order - confirm.
    vocab = list(pre_model.vocab.keys())
# Embedding width, measured on an arbitrary in-vocabulary word.
embed_size = len(pre_model.get_vector('me'))
# Pretrained vector matrix as a tensor (shares memory with the numpy array).
weights = torch.from_numpy(pre_model.vectors)

In [None]:
# Map each vocabulary word to its position in `vocab`; encode() uses this for lookups.
word_dict = {word: idx for idx, word in enumerate(vocab)}

In [None]:
# Embedding table with one extra row (index len(vocab)) for out-of-vocabulary tokens,
# matching the fallback index produced by encode().
process_text = TextEmbedding(
    vocab_size=len(vocab) + 1,
    embed_dim=embed_size,
)
# Copy the pretrained word2vec vectors into the table. Without this the embedding
# keeps its random initialisation and the downloaded vectors are never used.
# The extra out-of-vocabulary row keeps its random initialisation.
with torch.no_grad():
    process_text.embedding.weight[:len(vocab)].copy_(weights)

In [None]:
def encode(seq):
  """Convert a text string into a list of vocabulary indices.

  The string is split with the module-level `tokenizer`; each token is looked up in
  `word_dict`. Tokens missing from the vocabulary map to `len(vocab)`, the extra
  out-of-vocabulary index.

  Args:
    seq: the text to encode.

  Returns:
    A list of integer indices, one per token.
  """
  oov_index = len(vocab)
  # dict.get replaces the former bare `except:`, which also hid unrelated errors
  # (e.g. KeyboardInterrupt or a NameError) behind the out-of-vocabulary index.
  return [word_dict.get(tok, oov_index) for tok in tokenizer(seq)]

In [None]:
def padify(b):
  """Encode a batch of strings and right-pad them to a common length.

  Args:
    b: iterable of text strings; each one is passed through `encode`.

  Returns:
    An integer (torch.long) tensor with one row per input string, padded on the
    right with 0 up to the longest encoded sequence. An empty batch yields a
    tensor of shape (0, 0).
  """
  # The explicit dtype matters: torch.tensor([]) would otherwise be float32, so a
  # string with no tokens would yield a float row that cannot be used as indices.
  v = [torch.tensor(encode(x), dtype=torch.long) for x in b]
  if not v:
    return torch.empty((0, 0), dtype=torch.long)
  l = max(t.numel() for t in v)
  return torch.stack([
      torch.nn.functional.pad(t, (0, l - t.numel()), mode='constant', value=0)
      for t in v
  ])

In [None]:
# Encode the training and validation questions into padded index tensors
# (one row per question; each split is padded to its own longest question).
train_question_pad = padify(questions_train)
val_question_pad = padify(questions_val)

In [None]:
# Encode the test questions into a padded index tensor (one row per question).
test_question_pad = padify(questions_test)

In [None]:
# Look up embeddings for the padded train/validation questions. Autograd is
# disabled, so the results are plain tensors with no graph attached.
with torch.no_grad():
  question_embedd_train = process_text(train_question_pad)
  question_embedd_val = process_text(val_question_pad)

In [None]:
# Look up embeddings for the padded test questions with autograd disabled.
with torch.no_grad():
  question_embedd_test = process_text(test_question_pad)

In [None]:
class LxmertAttention(nn.Module):
    """Multi-head scaled dot-product attention of ``hidden_states`` over ``context``.

    Queries are projected from ``hidden_states``; keys and values are projected
    from ``context``. Both inputs share the same feature width ``hidden_size``.

    Args:
        hidden_size: feature width of both inputs; must be divisible by
            ``num_attention_heads``.
        num_attention_heads: number of attention heads.

    Raises:
        ValueError: if ``hidden_size`` is not a multiple of ``num_attention_heads``.
    """

    def __init__(self, hidden_size, num_attention_heads):
        super().__init__()
        divides_evenly = hidden_size % num_attention_heads == 0
        if not divides_evenly:
            raise ValueError(
                f"The hidden size ({hidden_size}) is not a multiple of the number of attention "
                f"heads ({num_attention_heads})"
            )
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = hidden_size // num_attention_heads
        self.head_size = self.num_attention_heads * self.attention_head_size

        # Projections are created in query, key, value order.
        self.query = nn.Linear(hidden_size, self.head_size)
        self.key = nn.Linear(hidden_size, self.head_size)
        self.value = nn.Linear(hidden_size, self.head_size)

    def transpose_for_scores(self, x):
        """Split the last axis into heads and move the head axis to position 1.

        The permutation uses four axes, so ``x`` must be 3-D:
        (d0, d1, head_size) -> (d0, heads, d1, attention_head_size).
        """
        per_head = x.view(*x.shape[:-1], self.num_attention_heads, self.attention_head_size)
        return per_head.permute(0, 2, 1, 3)

    def forward(self, hidden_states, context):
        """Attend from ``hidden_states`` to ``context``.

        Returns:
            A tensor with the leading two dimensions of ``hidden_states`` and a
            last dimension of ``head_size``.
        """
        queries = self.transpose_for_scores(self.query(hidden_states))
        keys = self.transpose_for_scores(self.key(context))
        values = self.transpose_for_scores(self.value(context))

        # Raw scores: dot product of queries and keys, scaled by sqrt(head width).
        scores = torch.matmul(queries, keys.transpose(-1, -2))
        scores = scores / math.sqrt(self.attention_head_size)

        # Normalise over the key axis, then take the weighted sum of the values.
        probs = nn.functional.softmax(scores, dim=-1)
        attended = torch.matmul(probs, values)

        # Put the head axis back next to the per-head features and merge them.
        attended = attended.permute(0, 2, 1, 3).contiguous()
        return attended.view(*attended.shape[:-2], self.head_size)

In [None]:
class LxmertCrossAttentionLayer(nn.Module):
    """Cross-attention followed by a residual connection, LayerNorm and a linear layer.

    Args:
        hidden_size: feature width of both the input and the context tensors.
        num_attention_heads: number of heads used by the inner attention module.
    """

    def __init__(self,hidden_size, num_attention_heads):
        super().__init__()
        self.att = LxmertAttention(hidden_size, num_attention_heads)
        # Constructed once; the original built this layer twice and discarded the first.
        self.linear = nn.Linear(hidden_size, hidden_size)
        # NOTE(review): `relu` and `norm2` are never used in forward(). They are kept
        # so the module's attributes and state_dict keys stay unchanged.
        self.relu = nn.ReLU()
        self.norm1 = nn.LayerNorm(hidden_size, eps=1e-6)
        self.norm2 = nn.LayerNorm(hidden_size, eps=1e-6)

    def forward(self, input_tensor, ctx_tensor):
        """Attend from ``input_tensor`` to ``ctx_tensor`` and project the result.

        Returns:
            ``linear(norm1(input_tensor + attention(input_tensor, ctx_tensor)))``.
        """
        output = self.att(input_tensor, ctx_tensor)
        # Residual connection around the attention block, then layer normalisation.
        norm1 = self.norm1(input_tensor + output)
        lin = self.linear(norm1)
        return lin

In [None]:
# Bundle question embeddings, image features and labels into one dataset;
# TensorDataset requires all three to have the same first-dimension size.
train_dataset = torch.utils.data.TensorDataset(question_embedd_train, torch.tensor(image_features_train), torch.tensor(label_train))
# Mini-batches of 64, reshuffled every epoch.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

In [None]:
# Validation counterpart of the training dataset (embeddings, image features, labels).
val_dataset = torch.utils.data.TensorDataset(question_embedd_val, torch.tensor(image_features_val), torch.tensor(label_val))
# Mini-batches of 32.
# NOTE(review): shuffle=True is not needed for evaluation; the code is left as is.
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=True)

In [None]:
class VQA3(nn.Module):
    """Visual question answering classifier: LSTM text encoder plus cross-attention.

    The question embeddings go through a 2-layer LSTM and are mean-pooled over the
    sequence axis. The pooled text vector then attends to the image feature vector
    through a ``LxmertCrossAttentionLayer``; the result is batch-normalised and
    projected to class scores.

    Args:
        features_size: LSTM hidden width; also the width expected of the image
            features, since both feed the same cross-attention layer.
        num_attention_heads: number of heads in the cross-attention layer.
        num_classes: number of output classes (default 10, the former hard-coded value).
        embed_dim: width of the input word embeddings (default 300, the former
            hard-coded value).
    """

    def __init__(self, features_size, num_attention_heads, num_classes=10, embed_dim=300):
        # Plain super(): `super(type(self), self)` recurses forever once subclassed.
        super().__init__()
        # text feature
        self.lstm = nn.LSTM(embed_dim, features_size, num_layers=2, batch_first=True)
        # cross attention
        self.cross = LxmertCrossAttentionLayer(features_size, num_attention_heads)
        # final output
        self.linear1 = nn.Linear(features_size, num_classes)
        self.batchnorm = nn.BatchNorm1d(features_size)

    def forward(self, text, image):
        """Compute class scores for a batch.

        Args:
            text: embedded questions, (batch, seq_len, embed_dim).
            image: image features, (batch, features_size).

        Returns:
            Unnormalised class scores (logits) of shape (batch, num_classes).
            They are NOT passed through softmax: ``nn.CrossEntropyLoss`` applies
            log-softmax itself, and applying softmax first flattens the gradients.
            ``argmax`` over the logits gives the same prediction as over the
            probabilities.
        """
        text_f = self.lstm(text)[0]
        # Mean-pool the LSTM outputs over the sequence axis -> (batch, features_size).
        text_f = torch.mean(text_f, 1)
        # Add a length-1 sequence axis so both inputs are 3-D for the attention layer.
        text_f = torch.reshape(text_f, (text_f.shape[0], 1, text_f.shape[1]))
        image = torch.reshape(image, (image.shape[0], 1, image.shape[1]))
        crossatt = self.cross(text_f, image)
        # Drop the length-1 sequence axis again -> (batch, features_size).
        crossatt = torch.reshape(crossatt, (crossatt.shape[0], crossatt.shape[2]))
        crossatt = self.batchnorm(crossatt)
        logits = self.linear1(crossatt)
        return logits

In [None]:
# Instantiate the model with features_size=512 and num_attention_heads=4.
main_model = VQA3(512, 4)

In [None]:
# Training hyper-parameters.
learning_rate = 0.3
epochs = 30
# Classification loss; receives the model output and the integer labels.
loss_fn = nn.CrossEntropyLoss()
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(main_model.parameters(), lr=learning_rate)
# Multiplies the learning rate by 0.9 on every scheduler.step() call.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: yields ``(text, image, y)`` batches and exposes ``.dataset``.
        model: callable as ``model(text, image)``.
        loss_fn: callable as ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: optimiser holding the model's parameters.

    Prints the batch loss and a progress counter on every even-numbered batch.
    """
    size = len(dataloader.dataset)
    for step, (questions, images, targets) in enumerate(dataloader):
        # Forward pass and loss for this batch.
        batch_loss = loss_fn(model(questions, images), targets)

        # Clear old gradients, backpropagate, update the parameters.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if step % 2 != 0:
            continue
        loss = batch_loss.item()
        current = step * len(questions)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


In [None]:
def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for text, image, y in dataloader:
            pred = model(text, image)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            
    test_loss /= size
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
# Main training schedule: for each epoch, run one training pass, evaluate on the
# validation loader, then advance the learning-rate scheduler.
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, main_model, loss_fn, optimizer)
    test_loop(val_dataloader, main_model, loss_fn)
    scheduler.step()
print("Done!")

In [None]:
# Predict on the whole test split in a single forward pass.
# eval() makes BatchNorm use its running statistics instead of statistics from the
# test batch (and stops it updating them). no_grad() avoids building an autograd
# graph for the entire test set.
# The model is left in evaluation mode afterwards.
main_model.eval()
with torch.no_grad():
    y = main_model(question_embedd_test, torch.tensor(image_features_test))

In [None]:
# Predicted class per test row: the index of the largest score in that row.
results = [int(row.argmax(0).numpy()) for row in y]
# Column layout of the submission table: question ids next to predicted labels.
labeldict = {
    'question_id': q_test_idx,
    'label': [int(label) for label in results],
}

In [None]:
# Build the submission table with columns 'question_id' and 'label'.
dfl = pd.DataFrame(labeldict)  

In [None]:
# Display the submission table as the cell output.
dfl

In [None]:
# Write the predictions as CSV, without the DataFrame index column.
# NOTE(review): this writes into the same directory the inputs are read from; if
# that directory is read-only in the execution environment the call will fail -
# confirm, and consider writing to the working directory instead.
dfl.to_csv(path + 'testvqa3_again.csv', index=False)