In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import torch
import pickle
import time
from torch.utils.data import DataLoader
import optuna
from transformers import BertTokenizer, VisualBertForPreTraining
import numpy as np

In [2]:
# Root folder of the dataset (local Windows path).
# NOTE(review): inside a raw string the trailing `\\` is two literal
# backslashes, so every path built below contains a doubled separator.
data_dir = r'E:\datasets\MADE\3_graduation\parthplc\archive\data\\'

# Annotation files, one JSON object per line.
train_path = f'{data_dir}train.jsonl'
dev_path = f'{data_dir}dev.jsonl'

# NOTE: `test_data` is read from the *dev* split file, not from a test split.
train_data = pd.read_json(train_path, lines=True)
test_data = pd.read_json(dev_path, lines=True)

# Preview the first rows of the dev split.
test_data.head(3)

Unnamed: 0,id,img,label,text
0,8291,img/08291.png,1,white people is this a shooting range
1,46971,img/46971.png,1,bravery at its finest
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...


In [3]:
# Load the precomputed visual embeddings for the validation images.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('d:\\visual_embeddings_val.pkl', 'rb') as f:
    visual_embeddings_val = pickle.load(f)

# Build {image path -> annotation} for every dev row that has an embedding.
# Columns are read by name instead of by position in `test_data.values`, so
# the mapping stays correct even if the column order of the frame changes.
val_dict = {
    img: {'label': label, 'text': text, 'id': img}
    for img, label, text in zip(test_data['img'], test_data['label'], test_data['text'])
    if img in visual_embeddings_val
}


In [4]:
# Sanity check: shape of element 0 stored under one image key of the loaded
# embeddings (this element is what FeaturesDataset returns as the features).
visual_embeddings_val['img/08291.png'][0].shape

torch.Size([100, 1024])

In [5]:
class FeaturesDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing precomputed visual features with text and label.

    Args:
        visual_embeddings: Mapping from sample key to an indexable object whose
            element ``0`` is returned as the visual features of that sample.
        labels: Mapping from sample key to a dict with at least the keys
            ``'label'`` and ``'text'``. Its iteration order defines the
            integer index order of the dataset.
        max_text_len: Maximum number of *characters* (not tokens) of the text
            returned by ``__getitem__``. Defaults to 77, the value that was
            previously hard-coded; ``None`` disables the cut.
    """

    def __init__(self, visual_embeddings, labels, max_text_len: int = 77):
        self.visual_embeddings = visual_embeddings
        self.labels = labels
        self.max_text_len = max_text_len

        # Positional index -> record; fixes the sample order once, up front.
        self.idx2id = [
            {'id': key, 'label': labels[key]['label'], 'text': labels[key]['text']}
            for key in labels
        ]

    def __getitem__(self, index: int):
        """Return ``(sample key, visual features, cut text, label)`` for ``index``.

        Raises:
            IndexError: If ``index`` is outside the dataset.
        """
        # `sample_id` rather than `id`, to avoid shadowing the builtin.
        sample_id = self.idx2id[index]['id']
        entry = self.labels[sample_id]
        text = entry['text'][:self.max_text_len]
        return sample_id, self.visual_embeddings[sample_id][0], text, entry['label']

    def __len__(self):
        """Number of samples, i.e. number of entries in ``labels``."""
        return len(self.idx2id)

In [6]:
# BERT tokenizer loaded from the 'bert-base-uncased' checkpoint; used in the
# evaluation cell to encode the text of each sample.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [7]:
# Validation dataset: precomputed visual embeddings joined with the dev-split
# annotations collected in `val_dict`.
features_val_dataset = FeaturesDataset(visual_embeddings_val, val_dict)

In [8]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [9]:
# Cross-entropy criterion. It is only instantiated here; none of the cells
# shown in this part of the notebook call it.
loss = torch.nn.CrossEntropyLoss()

In [10]:
# Load the pretrained VisualBERT checkpoint.
model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre')

# Freeze every parameter of the loaded checkpoint.
for param in model.parameters():
    param.requires_grad_(False)

# Replace the prediction decoder with a fresh 768 -> 2 linear layer. It is
# created after the freeze loop, so its parameters are not touched by it.
model.cls.predictions.decoder = torch.nn.Linear(in_features=768, out_features=2, bias=True)
model = model.to(device)

In [16]:
# Smoke test: push a single validation sample through the model and keep the
# resulting `outputs` / `prediction` for inspection in the following cells.
model.eval()

# Inference only: disable autograd so no computation graph is built and the
# returned logits carry no grad_fn.
with torch.no_grad():
    # `sample_id` (not `id`) so the builtin `id` is not shadowed at notebook scope.
    for i_step, (sample_id, visual_embeds, text, y) in enumerate(
            DataLoader(features_val_dataset, batch_size=1)):
        visual_embeds = visual_embeds.to(device)

        # The dataset cuts text by characters, which does not bound the number
        # of tokens; truncation=True keeps every sequence at exactly
        # max_length tokens so the padded batch is always rectangular.
        tokens = tokenizer(list(text), padding='max_length', truncation=True,
                           max_length=77, return_tensors='pt')

        input_ids = tokens["input_ids"].to(device)
        attention_mask = tokens["attention_mask"].to(device)
        token_type_ids = tokens["token_type_ids"].to(device)

        # One mask / type id per visual position: all leading dims of
        # `visual_embeds` except the trailing feature dimension.
        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.long, device=device)
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long, device=device)

        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        visual_embeds=visual_embeds,
                        visual_attention_mask=visual_attention_mask,
                        visual_token_type_ids=visual_token_type_ids)

        prediction = outputs.prediction_logits
        # Only the first batch is needed for this check.
        break

In [17]:
# Shape of the logits captured from the single batch run in the previous cell.
prediction.shape

torch.Size([1, 177, 2])

In [18]:
outputs

VisualBertForPreTrainingOutput(loss=None, prediction_logits=tensor([[[ 0.3972, -0.8051],
         [-0.3532, -1.6152],
         [-0.6077, -0.4447],
         [-0.0478, -1.1056],
         [ 0.8540, -0.3323],
         [-0.8882, -2.1188],
         [-0.3672,  0.0482],
         [ 0.4527, -0.0895],
         [ 0.9576,  1.2770],
         [ 1.0655, -0.2704],
         [ 1.0107, -0.3919],
         [ 0.8331, -0.3004],
         [ 0.8630, -0.4667],
         [ 0.7233, -0.4353],
         [ 0.6346, -0.5200],
         [ 0.8346, -0.5998],
         [ 0.8405, -0.4731],
         [ 0.9949, -0.4285],
         [ 1.1617, -0.5671],
         [ 0.8890, -0.6670],
         [ 0.9999, -0.4627],
         [ 0.6701, -0.2886],
         [ 0.7732, -0.0410],
         [ 0.6819, -0.0751],
         [ 0.4768, -0.2325],
         [ 0.0448, -0.5686],
         [-0.1948, -0.5440],
         [-0.1687, -0.4639],
         [ 0.1521, -0.3796],
         [ 0.6316, -0.1558],
         [ 0.8574, -0.1693],
         [ 1.0308, -0.1524],
         [ 0

In [19]:
# Shape after collapsing dimension 1 of the logits by summation.
outputs.prediction_logits.sum(dim=1).shape

torch.Size([1, 2])

In [20]:
# Logits summed over dimension 1.
# NOTE(review): the text input is padded to max_length, so this sum also
# includes padded positions — confirm this pooling is intended.
outputs.prediction_logits.sum(dim=1)

tensor([[  85.9483, -118.9581]], device='cuda:0', grad_fn=<SumBackward1>)