## Tips for putting it all together

Note that these methods can be combined into different parts to make it more efficient: 
1. Get the model and store it in a variable.
2. Transform and create batched inputs separately.
3. Generate visual embeddings from the detectron on the batched inputs and models.

Ideally, you want to build a class around this for ease of use - The class should contain all the methods, the model and the configuration details. And it should process a batch of images and convert to embeddings.

In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import torch
import pickle

In [2]:
data_dir = r'E:\datasets\MADE\3_graduation\parthplc\archive\data\\'

train_path = data_dir + 'train.jsonl'
dev_path = data_dir + 'dev.jsonl'


train_data = pd.read_json(train_path, lines=True)
test_data = pd.read_json(dev_path, lines=True)

test_data.head(3)

Unnamed: 0,id,img,label,text
0,8291,img/08291.png,1,white people is this a shooting range
1,46971,img/46971.png,1,bravery at its finest
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...


In [3]:
with open('d:\\visual_embeddings_val.pkl', 'rb') as f:
    visual_embeddings_val = pickle.load(f)
    
val_dict = {}
for x in test_data.values:
    if x[1] in visual_embeddings_val:
        val_dict[x[1]] = {'label':x[2], 'text':x[3]}
    
    
with open('d:\\val\\labels_val.pkl', 'wb') as f:
    pickle.dump(val_dict, f)

In [4]:

visual_embeddings_val

{'img/08291.png': [tensor([[-2.2958, -1.1677,  0.1613,  ..., -1.2276, -0.1814,  0.5142],
          [-1.9653, -1.3702,  0.2159,  ..., -1.1430,  0.0888,  0.5217],
          [-1.5618, -1.3221,  0.4325,  ..., -1.0300,  0.7415,  0.3240],
          ...,
          [ 0.2781, -1.0246, -0.0733,  ..., -1.1198,  0.8832,  0.9041],
          [ 0.3066, -1.0001, -0.0704,  ..., -1.1103,  0.8830,  0.9073],
          [ 0.1611, -0.8659,  0.0338,  ..., -1.0664,  1.0999,  0.9515]])],
 'img/46971.png': [tensor([[-1.4779, -1.2810,  0.4201,  ..., -1.0203,  0.6438,  0.2864],
          [-1.5093, -1.3309,  0.4278,  ..., -1.0137,  0.6604,  0.3093],
          [ 0.1588, -1.5820, -0.5154,  ..., -1.7383,  0.5477,  0.6958],
          ...,
          [ 0.2460, -0.7755,  0.9528,  ..., -0.2698,  0.0770,  0.9201],
          [-0.5986, -1.3669,  0.5165,  ..., -1.2930,  0.7644,  1.1766],
          [-0.4720, -0.8018, -0.3053,  ..., -1.7985, -0.7004,  0.0478]])],
 'img/03745.png': [tensor([[-1.9305, -1.3811,  0.4701,  ..., -1.29

In [5]:
from transformers import BertTokenizer, VisualBertForPreTraining

In [6]:
visual_embeddings_val['img/08291.png']

[tensor([[-2.2958, -1.1677,  0.1613,  ..., -1.2276, -0.1814,  0.5142],
         [-1.9653, -1.3702,  0.2159,  ..., -1.1430,  0.0888,  0.5217],
         [-1.5618, -1.3221,  0.4325,  ..., -1.0300,  0.7415,  0.3240],
         ...,
         [ 0.2781, -1.0246, -0.0733,  ..., -1.1198,  0.8832,  0.9041],
         [ 0.3066, -1.0001, -0.0704,  ..., -1.1103,  0.8830,  0.9073],
         [ 0.1611, -0.8659,  0.0338,  ..., -1.0664,  1.0999,  0.9515]])]

In [7]:
val_dict['img/08291.png']

{'label': 1, 'text': 'white people is this a shooting range'}

## Using the embeddings with VisualBert

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre') # this checkpoint has 1024 dimensional visual embeddings projection

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [11]:
model = model.to(device)


In [12]:
for param in model.parameters():
    param.requires_grad = False

In [13]:
model

VisualBertForPreTraining(
  (visual_bert): VisualBertModel(
    (embeddings): VisualBertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (visual_token_type_embeddings): Embedding(2, 768)
      (visual_position_embeddings): Embedding(512, 768)
      (visual_projection): Linear(in_features=1024, out_features=768, bias=True)
    )
    (encoder): VisualBertEncoder(
      (layer): ModuleList(
        (0): VisualBertLayer(
          (attention): VisualBertAttention(
            (self): VisualBertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (

In [14]:
outputs_val = {}

for i, k in enumerate(tqdm(visual_embeddings_val)):
    visual_embeds = visual_embeddings_val[k][0].to(device)
    text = val_dict[k]['text']
    
    tokens = tokenizer([text], padding='max_length', max_length=77)
    
    input_ids = torch.tensor(tokens["input_ids"], device=device)
    attention_mask = torch.tensor(tokens["attention_mask"], device=device)
    token_type_ids = torch.tensor(tokens["token_type_ids"], device=device)
    
    visual_embeds = torch.stack([visual_embeds])
    visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.long, device=device)
    visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long, device=device)
    
    outputs = model(input_ids=input_ids, 
                    attention_mask=attention_mask, 
                    token_type_ids=token_type_ids, 
                    visual_embeds=visual_embeds, 
                    visual_attention_mask=visual_attention_mask, 
                    visual_token_type_ids=visual_token_type_ids)
    
    o = torch.tensor(outputs.prediction_logits.cpu().flatten(), dtype=torch.float16)
    outputs_val[k] = o
    
    if (i + 1) % 50 == 0:        
        with open(f'd:\\val\\outputs_val_{i}.pkl', 'wb') as f:
            pickle.dump(outputs_val, f)
        outputs_val = {}

if len(outputs_val) > 0:
    with open(f'd:\\val\\outputs_val_{i}.pkl', 'wb') as f:
        pickle.dump(outputs_val, f)

  0%|          | 0/498 [00:00<?, ?it/s]

  o = torch.tensor(outputs.prediction_logits.cpu().flatten(), dtype=torch.float16)


In [15]:
#assert False

In [16]:
with open('d:\\visual_embeddings_train.pkl', 'rb') as f:
    visual_embeddings_train = pickle.load(f)
    
train_dict = {}
for x in train_data.values:
    if x[1] in visual_embeddings_train:
        train_dict[x[1]] = {'label':x[2], 'text':x[3]}
    
with open('d:\\train\\labels_train.pkl', 'wb') as f:
    pickle.dump(train_dict, f)

In [18]:
outputs_train = {}

for i, k in enumerate(tqdm(visual_embeddings_train)):
    visual_embeds = visual_embeddings_train[k][0].to(device)
    text = train_dict[k]['text']
    
    tokens = tokenizer([text], padding='max_length', max_length=77)
    
    input_ids = torch.tensor(tokens["input_ids"], device=device)
    attention_mask = torch.tensor(tokens["attention_mask"], device=device)
    token_type_ids = torch.tensor(tokens["token_type_ids"], device=device)
    
    visual_embeds = torch.stack([visual_embeds])
    visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.long, device=device)
    visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long, device=device)
    
    outputs = model(input_ids=input_ids, 
                    attention_mask=attention_mask, 
                    token_type_ids=token_type_ids, 
                    visual_embeds=visual_embeds, 
                    visual_attention_mask=visual_attention_mask, 
                    visual_token_type_ids=visual_token_type_ids)
    
    o = torch.tensor(outputs.prediction_logits.cpu().flatten(), dtype=torch.float16)
    outputs_train[k] = o
    
    if (i + 1) % 50 == 0:        
        with open(f'd:\\train\\outputs_train_{i}.pkl', 'wb') as f:
            pickle.dump(outputs_train, f)
        outputs_train = {}
        
if len(outputs_train) > 0:
    with open(f'd:\\train\\outputs_train_{i}.pkl', 'wb') as f:
        pickle.dump(outputs_train, f)

  0%|          | 0/8464 [00:00<?, ?it/s]

  o = torch.tensor(outputs.prediction_logits.cpu().flatten(), dtype=torch.float16)
