In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import torch
import pickle
import time
from torch.utils.data import DataLoader
import optuna
from transformers import BertTokenizer, VisualBertForPreTraining
import numpy as np

In [2]:
# Folder that holds the JSON-lines split files (machine-specific absolute path).
data_dir = r'E:\datasets\MADE\3_graduation\parthplc\archive\data\\'

train_path = f'{data_dir}train.jsonl'
dev_path = f'{data_dir}dev.jsonl'


# One JSON record per line in each file. Note that the "dev" split is bound
# to the name `test_data`; later cells use it as the validation set.
train_data, test_data = (
    pd.read_json(split_path, lines=True) for split_path in (train_path, dev_path)
)

test_data.head(3)

Unnamed: 0,id,img,label,text
0,8291,img/08291.png,1,white people is this a shooting range
1,46971,img/46971.png,1,bravery at its finest
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...


In [3]:
# Pre-computed visual embeddings for the validation split.
# pickle executes arbitrary code on load, so only open files you produced yourself.
with open('d:\\visual_embeddings_val.pkl', 'rb') as f:
    visual_embeddings_val = pickle.load(f)

# Keep only the rows whose key (column 1 of the frame) has an embedding.
# Columns are addressed by position: 1 -> key, 2 -> label, 3 -> text.
val_dict = {
    row[1]: {'label': row[2], 'text': row[3], 'id': row[1]}
    for row in test_data.values
    if row[1] in visual_embeddings_val
}


In [4]:
# Pre-computed visual embeddings for the training split.
# pickle executes arbitrary code on load, so only open files you produced yourself.
with open('d:\\visual_embeddings_train.pkl', 'rb') as f:
    visual_embeddings_train = pickle.load(f)

# Keep only the rows whose key (column 1 of the frame) has an embedding.
# Columns are addressed by position: 1 -> key, 2 -> label, 3 -> text.
train_dict = {
    row[1]: {'label': row[2], 'text': row[3], 'id': row[1]}
    for row in train_data.values
    if row[1] in visual_embeddings_train
}


In [5]:
class FeaturesDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs stored visual embeddings with text and label.

    Args:
        visual_embeddings: mapping from sample key to an indexable container;
            element ``[0]`` of that container is what ``__getitem__`` returns.
        labels: mapping from sample key to a dict with ``'label'`` and
            ``'text'`` entries. Its iteration order defines the sample order.
    """

    def __init__(self, visual_embeddings, labels):
        self.visual_embeddings = visual_embeddings
        self.labels = labels

        # Positional index -> record, following the iteration order of `labels`.
        self.idx2id = []
        for key in labels:
            entry = labels[key]
            self.idx2id.append({'id': key, 'label': entry['label'], 'text': entry['text']})

    def __len__(self):
        """Number of samples, i.e. number of keys in `labels`."""
        return len(self.idx2id)

    def __getitem__(self, index: int):
        """Return ``(key, embedding, text, label)`` for the sample at `index`.

        The text is cut to its first 77 characters (characters, not tokens).
        """
        key = self.idx2id[index]['id']
        embedding = self.visual_embeddings[key][0]
        record = self.labels[key]
        return key, embedding, record['text'][:77], record['label']

In [6]:
def get_lr(optimizer):
    """Return the ``'lr'`` of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)


def train_model(model, train_loader, val_loader, loss, optimizer, num_epochs, scheduler, device,
                min_save_accuracy=0.64):
    """Train `model`, validate after every epoch, checkpoint improvements, stop early on a plateau.

    Relies on the module-level `tokenizer` and on `compute_accuracy`, both of
    which must be defined before this function is called.

    Args:
        model: network called with VisualBERT-style keyword inputs; its output
            must expose `prediction_logits`.
        train_loader: iterable of ``(ids, visual_embeds, text, y)`` batches.
        val_loader: loader handed to `compute_accuracy` after each epoch.
        loss: criterion called as ``loss(prediction, y)``.
        optimizer: optimizer stepped once per batch.
        num_epochs: maximum number of epochs.
        scheduler: learning-rate scheduler stepped once per epoch, or None.
        device: device the batch tensors are moved to.
        min_save_accuracy: validation accuracy that must be exceeded before the
            first checkpoint is written (default keeps the previous 0.64).

    Returns:
        ``(loss_history, train_history, val_history, best_model_name)``; the
        last item is None when no epoch beat the checkpoint threshold.

    Raises:
        ValueError: if `train_loader` yields no batches.
    """
    best_model_name = None
    loss_history = []
    train_history = []
    val_history = []
    top_val_accuracy = min_save_accuracy
    for epoch in range(num_epochs):
        t1 = time.time()
        model.train()
        loss_accum = 0.0
        correct_samples = 0
        total_samples = 0
        num_steps = 0
        for sample_ids, visual_embeds, text, y in train_loader:
            y = y.to(device)
            visual_embeds = visual_embeds.to(device)
            # truncation=True keeps every row at exactly 77 token ids; without it a
            # long caption produces a ragged batch that torch.tensor cannot build.
            tokens = tokenizer(list(text), padding='max_length', truncation=True, max_length=77)

            input_ids = torch.tensor(tokens["input_ids"], device=device)
            attention_mask = torch.tensor(tokens["attention_mask"], device=device)
            token_type_ids = torch.tensor(tokens["token_type_ids"], device=device)

            # Every visual position is attended to and tagged with segment id 1.
            visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.long, device=device)
            visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long, device=device)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            visual_embeds=visual_embeds,
                            visual_attention_mask=visual_attention_mask,
                            visual_token_type_ids=visual_token_type_ids)

            # Collapse the sequence axis: per-position logits are summed into one
            # score vector per sample (padding positions are included in the sum).
            prediction = outputs.prediction_logits.sum(dim=1)

            loss_value = loss(prediction, y)
            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()

            indices = prediction.argmax(dim=1)
            correct_samples += int((indices == y).sum())
            total_samples += y.shape[0]

            # .item() stores a plain float so the running sum does not keep
            # autograd-tracked tensors alive across iterations.
            loss_accum += loss_value.item()
            num_steps += 1

        if num_steps == 0:
            raise ValueError("train_loader produced no batches")

        ave_loss = loss_accum / num_steps
        train_accuracy = float(correct_samples) / total_samples
        val_accuracy = compute_accuracy(model, val_loader, device)

        loss_history.append(float(ave_loss))
        train_history.append(train_accuracy)
        val_history.append(val_accuracy)
        if scheduler is not None:
            scheduler.step()

        # The scheduler has already stepped, so the logged lr is the next epoch's.
        print("Epoch: %i; %.2f sec; lr: %f; Average loss: %.2f, Train accuracy: %.4f, Val accuracy: %.4f" %
              (epoch, round(time.time() - t1, 2), get_lr(optimizer), ave_loss, train_accuracy, val_accuracy))

        if val_accuracy > top_val_accuracy:
            top_val_accuracy = val_accuracy
            model_name = f'classifier_{epoch}_{round(val_accuracy, 3)}.ckpt'
            best_model_name = model_name
            # Passing the path lets torch.save open and close the file itself.
            torch.save(model, model_name)
            print("saved", model_name)

        if len(val_history) > 4:
            # One flag per step over the last four epoch-to-epoch changes, newest
            # first: True means validation accuracy improved by less than 0.001.
            stalled = [(val_history[-k] - val_history[-k - 1]) < 0.001 for k in range(1, 5)]
            print(f'{stalled[0]} {stalled[1]} ' + ' ' * 12 + f'{stalled[2]} {stalled[3]}')

            if all(stalled):
                print('pruned')
                break

    return loss_history, train_history, val_history, best_model_name
        
    
def compute_accuracy(model, loader, device):
    """
    Computes accuracy on the dataset wrapped in a loader.

    Puts `model` into eval mode (it is left in eval mode on return) and runs
    the forward passes with autograd disabled. Relies on the module-level
    `tokenizer`.

    Args:
        model: network called with VisualBERT-style keyword inputs; its output
            must expose `prediction_logits`.
        loader: iterable of ``(ids, visual_embeds, text, y)`` batches.
        device: device the batch tensors are moved to.

    Returns: accuracy as a float value between 0 and 1

    Raises:
        ValueError: if `loader` yields no samples.
    """
    model.eval()
    correct_samples = 0
    total_samples = 0
    # Evaluation needs no gradients; no_grad avoids building the autograd graph.
    with torch.no_grad():
        for sample_ids, visual_embeds, text, y in loader:
            y = y.to(device)
            visual_embeds = visual_embeds.to(device)
            # truncation=True keeps every row at exactly 77 token ids; without it a
            # long caption produces a ragged batch that torch.tensor cannot build.
            tokens = tokenizer(list(text), padding='max_length', truncation=True, max_length=77)

            input_ids = torch.tensor(tokens["input_ids"], device=device)
            attention_mask = torch.tensor(tokens["attention_mask"], device=device)
            token_type_ids = torch.tensor(tokens["token_type_ids"], device=device)

            # Every visual position is attended to and tagged with segment id 1.
            visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.long, device=device)
            visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long, device=device)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            visual_embeds=visual_embeds,
                            visual_attention_mask=visual_attention_mask,
                            visual_token_type_ids=visual_token_type_ids)

            # Same reduction as in training: sum the logits over the sequence axis.
            prediction = outputs.prediction_logits.sum(dim=1)

            indices = prediction.argmax(dim=1)
            correct_samples += int((indices == y).sum())
            total_samples += y.shape[0]

    if total_samples == 0:
        raise ValueError("loader produced no samples")

    val_accuracy = float(correct_samples) / total_samples

    return val_accuracy

In [7]:
# BERT tokenizer with the 'bert-base-uncased' vocabulary. train_model and
# compute_accuracy read `tokenizer` as a module-level global, so this cell
# has to run before either of them is called.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
# Pair each split's pickled visual embeddings with its label/text records.
features_train_dataset = FeaturesDataset(visual_embeddings_train, train_dict)
features_val_dataset = FeaturesDataset(visual_embeddings_val, val_dict)

In [9]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [10]:
# Cross-entropy criterion; train_model calls it as loss(prediction, y).
loss = torch.nn.CrossEntropyLoss()

In [None]:
#Value: 0.5431559571619813
#Parameters: {'layer_count': 207, 'step_size': 4, 'batch_size': 544, 'learning_rate': 0.009455928480805944, 'gamma': 0.5861260817780743}

In [11]:
# Hyper-parameters for this run (same values as the search result noted above).
layer_count = 207
step_size = 4
batch_size = 544
learning_rate = 0.009455928480805944
gamma = 0.5861260817780743

model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre')

# Freeze the first layer_count + 2 parameter tensors of the pretrained model,
# i.e. indices 0 .. layer_count + 1 inclusive.
for param in list(model.parameters())[:layer_count + 2]:
    param.requires_grad = False

# Swap the prediction decoder for a fresh 768 -> 2 linear layer. Its parameters
# are newly created, so they are trainable regardless of the freeze above.
model.cls.predictions.decoder = torch.nn.Linear(in_features=768, out_features=2, bias=True)
model = model.to(device)


# Collect the parameters that are still trainable and list them with their
# position in named_parameters().
params = []
for i, (name, param) in enumerate(model.named_parameters()):
    if param.requires_grad:
        params.append(param)
        print(i, name)


208 cls.predictions.decoder.weight
209 cls.predictions.decoder.bias
211 cls.seq_relationship.bias
Epoch: 0; 34.74 sec; lr: 0.009456; Average loss: 257.74, Train accuracy: 0.5591, Val accuracy: 0.5040
Epoch: 1; 68.05 sec; lr: 0.009456; Average loss: 124.25, Train accuracy: 0.5402, Val accuracy: 0.5000
Epoch: 2; 101.40 sec; lr: 0.009456; Average loss: 84.75, Train accuracy: 0.5616, Val accuracy: 0.5382
Epoch: 3; 134.80 sec; lr: 0.005542; Average loss: 88.15, Train accuracy: 0.5737, Val accuracy: 0.5040
Epoch: 4; 168.34 sec; lr: 0.005542; Average loss: 72.43, Train accuracy: 0.5534, Val accuracy: 0.5141
False True             False True
Epoch: 5; 201.82 sec; lr: 0.005542; Average loss: 27.53, Train accuracy: 0.6016, Val accuracy: 0.5281
False False             True False
Epoch: 6; 235.40 sec; lr: 0.005542; Average loss: 16.95, Train accuracy: 0.6098, Val accuracy: 0.5100
True False             False True
Epoch: 7; 268.97 sec; lr: 0.003249; Average loss: 42.00, Train accuracy: 0.5514, Val 

In [17]:
# Push a single validation batch of 3 samples through the model to inspect the
# raw (un-summed) prediction logits.
model.eval()
# Inference only: no_grad avoids building an autograd graph for the forward pass.
with torch.no_grad():
    # The first tuple element is named sample_id rather than id so the builtin
    # id() is not overwritten at notebook scope.
    for sample_id, visual_embeds, text, y in DataLoader(features_val_dataset, batch_size=3):
        visual_embeds = visual_embeds.to(device)
        # truncation=True keeps every row at exactly 77 token ids.
        tokens = tokenizer(list(text), padding='max_length', truncation=True, max_length=77)

        input_ids = torch.tensor(tokens["input_ids"], device=device)
        attention_mask = torch.tensor(tokens["attention_mask"], device=device)
        token_type_ids = torch.tensor(tokens["token_type_ids"], device=device)

        # Every visual position is attended to and tagged with segment id 1.
        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.long, device=device)
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long, device=device)

        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        visual_embeds=visual_embeds,
                        visual_attention_mask=visual_attention_mask,
                        visual_token_type_ids=visual_token_type_ids)

        prediction = outputs.prediction_logits
        # Only the first batch is needed.
        break

In [19]:
# (batch, positions, classes). The 177 positions are consistent with 77 padded
# text tokens followed by 100 visual embeddings per sample.
prediction.shape

torch.Size([3, 177, 2])

In [21]:
# Shape of the visual embedding returned for the first validation sample.
features_val_dataset[0][1].shape

torch.Size([100, 1024])

In [None]:
# Draw the per-epoch loss and both accuracy curves on one set of axes.
# NOTE(review): `plt` is not imported in the import cell, and loss_history /
# train_history / val_history are not assigned in any visible cell; they
# presumably come from a train_model(...) call that is no longer in the notebook.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_xlabel("#iteration")
ax.set_ylabel("loss")
for series, series_label in (
    (loss_history, 'loss'),
    (train_history, 'train accuracy'),
    (val_history, 'val accuracy'),
):
    ax.plot(series, label=series_label)
fig.legend()
plt.show()

In [None]:
# Reload the checkpoint written by train_model.
# NOTE(review): best_model_name is None when no epoch beat the checkpoint
# threshold, in which case there is nothing to load.
print("best model:", best_model_name)

# The with-block closes the checkpoint file once the model has been read.
# The file is a pickle of the whole model object, so load only trusted files.
with open(best_model_name, 'rb') as checkpoint_file:
    best_model = torch.load(checkpoint_file)
print(best_model)

In [None]:
# Accuracy / ROC-AUC evaluation of the reloaded model.
# NOTE(review): this cell cannot run against the rest of the notebook as written:
#   * FeaturesDataset yields 4-tuples (id, visual_embeds, text, label), so the
#     two-name unpacking `(x, y)` below fails;
#   * best_model is called with one positional argument, whereas every other
#     cell calls the model with tokenized keyword inputs;
#   * labels_val is not defined in any visible cell;
#   * accuracy_score, roc_auc_score, roc_curve and plt are not imported in the
#     import cell.
best_model.eval()
# `prediction` is overwritten on every iteration, so only the last batch survives.
for i_step, (x, y) in enumerate(DataLoader(features_val_dataset, batch_size=5000)):
    prediction = best_model(x)

# Predicted class = arg-max over axis 1; column 1 of `prediction` is used as the
# positive-class score.
acc_score = accuracy_score(np.array([x.item() for x in labels_val]), torch.max(prediction, 1)[1])
auc_score = roc_auc_score(np.array([x.item() for x in labels_val]), prediction[:,1].detach().numpy())

fpr, tpr, thresh = roc_curve(labels_val, prediction[:,1].detach().numpy(), pos_label=1)

# Constant all-zero scores; presumably the chance-level reference curve.
random_probs = [0 for i in range(len(labels_val))]
p_fpr, p_tpr, _ = roc_curve(labels_val, random_probs, pos_label=1)
# auc_score is assigned a second time here, replacing the value computed above.
auc_score = roc_auc_score(labels_val, prediction[:,1].detach().numpy())

print('Accuracy: ', acc_score, '\n', 'ROC AUC: ', auc_score, sep='')

# Orange: model ROC curve; blue: reference curve from the constant scores.
plt.plot(fpr, tpr, linestyle='--',color='orange')
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
plt.title('ROC Curve', fontsize=20)
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive rate',fontsize=18)

plt.show();