In [2]:
import datasets
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
from huggingface_hub import login

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import StepLR

import math
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, accuracy_score

from tqdm import tqdm
import matplotlib.pyplot as plt
import os

from dataclasses import dataclass

In [3]:
# Authenticate with the Hugging Face Hub using a token read from the HF_TOKEN
# environment variable, so that no credential is stored in the notebook source.
# When HF_TOKEN is unset, `token=None` makes `login` prompt for the token instead.
# SECURITY: a token that was ever committed in this cell must be treated as
# compromised and revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression so the notebook displays which device was selected.
device

device(type='cuda')

In [5]:
# Load the "train" split of the Gapes21/vqa2 dataset through `datasets.load_dataset`.
dataset = datasets.load_dataset("Gapes21/vqa2", split = "train")

Downloading readme:   0%|          | 0.00/361 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 487M/487M [00:02<00:00, 220MB/s]  
Downloading data: 100%|██████████| 490M/490M [00:09<00:00, 50.1MB/s] 
Downloading data: 100%|██████████| 487M/487M [00:02<00:00, 220MB/s]  
Downloading data: 100%|██████████| 490M/490M [00:02<00:00, 238MB/s]  
Downloading data: 100%|██████████| 486M/486M [00:06<00:00, 78.1MB/s] 
Downloading data: 100%|██████████| 490M/490M [00:06<00:00, 79.7MB/s] 
Downloading data: 100%|██████████| 485M/485M [00:06<00:00, 75.8MB/s] 
Downloading data: 100%|██████████| 485M/485M [00:02<00:00, 233MB/s]  
Downloading data: 100%|██████████| 487M/487M [00:02<00:00, 232MB/s]  
Downloading data: 100%|██████████| 490M/490M [00:02<00:00, 221MB/s]  
Downloading data: 100%|██████████| 490M/490M [00:02<00:00, 200MB/s]  


Generating train split:   0%|          | 0/109485 [00:00<?, ? examples/s]

In [6]:
# Fit an integer encoding over every answer string in the dataset. The fitted
# encoder is reused by the collator (to encode labels) and by initVQA (to size
# the classifier output).
labelEncoder = LabelEncoder()
labelEncoder.fit(dataset['answer'])

In [7]:
# Lookup table from answer string to its encoded class id.
labelMap = {answer: class_id for class_id, answer in enumerate(labelEncoder.classes_)}

# Number of samples per class id.
class_counts = [0] * len(labelMap)
for answer in tqdm(dataset["answer"]):
    class_counts[labelMap[answer]] += 1

# Inverse-frequency weight per class: total / (number_of_classes * class_count).
total_samples = sum(class_counts)
class_weights = [
    total_samples / (len(class_counts) * count)
    for count in class_counts
]

100%|██████████| 109485/109485 [00:00<00:00, 1014112.30it/s]


In [8]:
# Checkpoint identifiers passed to `from_pretrained` for the text side (BERT)
# and the image side (VIT).
# NOTE(review): despite the constant names, the identifiers refer to a RoBERTa
# and a DINOv2 checkpoint respectively.
BERT = "smallbenchnlp/roberta-small"
VIT = 'facebook/dinov2-small'

In [9]:
# Image preprocessor and text tokenizer for the two checkpoints named above;
# both are used by the collator to build the model inputs.
processor = AutoImageProcessor.from_pretrained(VIT)
tokenizer = AutoTokenizer.from_pretrained(BERT)

preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/327 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [10]:
class SastaLoader:
    """Minimal batch iterator with a positional train/validation split.

    The dataset is shuffled once in the constructor. Samples ``[0, train_max)``
    of the shuffled dataset form the training split and samples
    ``[train_max, len(dataset))`` form the validation split; ``mode`` selects
    which of the two ranges is iterated.

    Args:
        dataset: Object providing ``shuffle(seed=...)``, ``len()`` and slice
            indexing (for example a Hugging Face ``datasets.Dataset``).
        batch_size: Maximum number of samples per batch; must be positive.
        collator_fn: Callable applied to every raw slice before it is yielded.
        train_max: Number of leading samples reserved for training. Clamped to
            the dataset length so a small dataset cannot produce empty slices.
        mode: ``"train"`` iterates the training range; any other value
            iterates the validation range.
        seed: Seed forwarded to ``dataset.shuffle``. Loaders built from the
            same dataset with the same seed see the same permutation, which
            keeps the training and validation ranges of separate loader
            instances disjoint.

    Raises:
        ValueError: If ``batch_size`` is not positive.

    Note:
        ``len(loader)`` is the number of *samples* in the active split, not the
        number of batches. ``train()`` / ``validate()`` only switch the mode;
        call ``reset()`` afterwards to move the cursor to the start of the
        newly selected split.
    """

    def __init__(self, dataset, batch_size, collator_fn, train_max = 100000, mode = "train", seed = 42):
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        # A fixed seed makes every loader over the same dataset agree on the
        # permutation, so the split boundary means the same thing everywhere.
        self.dataset = dataset.shuffle(seed = seed)
        self.collator_fn = collator_fn
        self.len = len(self.dataset)
        self.batch_size = batch_size
        self.train_max = min(train_max, self.len)
        self.mode = mode
        self.seed = seed
        self.reset()

    def _bounds(self):
        """Return the ``(start, stop)`` sample range of the active split."""
        if self.mode == "train":
            return 0, self.train_max
        return self.train_max, self.len

    def hasNext(self):
        """Return True while a full batch is still available in the active split."""
        return self.index + self.batch_size <= self._bounds()[1]

    def reset(self):
        """Move the cursor back to the first sample of the active split."""
        self.index = self._bounds()[0]

    def __iter__(self):
        return self

    def __next__(self):
        _, stop = self._bounds()
        if self.index >= stop:
            raise StopIteration

        # Clamp the final batch to the split boundary so a training batch can
        # never spill over into the validation range.
        end = min(self.index + self.batch_size, stop)
        batch = self.collator_fn(self.dataset[self.index: end])
        self.index = end
        return batch

    def __len__(self):
        start, stop = self._bounds()
        return stop - start

    def train(self):
        """Select the training split (does not move the cursor)."""
        self.mode = "train"

    def validate(self):
        """Select the validation split (does not move the cursor)."""
        self.mode = "validation"

In [11]:
def sasta_collator(batch):
    """Turn one raw dataset slice into model-ready tensors.

    Args:
        batch: Mapping with ``'image'``, ``'question'`` and ``'answer'``
            entries, each holding one element per sample.

    Returns:
        Tuple ``(images, questions, labels)``:
        ``images`` is the ``pixel_values`` tensor produced by the image
        processor, ``questions`` is the tokenizer output (requested with an
        attention mask, as PyTorch tensors), and ``labels`` is an int64 tensor
        of encoded answer ids.
    """
    # Image preprocessing; only the pixel tensor is kept.
    images = processor(images = batch['image'], return_tensors="pt")['pixel_values']

    # Tokenize the questions, padding to the longest question in the batch and
    # truncating anything beyond 24 tokens.
    questions = tokenizer(
            text=batch['question'],
            padding='longest',
            max_length=24,
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True,
        )

    # Class ids are integers: build an int64 tensor directly instead of going
    # through the float32 default of the legacy `torch.Tensor(...)` constructor.
    labels = torch.as_tensor(labelEncoder.transform(batch['answer']), dtype=torch.long)
    return (images, questions, labels)


In [12]:
class DinoBertSmall(nn.Module):
    """Classifier over the concatenated first-token outputs of a text and an image encoder.

    Both encoders are loaded with ``AutoModel.from_pretrained``. For each sample
    the position-0 vector of each encoder's ``last_hidden_state`` is taken, the
    two vectors are concatenated, and the result is passed through a
    Linear -> LeakyReLU -> Dropout(0.1) -> Linear head that outputs
    ``num_labels`` logits.

    Args:
        num_labels: Number of output classes; also the width of the hidden
            layer of the head.
        intermediate_dim: Stored on the instance; not used by the layers.
        pretrained_text_name: Checkpoint name for the text encoder.
        pretrained_image_name: Checkpoint name for the image encoder.
        classifier_dim: Stored on the instance; not used by the layers.
    """

    def __init__(
        self,
        num_labels,
        intermediate_dim,
        pretrained_text_name,
        pretrained_image_name,
        classifier_dim = 9024,
    ):
        super().__init__()

        # Keep the constructor arguments available on the instance.
        self.num_labels = num_labels
        self.intermediate_dim = intermediate_dim
        self.pretrained_text_name = pretrained_text_name
        self.pretrained_image_name = pretrained_image_name
        self.classifier_dim = classifier_dim

        # Pretrained backbones (text first, then image).
        self.text_encoder = AutoModel.from_pretrained(pretrained_text_name)
        self.image_encoder = AutoModel.from_pretrained(pretrained_image_name)

        # The two hidden sizes are allowed to differ because the features are
        # concatenated rather than added.
        self.embedd_dim_text = self.text_encoder.config.hidden_size
        self.embedd_dim_img = self.image_encoder.config.hidden_size

        print(self.embedd_dim_text, self.embedd_dim_img)

        # Classification head on top of the concatenated features.
        self.initdim = self.embedd_dim_img + self.embedd_dim_text
        head_layers = [
            nn.Linear(self.initdim, num_labels),
            nn.LeakyReLU(),
            nn.Dropout(p = 0.1),
            nn.Linear(num_labels, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(
        self,
        input_ids,
        pixel_values,
        attention_mask
    ):
        """Return the class logits for a batch of (question, image) pairs."""
        # Text goes through its encoder together with the padding mask.
        text_hidden = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state

        # Images only need the pixel tensor.
        image_hidden = self.image_encoder(
            pixel_values=pixel_values,
        ).last_hidden_state

        # Join the position-0 vectors of both modalities along the feature axis
        # and keep one flat feature row per sample.
        fused = torch.cat([text_hidden[:, 0, :], image_hidden[:, 0, :]], dim = 1)
        fused = fused.view(fused.shape[0], -1)

        return self.classifier(fused)

In [22]:
def save_model(model, name):
    """Write the ``state_dict`` of ``model`` to the path ``name`` with ``torch.save``."""
    weights = model.state_dict()
    torch.save(weights, name)

def initVQA():
    """Build a DinoBertSmall with one output per fitted label class and move it to ``device``."""
    model = DinoBertSmall(len(labelEncoder.classes_), 512, BERT, VIT).to(device)
    return model

def load_model(name, backup = initVQA, frommem = True):
    """Build a model and, when requested, restore its weights from a checkpoint.

    Args:
        name: Path of a checkpoint written by ``save_model`` (a ``state_dict``).
        backup: Zero-argument factory that returns a freshly initialised model.
        frommem: When falsy, skip the checkpoint and return the fresh model.

    Returns:
        The model with the checkpoint weights loaded, or a freshly initialised
        model when loading was skipped or failed.
    """
    model = backup()
    if not frommem:
        print("Initializing from scratch.")
        return model
    try:
        # Load onto the CPU first so a checkpoint saved on a GPU machine can
        # also be restored on a CPU-only one; load_state_dict then copies the
        # values into the model's own parameters.
        state_dict = torch.load(f"{name}", map_location="cpu")
        model.load_state_dict(state_dict)
    except FileNotFoundError:
        print("Couldn't find model. Initializing from scratch.")
        return model
    except Exception as err:
        # Keep the best-effort contract (no crash), but say what actually went
        # wrong and rebuild the model: a failed load_state_dict may already
        # have overwritten some parameters.
        print(f"Couldn't load model ({type(err).__name__}: {err}). Initializing from scratch.")
        return backup()
    print("Loaded model successfully.")
    return model

def print_trainable_parameters(model):
    """Print how many parameters of ``model`` require gradients, the total count, and the ratio in percent."""
    # One (element count, requires_grad) pair per named parameter.
    sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

### Testing the model and tweaking it

In [14]:
# collator_fn = sasta_collator
# loader = SastaLoader(dataset, 1, sasta_collator)
# model = load_model("vqa_dr.pth", frommem = False)

# ids, pxlvalues, masks, labels = None, None, None, None
# for batchid, batch in enumerate(loader):
#     ids = batch[1]['input_ids'].to(device)
#     pxlvalues = batch[0].to(device)
#     masks = batch[1]['attention_mask'].to(device)
#     labels = batch[2].to(device)
#     break
# len(ids), len(pxlvalues), len(masks), len(labels)

In [15]:
# outputs = model(ids, pxlvalues, masks)

## Training

#### Model, optimizer and loss

In [23]:
# Build the model, restore its weights from the checkpoint file, and report
# how many parameters are trainable.
# NOTE(review): the checkpoint path is absolute and specific to a Kaggle input
# mount; it will not exist in other environments.
model = load_model("/kaggle/input/dinobert-small-models/dinobert_small_fully_trained.pth", frommem = True)
print_trainable_parameters(model)

Some weights of RobertaModel were not initialized from the model checkpoint at smallbenchnlp/roberta-small and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


256 384
Loaded model successfully.
trainable params: 135352192 || all params: 135352192 || trainable%: 100.00


#### Hyperparams

In [None]:
# Training configuration: data loader, epoch count, optimizer, loss and
# learning-rate schedule.
# NOTE(review): `collator_fn` is assigned but not used in this cell; the
# loader on the next line is given `sasta_collator` directly.
collator_fn = sasta_collator
# Batch size 64; mode defaults to "train".
loader = SastaLoader(dataset, 64, sasta_collator)
num_epochs = 10
optimizer = optim.Adam(model.parameters(), lr=5e-4)
# NOTE(review): `class_weights` is computed earlier in the notebook but is not
# passed to the loss here — confirm that unweighted cross-entropy is intended.
criterion = nn.CrossEntropyLoss()
# StepLR with step_size=2 and gamma=0.2; train() calls scheduler.step() once per epoch.
scheduler = StepLR(optimizer, step_size=2, gamma=0.2)

In [None]:
def train(model, optimizer, criterion, scheduler, loader, num_epochs, device,
          checkpoint_path = "vqa_dr.pth", save_every = 16000):
    """Train ``model`` for ``num_epochs`` epochs and plot loss and accuracy per epoch.

    Args:
        model: Module called as ``model(input_ids, pixel_values, attention_mask)``
            that returns class logits.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``. The running
            epoch loss assumes it returns a per-batch mean — TODO confirm when
            using a non-default reduction.
        scheduler: Learning-rate scheduler stepped once per epoch.
        loader: Iterable yielding ``(images, questions, labels)`` batches that
            also provides ``reset()`` and ``len()`` (the number of samples).
        num_epochs: Number of passes over the loader.
        device: Device the batch tensors are moved to.
        checkpoint_path: Where ``save_model`` writes the weights.
        save_every: Write an intermediate checkpoint after every
            ``save_every`` batches; a falsy value disables it. A checkpoint is
            always written at the end of each epoch.

    Raises:
        ValueError: If the loader yields no samples in an epoch.
    """
    loss_plot, accuracy_plot = [], []
    for epoch in range(num_epochs):
        model.train()
        # Start every epoch from the beginning of the split, even when the
        # loader was partially consumed before train() was called.
        loader.reset()
        total_loss = 0.0
        correct = 0
        total_samples = 0
        with tqdm(total=len(loader), desc="Processing batches", dynamic_ncols=True) as pbar:
            for batchidx, batch in enumerate(loader):
                ids = batch[1]['input_ids'].to(device)
                pxlvalues = batch[0].to(device)
                masks = batch[1]['attention_mask'].to(device)
                labels = batch[2].to(device).long()
                # The final batch can be smaller than loader.batch_size, so
                # all bookkeeping uses the real batch size.
                batch_samples = labels.size(0)

                optimizer.zero_grad()
                outputs = model(ids, pxlvalues, masks)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                total_loss += loss.item() * batch_samples
                predicted = outputs.argmax(dim=1)
                correct += (predicted == labels).sum().item()
                total_samples += batch_samples
                pbar.update(batch_samples)
                # Periodic safety checkpoint inside long epochs.
                if save_every and (batchidx + 1) % save_every == 0:
                    save_model(model, checkpoint_path)

        if total_samples == 0:
            raise ValueError("loader produced no samples; nothing to train on")

        epoch_loss = total_loss / total_samples
        accuracy = correct / total_samples
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.4f}")
        accuracy_plot.append(accuracy * 100)
        loss_plot.append(epoch_loss)
        save_model(model, checkpoint_path)
        scheduler.step()
        loader.reset()

    # Loss and accuracy live on different scales, so each gets its own axes.
    epochs = range(1, len(loss_plot) + 1)
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(10, 4))
    loss_ax.plot(epochs, loss_plot)
    loss_ax.set(title="Training loss", xlabel="Epoch", ylabel="Loss")
    acc_ax.plot(epochs, accuracy_plot)
    acc_ax.set(title="Training accuracy", xlabel="Epoch", ylabel="Accuracy (%)")
    fig.tight_layout()
    plt.show()

In [None]:
# Run the training loop with the model, optimizer, loss, scheduler and loader
# configured in the hyperparameter cell.
train(model, optimizer, criterion, scheduler, loader, num_epochs, device)

In [24]:
# Loader over the validation range of the dataset (samples from train_max to
# the end), with batch size 16.
# NOTE(review): SastaLoader shuffles the dataset in its constructor, so this
# instance only sees samples disjoint from the training loader's range if both
# shuffles produce the same permutation — confirm the shuffle is seeded.
validation_loader = SastaLoader(dataset, 16, sasta_collator, mode = "validation")

In [25]:
def evaluate_model(model, loader, device):
    """Run ``model`` over ``loader`` and print weighted F1 and accuracy.

    Args:
        model: Module called as ``model(input_ids, pixel_values, attention_mask)``
            that returns class logits.
        loader: Iterable yielding ``(images, questions, labels)`` batches that
            supports ``len()`` (the number of samples). If it has a ``reset()``
            method, it is called first so repeated evaluations start from the
            beginning of the split.
        device: Device the model inputs are moved to.

    Returns:
        Tuple ``(y_pred, y_true)`` of lists holding one 0-d integer tensor per
        sample (predictions first). Both lists are empty when the loader
        yields no batches.
    """
    model.eval()
    reset = getattr(loader, "reset", None)
    if callable(reset):
        reset()

    true_batches, pred_batches = [], []
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad(), tqdm(total=len(loader), desc="Processing batches", dynamic_ncols=True) as pbar:
        for batch in loader:
            ids = batch[1]['input_ids'].to(device)
            pxlvalues = batch[0].to(device)
            masks = batch[1]['attention_mask'].to(device)
            labels = batch[2].long().to("cpu")
            outputs = model(ids, pxlvalues, masks)
            predicted = outputs.argmax(dim=1).to("cpu")
            true_batches.append(labels)
            pred_batches.append(predicted)
            # Advance by the real batch size so the bar ends exactly at total.
            pbar.update(labels.size(0))

    if not true_batches:
        print("No batches to evaluate.")
        return [], []

    true_all = torch.cat(true_batches)
    pred_all = torch.cat(pred_batches)
    # Hand plain integer arrays to scikit-learn rather than lists of tensors.
    f1 = f1_score(true_all.numpy(), pred_all.numpy(), average = "weighted")
    accuracy = accuracy_score(true_all.numpy(), pred_all.numpy())
    print(f"F1-score: {f1 : 0.2f}")
    print(f"Accuracy: {accuracy * 100 : 0.2f}%")
    # Keep the original element type (0-d tensors) so callers can use `.item()`.
    return list(pred_all), list(true_all)

In [26]:
# Evaluate the loaded model on the validation loader; keeps the per-sample
# predictions and ground-truth labels for the analysis below.
y_pred, y_true = evaluate_model(model, validation_loader, device)

Processing batches: 9488it [02:13, 71.17it/s]                          


F1-score:  0.10
Accuracy:  19.14%


In [30]:
# print(classification_report(y_true, y_pred))

In [27]:
# Tally how often each class id appears among the predictions.
label_dict = {}
for prediction in y_pred:
    class_id = prediction.item()
    label_dict[class_id] = label_dict.get(class_id, 0) + 1

# Print every predicted answer, decoded back to its string, with its count.
for class_id, count in label_dict.items():
    print(f"{labelEncoder.inverse_transform([class_id])} : {count}")

['yes'] : 6714
['no'] : 2771
