In [1]:
import torch
from torch.utils.data import DataLoader

# Prefer the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device  # bare last expression so the notebook displays the selected device

'cuda'

**Dataset and Dataloader**

In [2]:
import sys
from torch.utils.data import random_split
# Make the parent directory importable; this has to run before the project import below.
sys.path.append('../')  # Adjust the path accordingly
from VQA_Dataset_BLIP import VQA_Dataset

# Build the project dataset wrapper and load its samples.
# NOTE(review): the meaning of `preprocess`, `length` and `device` is defined inside
# VQA_Dataset_BLIP, which is not visible from this notebook — confirm before changing them.
dataset = VQA_Dataset()
dataset.load_all(preprocess=None, length=4000, device=device)

Preprocessing Images:  97%|█████████▋| 292/300 [00:03<00:00, 112.08it/s]

In [3]:
# Peek at field 3 of the first sample (the field this notebook later uses as the answer text).
dataset[0][3]

'yes'

In [4]:
#Create pytorch dataset
from torch.utils.data import Dataset, DataLoader

class VQADataset(Dataset):
    """Torch ``Dataset`` that turns raw VQA samples into BLIP-style model inputs.

    Every item of the wrapped ``dataset`` is read positionally: field 0 is the image
    (it must expose ``.convert('RGB')``), field 1 is the question text and field 3 is
    the answer text.

    Args:
        dataset: Sized, indexable collection of raw samples.
        text_processor: Callable invoked as ``text_processor(None, question, ...)``; it
            must also expose ``.tokenizer.encode`` which is used for the answer.
        image_processor: Callable invoked on the RGB image; its result must provide
            ``"pixel_values"`` with a leading batch axis.
        max_length: Fixed token length used to pad/truncate questions and answers.
        image_height: Target image height passed to the image processor.
        image_width: Target image width passed to the image processor.
    """

    def __init__(self, dataset, text_processor, image_processor,
                 max_length=32, image_height=256, image_width=256):
        self.dataset = dataset
        self.text_processor = text_processor
        self.image_processor = image_processor
        self.max_length = max_length
        self.image_height = image_height
        self.image_width = image_width

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Build the model inputs for sample ``idx``.

        Returns:
            The text processor's encoding for the question (batch axis removed) with two
            extra entries: ``"pixel_values"`` (the processed image) and ``"labels"``
            (the answer token ids padded/truncated to ``max_length``).
        """
        # Index the underlying dataset once: indexing may be expensive, and a single
        # fetch guarantees image, question and answer come from the same sample.
        sample = self.dataset[idx]
        image = sample[0].convert('RGB')
        question = sample[1]
        answer = sample[3]

        image_encoding = self.image_processor(
            image,
            do_resize=True,
            size=(self.image_height, self.image_width),
            return_tensors="pt",
        )

        encoding = self.text_processor(
            None,
            question,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Remove only the leading batch axis. A bare squeeze() would also collapse any
        # other size-1 axis (e.g. when max_length == 1) and yield 0-dim tensors.
        for key, value in list(encoding.items()):
            encoding[key] = value.squeeze(0)
        encoding["pixel_values"] = image_encoding["pixel_values"][0]

        # Answers are tokenized to the same fixed length so they can be stacked per batch.
        encoding["labels"] = self.text_processor.tokenizer.encode(
            answer,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors='pt',
        )[0]

        return encoding

In [5]:
#Load processor
from PIL import Image
import requests
from transformers import BlipProcessor, BlipImageProcessor, BlipForQuestionAnswering

# Both processors are loaded from the same pretrained checkpoint name. In this notebook the
# BlipProcessor handles the question/answer text and the BlipImageProcessor handles the images.
text_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
image_processor = BlipImageProcessor.from_pretrained("Salesforce/blip-vqa-base")

In [6]:
# Wrap the raw samples and carve out reproducible (seeded) train/val/test subsets.
dataset_torch = VQADataset(dataset, text_processor, image_processor)

total_samples = len(dataset_torch)
train_size = int(total_samples*0.8)
val_size = int(total_samples*0.1)
# Whatever is left after the train and validation shares becomes the test split.
test_size = int(total_samples) - (train_size + val_size)
generator = torch.Generator().manual_seed(1)
split_lengths = [train_size, val_size, test_size]
train_dataset, val_dataset, test_dataset = random_split(
    dataset_torch, split_lengths, generator=generator
)
print("Train size: ", train_size)
print("Test size: ", test_size)

train_dataset[0]

Train size:  240
Test size:  30


{'input_ids': tensor([ 101, 2515, 2009, 4025, 2066, 2016, 2003, 3403, 2005, 2060, 2111, 2000,
        2272, 1029,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]), 'pixel_values': tensor([[[ 1.3172,  1.3172,  1.3172,  ...,  1.2880,  0.5143,  1.2442],
         [ 1.3172,  1.3172,  1.3172,  ...,  0.7041,  0.4267,  1.3610],
         [ 1.3172,  1.3172,  1.3172,  ...,  0.1493,  1.2150,  1.3318],
         ...,
         [-0.8288, -0.8288, -0.8288,  ..., -0.3470, -0.3470, -0.3324],
         [-0.8288, -0.8288, -0.8288,  ..., -0.5368, -0.5660, -0.5660],
         [-0.8288, -0.8288, -0.8288,  ..., -1.4711, -1.4711, -1.6025]],

        [[ 2.0149,  2.0149,  2.0149,  ...,  1.9698,  1.0093,  1.9248],
         [ 2.0149,  2.0149,  2.0149,  ...,  1.2645,  0.9343,  2.0449],
         [ 2.0149,  2.0149,  2.0149,

In [7]:
#Dataloader
def collate_fn(batch):
    """Stack per-sample tensors into one batched tensor per field.

    Args:
        batch: Sequence of mapping-like samples, each providing the tensors
            'input_ids', 'attention_mask', 'pixel_values' and 'labels'; for a given
            field all samples must share one shape so they can be stacked.

    Returns:
        dict with those four keys, each holding the samples stacked along a new
        leading axis. Any other keys present on the samples are dropped.
    """
    field_order = ('input_ids', 'attention_mask', 'pixel_values', 'labels')
    return {
        field: torch.stack([sample[field] for sample in batch])
        for field in field_order
    }

batch_size = 2
# Reshuffle the training samples every epoch so the model does not see the batches in the
# same fixed order each time; the seeded generator keeps the shuffling reproducible.
train_dataloader = DataLoader(train_dataset,
                              collate_fn=collate_fn,
                              batch_size=batch_size,
                              shuffle=True,
                              generator=torch.Generator().manual_seed(1))
# Validation order does not affect the metric, so it stays deterministic.
val_dataloader = DataLoader(val_dataset,
                            collate_fn=collate_fn,
                            batch_size=batch_size,
                            shuffle=False)

In [8]:
#Testing
import numpy as np
import matplotlib.pyplot as plt

# Smoke test: pull one batch from the training loader and list each field's tensor shape.
batch = next(iter(train_dataloader))
for field_name in batch:
    print(field_name, batch[field_name].shape)

# batch_idx = 1
# image_mean = image_processor.image_mean
# image_std = image_processor.image_std
# unnormalized_image = (batch["pixel_values"][batch_idx].cpu().numpy() * np.array(image_std)[:, None, None]) + np.array(image_mean)[:, None, None]
# unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
# unnormalized_image = (unnormalized_image * 255).astype(np.uint8)

# print("Question: ",text_processor.decode(batch["input_ids"][batch_idx]))
# print("Answer: ",text_processor.decode(batch["labels"][batch_idx]))
# plt.imshow(Image.fromarray(unnormalized_image))

input_ids torch.Size([32, 32])
attention_mask torch.Size([32, 32])
pixel_values torch.Size([32, 3, 256, 256])
labels torch.Size([32, 32])


**Train the model**

In [9]:
from torch.utils.tensorboard import SummaryWriter
import os 

#Tensorboard
# Name of this run; it is used as the leaf folder for the TensorBoard logs.
currentModelIteration = "5e-5_4k"
folder_path = os.path.join("runs", "trainingsBLIP", currentModelIteration)
# exist_ok=True creates the directory without a separate existence check (no
# check-then-create race) and is a no-op when the directory is already there.
os.makedirs(folder_path, exist_ok=True)

model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
writer = SummaryWriter(folder_path)

In [10]:
def train(dataloader, vqa_model, optimizer, clip_value, epoch):
    """Run one training epoch and periodically log the running mean loss.

    Relies on two notebook-level globals: ``device`` (where the batch tensors are moved)
    and ``writer`` (the TensorBoard ``SummaryWriter``).

    Args:
        dataloader: Iterable of dict batches whose values are tensors; must expose a
            sized ``.dataset`` attribute.
        vqa_model: Model called as ``vqa_model(**batch)``; the output must expose ``.loss``.
        optimizer: Optimizer over ``vqa_model``'s parameters.
        clip_value: Maximum total gradient norm applied before each optimizer step.
        epoch: Zero-based epoch index, used to compute the TensorBoard step.

    Raises:
        RuntimeError: From ``clip_grad_norm_`` when the total gradient norm is
            non-finite (``error_if_nonfinite=True``).
    """
    size = len(dataloader.dataset)
    vqa_model.train()
    running_loss, samples_seen = 0.0, 0

    for batch_i, batch in enumerate(dataloader):
        # Move every tensor of the batch to the target device.
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()
        outputs = vqa_model(**batch)

        # Backpropagation
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(vqa_model.parameters(), max_norm=clip_value, error_if_nonfinite=True)
        optimizer.step()

        # Accumulate a detached value: summing the live loss tensor would keep every
        # step's autograd history reachable for the whole epoch.
        running_loss = running_loss + loss.detach()
        # Batch size is taken from the data itself so a smaller final batch is counted correctly.
        samples_seen += len(next(iter(batch.values()), ()))

        # Plotting results (batches 1, 51, 101, ...)
        if batch_i % 50 == 1:
            # batch_i is zero-based, so batch_i + 1 batches contributed to running_loss.
            writer.add_scalar('Loss/training', float(running_loss) / (batch_i + 1), epoch * size + batch_i)
            print("loss: ", loss.item(), samples_seen, size)

In [11]:
def eval(dataloader, model, epoch):
    """Measure exact-match answer accuracy of ``model`` over ``dataloader``.

    Relies on the notebook-level globals ``device``, ``text_processor`` (used to decode
    token ids back to strings) and ``writer`` (TensorBoard ``SummaryWriter``).
    NOTE: the name shadows Python's built-in ``eval``; it is kept because the rest of
    the notebook calls it under this name.

    Args:
        dataloader: Iterable of dict batches with tensor values, including ``'labels'``;
            must expose a sized ``.dataset`` attribute.
        model: Model exposing ``.eval()`` and ``.generate(**inputs)``.
        epoch: Step under which the accuracy is logged.

    Returns:
        Accuracy as a percentage in [0, 100]; 0.0 when the dataset is empty.
    """
    size = len(dataloader.dataset)
    model.eval()
    correct = 0
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}

            # Labels are the targets, not generation inputs: keep them out of generate()
            # rather than relying on it to tolerate an extra keyword argument.
            generation_inputs = {k: v for k, v in batch.items() if k != 'labels'}
            outputs = model.generate(**generation_inputs)

            predicted_answers = text_processor.batch_decode(outputs, skip_special_tokens=True)
            correct_answers = text_processor.batch_decode(batch['labels'], skip_special_tokens=True)
            # A prediction only counts when the decoded strings match exactly.
            correct += sum(
                pred_answer == correct_answer
                for pred_answer, correct_answer in zip(predicted_answers, correct_answers)
            )

    # Guard against an empty dataset instead of dividing by zero.
    correct = correct / size if size else 0.0

    # Plotting results
    writer.add_scalar('Accuracy/test', correct*100, epoch)
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%")
    return 100*correct

In [12]:
#Hyperparameters and optim
import torch

clip_value = 1.0  # maximum gradient norm used for clipping in train()
learning_rate = 5e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# Train on the GPU when one is available. The previous expression yielded "cpu" on
# both branches, so the model always ran on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

BlipForQuestionAnswering(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-05, e

In [13]:
#Early stopping parameters
n_epochs = 50  # upper bound on the number of training epochs
early_stop_threshhold = 5  # training stops once more than this many epochs have passed since the best one
best_accuracy = -1  # starts below any real accuracy, so the first evaluation becomes the best
best_epoch = -1  # epoch index at which best_accuracy was reached; -1 until the first evaluation

def checkpoint(model, filename):
    """Save ``model``'s state dict under ``runs/best_model/<currentModelIteration>/``.

    Uses the notebook-level global ``currentModelIteration`` as the run folder name.

    Args:
        model: Module whose ``state_dict()`` is saved.
        filename: File name (inside the run folder) to write the state dict to.
    """
    folder_path = os.path.join("runs", "best_model", currentModelIteration)
    # exist_ok=True creates the folder without a separate existence check (no
    # check-then-create race) and is a no-op when the folder already exists.
    os.makedirs(folder_path, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(folder_path, filename))
    
def resume(model, filename):
    """Load a saved state dict from disk into ``model`` (in place).

    The file is looked up first in ``runs/best_model/<currentModelIteration>/`` — the
    folder ``checkpoint()`` writes to — and, if it is not there, in the legacy
    ``runs/checkpoint_SolvingCropping/`` folder.

    Args:
        model: Module that receives the weights via ``load_state_dict``.
        filename: Name of the saved state-dict file.

    Raises:
        FileNotFoundError: If the file exists in neither location.
    """
    candidates = (
        os.path.join("runs", "best_model", currentModelIteration, filename),
        os.path.join("runs", "checkpoint_SolvingCropping", filename),
    )
    # Fall back to the last candidate so a missing file still raises on that path.
    path = next((p for p in candidates if os.path.exists(p)), candidates[-1])
    # Load onto the CPU so a file saved from a GPU run can be read on a CPU-only machine;
    # load_state_dict then copies the values into the model's existing parameters.
    model.load_state_dict(torch.load(path, map_location="cpu"))

In [14]:
# Train for up to n_epochs, keeping the weights of the best validation accuracy and
# stopping early once no improvement has been seen for more than early_stop_threshhold epochs.
try:
    for epoch in range(n_epochs):
        print(f"Epoch {epoch+1}\n-------------------------------")
        train(train_dataloader, model, optimizer, clip_value, epoch)
        acc = eval(val_dataloader, model, epoch)
        if acc>best_accuracy:
            # New best: remember it and persist the weights.
            best_accuracy = acc
            best_epoch = epoch
            checkpoint(model, "best_model.pth")
        elif (epoch-best_epoch) > early_stop_threshhold:
            print("--Early stopped training--")
            break
finally:
    # Write any buffered TensorBoard events to disk, even if training was interrupted.
    writer.flush()

Epoch 1
-------------------------------


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


: 