## Load the dataset


In [1]:
from VQA_Datasetv2 import VQA_Dataset, VQA_Dataset_preloaded, VQA_Dataset_Sentences
import clip
import torch
from torch.utils.data import DataLoader
from torch.utils.data import random_split

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# clip.load yields two objects, bound here as the model and `preprocess`;
# the class name of `preprocess` is printed for reference.
model, preprocess = clip.load("ViT-B/32", device=device)
print(type(preprocess).__name__)

# Cast the model to float32.
# NOTE(review): presumably needed because the weights are not float32 after
# loading on GPU - confirm against the CLIP documentation.
model.to(torch.float32)
print("clip model loaded")

Using cuda device
Compose
clip model loaded


In [None]:
# Print the loaded CLIP model so its structure can be inspected.
print(model)

## Choose one of the three loading options:

### 1.Normal loading

In [None]:
# Loading mode selector:
#   0 -> dataset.load_preprocess(...)
#   1 -> dataset.load(...)
#   2 -> dataset.load_encode(...) followed by dataset.save("")
#   3 -> dataset.load_saved("")
loading = 1

# Always create the dataset object. Mode 3 also calls a method on `dataset`,
# so creating it only for `loading < 3` raised a NameError on a fresh kernel.
# NOTE(review): assumes VQA_Dataset_Sentences provides load_saved - confirm.
dataset = VQA_Dataset_Sentences()

if loading == 0:
    dataset.load_preprocess(preprocess, device, length=30000)
elif loading == 1:
    dataset.load(preprocess, device, length=4)
elif loading == 2:
    dataset.load_encode(preprocess, device, model, length=100)
    dataset.save("")
elif loading == 3:
    dataset.load_saved("")
else:
    # Fail loudly instead of silently leaving the dataset unloaded.
    raise ValueError(f"Unknown loading mode: {loading!r} (expected 0, 1, 2 or 3)")

In [3]:
# Create separate dataset objects for the training and validation splits.
dataset_train = VQA_Dataset_Sentences()
dataset_val = VQA_Dataset_Sentences()
# Load each split; the second call selects the split with name="val".
# NOTE(review): the recorded output of this cell is a JSONDecodeError, so one
# of the underlying JSON files is presumably malformed - verify before relying
# on this cell.
dataset_train.load(preprocess, device, length=228841)
dataset_val.load(preprocess, device, name="val", length=37501)

JSONDecodeError: Expecting ',' delimiter: line 37502 column 1 (char 31333498)

### 2.Precalc

In [14]:
# Create the dataset objects for the precalculated variant of both splits.
dataset_train = VQA_Dataset_preloaded()
dataset_val = VQA_Dataset_preloaded()
# Run compute_store for 10000 samples per split with the "scale_10000_" prefix
# and mode="scale". The trailing comments keep alternative (larger) lengths.
# NOTE(review): presumably this encodes the samples with the CLIP model and
# stores the result on disk - confirm in VQA_Datasetv2.
dataset_train.compute_store(preprocess, model, device, "scale_10000_", length=10000, mode="scale")# length=248348)
dataset_val.compute_store(preprocess, model, device, "scale_10000_", name="val", length=10000, mode="scale") # length=121511 )

Preprocessing Images: 100%|██████████| 10000/10000 [12:30<00:00, 13.32it/s]
Preprocessing Images: 100%|██████████| 10000/10000 [09:52<00:00, 16.87it/s]


### 3.Only load

In [2]:
# Create the dataset objects and load both splits using the "full_" prefix.
dataset_train = VQA_Dataset_preloaded()
dataset_val = VQA_Dataset_preloaded()
# The trailing comments keep alternative (larger) lengths for each split.
# NOTE(review): presumably "full_" refers to data written by an earlier
# compute_store run - confirm the files exist before running.
dataset_train.load("full_", device, length=32755)# 248348)
dataset_val.load("full_", device, name="val", length=37501)#121511)

### Create Dataloader

## New with both Validation and Training Datasets from MSCOCO

In [3]:
batch_size = 2

# The training split is used in full; the validation split is divided into a
# 30% validation part and a test part holding the remainder.
train_size = len(dataset_train)
val_size = int(0.3 * len(dataset_val))
test_size = len(dataset_val) - val_size
print("Train size: ", train_size)
print("Test size: ", test_size)
print("Val size: ", val_size)

# A fixed seed keeps the test/validation partition the same between runs.
generator = torch.Generator().manual_seed(42)
test_dataset, val_dataset = random_split(
    dataset_val,
    [test_size, val_size],
    generator=generator,
)

# One shuffling loader per subset, all with the same batch size.
train_dataloader, test_dataloader, val_dataloader = (
    DataLoader(subset, batch_size=batch_size, shuffle=True)
    for subset in (dataset_train, test_dataset, val_dataset)
)

Train size:  32755
Test size:  26251
Val size:  11250


### Old

In [3]:
batch_size = 2

# Single-dataset split: 80% for train+val and the rest for test; the training
# part is 80% of that 80%, and validation takes what is left of the first part.
train_val_size = int(len(dataset) * 0.8)
train_size = int(len(dataset) * 0.8 * 0.8)
val_size = train_val_size - train_size
test_size = len(dataset) - train_val_size
print("Train size: ", train_size)
print("Test size: ", test_size)
print("Val size: ", val_size)

# A fixed seed keeps the partition the same between runs.
generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset, val_dataset = random_split(
    dataset,
    [train_size, test_size, val_size],
    generator=generator,
)

# One shuffling loader per subset, all with the same batch size.
train_dataloader, test_dataloader, val_dataloader = (
    DataLoader(subset, batch_size=batch_size, shuffle=True)
    for subset in (train_dataset, test_dataset, val_dataset)
)

Train size:  19200
Test size:  6000
Val size:  4800


## Simple model architecture


In [4]:
from models import VQA_Model3, VQA_Model2, VQA_Model_Precalc, VQA_Model4, VQA_Model_Blip, VQA_Model_Precalc_Zero, VQA_Model_Precalc_Text, VQA_Model_classify, VQA_Model_classify_v2, VQA_Model_classify_v3, VQA_Model4_Precalc



## Evaluate

In [5]:
import tqdm

def evaluate(model, dataloader, device, test_size, show_progress=False, score_each=False):
    """Count correct predictions of ``model`` over ``dataloader``.

    The model is switched to eval mode and left in that mode on return.

    Args:
        model: Callable as ``model(image, question_tokens, answer_tokens)``.
        dataloader: Iterable of batches exposing ``batch_size``. Each batch is
            indexed as ``[0]`` image, ``[1]`` answer tokens, ``[2]`` question
            tokens and ``[3]`` index of the correct answer.
        device: Device the batch tensors are moved to.
        test_size: Denominator of the returned accuracy; the caller must pass
            the number of samples the dataloader yields.
        show_progress: Wrap the dataloader in a tqdm progress bar.
        score_each: Call the model once per answer (indexing dim 1 of the
            answer tokens) and collect the scores, instead of one call with
            the full answer tensor.

    Returns:
        ``correct / test_size``.
    """
    model.eval()
    batches = tqdm.tqdm(dataloader) if show_progress else dataloader

    n_correct = 0
    for batch in batches:
        image = batch[0].to(device)
        answer_tokens = batch[1].squeeze(0).to(device)
        if dataloader.batch_size == 1:
            # A single sample carries one label and a leading batch dimension.
            labels = torch.tensor([int(batch[3])]).to(device)
            question_tokens = batch[2].squeeze(0).to(device)
        else:
            labels = torch.tensor([int(label) for label in batch[3]]).to(device)
            question_tokens = batch[2].squeeze(1).to(device)

        with torch.no_grad():
            if not score_each:
                scores = model(image, question_tokens, answer_tokens)
            else:
                # Score every answer separately rather than the batch at once.
                if answer_tokens.dim() == 2:
                    answer_tokens = answer_tokens.unsqueeze(0)
                n_rows, n_answers = answer_tokens.shape[0], answer_tokens.shape[1]
                scores = torch.zeros((n_rows, n_answers)).to(device)
                for idx in range(n_answers):
                    scores[:, idx] = model(image, question_tokens, answer_tokens[:, idx]).squeeze(1)

            # The prediction is the highest-scoring answer index.
            predicted = scores.argmax(dim=-1)
            n_correct += (predicted == labels).sum().item()

    print(f"Correct: {n_correct}, Total: {test_size}")
    return n_correct / test_size


In [6]:
# Build VQA_Model_Precalc_Text around the loaded CLIP model and measure its
# accuracy on the test loader, with a progress bar.
# NOTE(review): no training precedes this cell, so this is presumably the
# zero-shot baseline - confirm in models.py.
combined_model = VQA_Model_Precalc_Text(model, device)
evaluate(combined_model, test_dataloader, device, test_size, show_progress=True)

100%|██████████| 42529/42529 [00:50<00:00, 846.64it/s]

Correct: 13649, Total: 85058





0.16046697547555785

## Training

In [13]:

from torch.optim import lr_scheduler

def train(model, train_dataloader, val_dataloader, device, test_size, epochs=10, patience=4, precalculated=False, save_name="model", loss_fn = torch.nn.CrossEntropyLoss()):
    """Train ``model`` with Adam and a StepLR schedule, validating every epoch.

    With ``CrossEntropyLoss`` the model is called once per batch with the full
    answer tensor and optimised against the index of the correct answer. With
    any other loss the model is called once per answer (dim 1 of the answer
    tokens) and optimised against a 0/1 target marking the correct answer,
    taking one optimiser step per answer.

    After each epoch the model is scored with ``evaluate`` on
    ``val_dataloader``. ``model.save(save_name)`` is called whenever the
    accuracy exceeds the best value seen so far. Training stops early once the
    accuracy has failed to exceed the previous epoch's accuracy ``patience``
    times in a row.

    Args:
        model: Module callable as ``model(image, question_tokens, answer_tokens)``
            that also provides ``save(name)``.
        train_dataloader: Training batches; must expose ``batch_size``. Each
            batch is indexed as ``[0]`` image, ``[1]`` answer tokens, ``[2]``
            question tokens and ``[3]`` index of the correct answer.
        val_dataloader: Batches used for the per-epoch validation.
        device: Device the batch tensors are moved to.
        test_size: Number of samples in ``val_dataloader``; passed through to
            ``evaluate`` as the accuracy denominator.
        epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs before stopping.
        precalculated: Unused; kept for interface compatibility.
        save_name: Name passed to ``model.save``.
        loss_fn: Loss module; its class name selects the training branch.

    Returns:
        None.
    """
    print(device)
    # Report the number of trainable parameters (the bare generator repr that
    # was printed before carried no information).
    n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable parameters: {n_trainable}")

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    # Multiply the learning rate by 0.1 every 2 epochs.
    scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
    use_cross_entropy = loss_fn._get_name() == 'CrossEntropyLoss'

    def optimisation_step(loss):
        """Backpropagate ``loss``, clip the gradient norm and update the weights."""
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=4.0, error_if_nonfinite=True)
        optimizer.step()

    prev_acc = 0.0
    best_acc = 0.0
    count = 0
    loss = None  # stays None only if the training dataloader yields no batch
    for epoch in range(epochs):
        # evaluate() leaves the model in eval mode, so training mode has to be
        # re-enabled at the start of every epoch, not just once up front.
        model.train()
        inner_bar = tqdm.tqdm(train_dataloader, desc='Batch', colour='green')

        for data in inner_bar:
            image = data[0].to(device)
            answer_tokens = data[1].squeeze(0).to(device)
            if train_dataloader.batch_size == 1:
                # NOTE(review): this builds a 0-dim target whereas evaluate()
                # builds a 1-element vector - confirm which shape the model
                # output needs for batch_size == 1.
                correct_answer = torch.tensor(int(data[3])).to(device)
                question_tokens = data[2].squeeze(0).to(device)
            else:
                correct_answer = torch.tensor([int(x) for x in data[3]]).to(device)
                question_tokens = data[2].squeeze(1).to(device)

            if use_cross_entropy:
                similarity = model(image, question_tokens, answer_tokens)
                loss = loss_fn(similarity, correct_answer)
                optimisation_step(loss)
            else:
                for answer in range(answer_tokens.shape[1]):
                    similarity = model(image, question_tokens, answer_tokens[:, answer]).squeeze(1)
                    # Target is 1 where this answer is the correct one, else 0.
                    correct_similarity = torch.zeros(similarity.shape[0]).to(device)
                    correct_similarity[correct_answer == answer] = 1
                    loss = loss_fn(similarity, correct_similarity)
                    optimisation_step(loss)
        # Close the bar here so it is also released when early stopping breaks
        # out of the epoch loop below.
        inner_bar.close()

        scheduler.step()
        acc = evaluate(model, val_dataloader, device, test_size, score_each=not use_cross_entropy)
        if acc > best_acc:
            best_acc = acc
            # Keep the checkpoint with the best validation accuracy so far.
            model.save(save_name)
        # The patience counter tracks consecutive epochs that did not beat the
        # previous epoch (not the best epoch).
        if acc <= prev_acc:
            count += 1
        else:
            count = 0
        prev_acc = acc
        # The reported loss is that of the last optimisation step of the epoch.
        last_loss = loss.item() if loss is not None else float("nan")
        print(f"Epoch {epoch} loss: {last_loss}, acc: {acc}, patience: {count}")
        if count == patience:
            print("early stopping")
            break

In [14]:
import time

# Build the VQA model around the loaded CLIP model.
trained_model = VQA_Model4_Precalc(model, device)

# Freeze every parameter of the wrapped `model` attribute (the CLIP model) so
# that it receives no gradient updates.
trained_model.model.requires_grad_(False)

# Accuracy on the test loader before any training.
evaluate(trained_model, test_dataloader, device, test_size, show_progress=True)

# Train with per-epoch validation on the validation loader.
train(
    trained_model,
    train_dataloader,
    val_dataloader,
    device,
    val_size,
    epochs=400,
    save_name="model_4_sentences_prec_adaptiveLr",
)

# Accuracy on the test loader after training.
evaluate(trained_model, test_dataloader, device, test_size)

  0%|          | 0/13126 [00:00<?, ?it/s]

100%|██████████| 13126/13126 [00:15<00:00, 822.31it/s]


Correct: 2505, Total: 26251
cuda
<generator object Module.parameters at 0x000002612F391E70>


Batch: 100%|[32m██████████[0m| 16378/16378 [00:51<00:00, 317.50it/s]


Correct: 3752, Total: 11250
Epoch 0 loss: 0.8944920897483826, acc: 0.3335111111111111, patience: 0


Batch: 100%|[32m██████████[0m| 16378/16378 [00:52<00:00, 310.60it/s]


Correct: 4062, Total: 11250
Epoch 1 loss: 2.233835220336914, acc: 0.36106666666666665, patience: 0


Batch: 100%|[32m██████████[0m| 16378/16378 [00:52<00:00, 313.33it/s]


Correct: 4271, Total: 11250
Epoch 2 loss: 0.27676695585250854, acc: 0.37964444444444445, patience: 0


Batch: 100%|[32m██████████[0m| 16378/16378 [00:52<00:00, 311.26it/s]


Correct: 4371, Total: 11250
Epoch 3 loss: 1.6076503992080688, acc: 0.38853333333333334, patience: 0


Batch: 100%|[32m██████████[0m| 16378/16378 [00:52<00:00, 312.10it/s]


Correct: 4383, Total: 11250
Epoch 4 loss: 3.1190197467803955, acc: 0.3896, patience: 0


Batch: 100%|[32m██████████[0m| 16378/16378 [00:52<00:00, 314.20it/s]


Correct: 4399, Total: 11250
Epoch 5 loss: 0.15594035387039185, acc: 0.3910222222222222, patience: 0


Batch: 100%|[32m██████████[0m| 16378/16378 [00:52<00:00, 313.07it/s]


Correct: 4393, Total: 11250
Epoch 6 loss: 1.806797742843628, acc: 0.3904888888888889, patience: 1


Batch: 100%|[32m██████████[0m| 16378/16378 [00:52<00:00, 313.45it/s]


Correct: 4394, Total: 11250
Epoch 7 loss: 1.5958313941955566, acc: 0.3905777777777778, patience: 0


Batch: 100%|[32m██████████[0m| 16378/16378 [00:52<00:00, 312.72it/s]


Correct: 4393, Total: 11250
Epoch 8 loss: 1.0123833417892456, acc: 0.3904888888888889, patience: 1


Batch: 100%|[32m██████████[0m| 16378/16378 [00:52<00:00, 313.59it/s]


Correct: 4395, Total: 11250
Epoch 9 loss: 1.004792332649231, acc: 0.39066666666666666, patience: 0


Batch: 100%|[32m██████████[0m| 16378/16378 [00:52<00:00, 311.98it/s]


Correct: 4395, Total: 11250
Epoch 10 loss: 2.1819546222686768, acc: 0.39066666666666666, patience: 1


Batch: 100%|[32m██████████[0m| 16378/16378 [00:52<00:00, 313.59it/s]


Correct: 4395, Total: 11250
Epoch 11 loss: 0.5126448273658752, acc: 0.39066666666666666, patience: 2


Batch: 100%|[32m██████████[0m| 16378/16378 [00:52<00:00, 313.33it/s]


Correct: 4395, Total: 11250
Epoch 12 loss: 8.752946853637695, acc: 0.39066666666666666, patience: 3


Batch: 100%|[32m██████████[0m| 16378/16378 [00:52<00:00, 313.36it/s]


Correct: 4395, Total: 11250
Epoch 13 loss: 0.9845958352088928, acc: 0.39066666666666666, patience: 4
early stopping
Correct: 10225, Total: 26251


0.38950897108681576

## Save the model!

In [None]:
# Save the model currently held in `trained_model` under the name
# "trained_model_".
trained_model.save("trained_model_")

## Load and evaluate


In [8]:
#load and evaluate the model
from models import VQA_Model4

# Re-create the model object, load the weights stored as "best_model_" and
# measure accuracy on the test loader.
# NOTE(review): the class instantiated here must match the class that produced
# the checkpoint; the training cell in this notebook uses VQA_Model4_Precalc
# and different save names - confirm which checkpoint "best_model_" is.
trained_model = VQA_Model_Precalc(model, device) # need to choose the same model!
trained_model.load("best_model_")
#print(trained_model)
evaluate(trained_model, test_dataloader, device, test_size, show_progress=True)

100%|██████████| 3000/3000 [00:43<00:00, 68.31it/s]

Correct: 2809, Total: 6000





0.4681666666666667

In [None]:
"""
import clip
from PIL import Image
import torch

for model_name in clip.available_models():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {device} device and model: {model_name}")
    model, preprocess = clip.load(model_name, device=device)
    model.to(torch.float32)
    text_tokens = clip.tokenize(["question_text"]).to(device)
    image = preprocess(Image.open("CLIP.jpg")).unsqueeze(0).to(device)
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_tokens)
    print(f"text_t {text_tokens.dtype}, text_f {text_features.dtype}, image_t {image.dtype}, image_f {image_features.dtype}")
    """


'\nimport clip\nfrom PIL import Image\nimport torch\n\nfor model_name in clip.available_models():\n    device = "cuda" if torch.cuda.is_available() else "cpu"\n    print(f"Using {device} device and model: {model_name}")\n    model, preprocess = clip.load(model_name, device=device)\n    model.to(torch.float32)\n    text_tokens = clip.tokenize(["question_text"]).to(device)\n    image = preprocess(Image.open("CLIP.jpg")).unsqueeze(0).to(device)\n    image_features = model.encode_image(image)\n    text_features = model.encode_text(text_tokens)\n    print(f"text_t {text_tokens.dtype}, text_f {text_features.dtype}, image_t {image.dtype}, image_f {image_features.dtype}")\n    '