## Load the dataset


In [1]:
from VQA_Datasetv2 import VQA_Dataset
import clip
import torch
from torch.utils.data import DataLoader
from torch.utils.data import random_split

# Fixed seed so the train/val/test partition is identical on every
# "Restart Kernel -> Run All"; change it to draw a different split.
SPLIT_SEED = 42

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# clip.load returns the backbone together with its image preprocessing pipeline.
model, preprocess = clip.load("ViT-B/32", device=device)
# Force float32 weights (presumably because CLIP may load half-precision
# weights on GPU -- TODO confirm).
model.to(torch.float32)

dataset = VQA_Dataset()
dataset.load_all(preprocess, device, length=6000)

# 80 % of the data is train+val, the remaining 20 % is test;
# the train+val part is itself split 80/20 into train and val.
train_val_size = int(len(dataset)*0.8)
train_size = int(len(dataset)*0.8*0.8)
val_size = train_val_size - train_size
test_size = len(dataset) - train_val_size
assert train_size + val_size + test_size == len(dataset), "split sizes must cover the dataset"
print("Train size: ", train_size)
print("Test size: ", test_size)
print("Val size: ", val_size)

# The order of the lengths must match the order of the unpacked names.
train_dataset, test_dataset, val_dataset = random_split(
    dataset,
    [train_size, test_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

batch_size=32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation only sums correct predictions, so sample order is irrelevant there.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

Using cuda device


Preprocessing Images: 100%|██████████| 6000/6000 [01:17<00:00, 77.87it/s]


Train size:  3840
Test size:  1200
Val size:  960


## Simple model architecture


In [2]:
from models import VQA_Model2



## Evaluate

In [3]:
import tqdm

def evaluate(model, dataloader, device, test_size, show_progress=False):
    """Compute the top-1 accuracy of ``model`` over every batch of ``dataloader``.

    Each batch is indexed as ``data[0]`` image, ``data[1]`` answer tokens,
    ``data[2]`` question tokens and ``data[3]`` index of the correct answer.
    The prediction is the argmax over the last dimension of
    ``model(image, question_tokens, answer_tokens)``.

    Args:
        model: ``torch.nn.Module`` to score. Its train/eval mode is restored
            to whatever it was on entry once the evaluation finishes.
        dataloader: iterable of batches exposing a ``batch_size`` attribute.
        device: device the batch tensors are moved to.
        test_size: expected number of samples. Used as the denominator only
            when the dataloader yields no samples at all; otherwise the
            number of samples actually scored is used, so a mismatched value
            can no longer skew the reported accuracy.
        show_progress: wrap the dataloader in a ``tqdm`` progress bar.

    Returns:
        Fraction of correctly answered samples (``0.0`` if there is nothing
        to score).
    """
    was_training = model.training
    model.eval()
    correct = 0
    seen = 0
    batches = tqdm.tqdm(dataloader) if show_progress else dataloader
    try:
        with torch.no_grad():
            for data in batches:
                image = data[0].to(device)
                answer_tokens = data[1].squeeze(0).to(device)
                if dataloader.batch_size == 1:
                    # Single-sample batches carry one label, wrapped into a 1-element tensor.
                    correct_answer = torch.tensor([int(data[3])]).to(device)
                    question_tokens = data[2].squeeze(0).to(device)
                else:
                    correct_answer = torch.tensor([int(x) for x in data[3]]).to(device)
                    question_tokens = data[2].squeeze(1).to(device)

                similarity = model(image, question_tokens, answer_tokens)
                pred = similarity.argmax(dim=-1)
                correct += (pred == correct_answer).sum().item()
                seen += correct_answer.numel()
    finally:
        # Do not leave a model that was being trained stuck in eval mode.
        model.train(was_training)

    total = seen if seen > 0 else test_size
    print(f"Correct: {correct}, Total: {total}")
    return correct / total if total else 0.0


In [4]:
# Baseline: wrap the loaded CLIP model in VQA_Model2 and score it on the
# test split before any training has been run on it.
combined_model = VQA_Model2(model, device)
evaluate(
    model=combined_model,
    dataloader=test_dataloader,
    device=device,
    test_size=test_size,
    show_progress=True,
)

100%|██████████| 38/38 [00:27<00:00,  1.41it/s]

Correct: 39, Total: 1200





0.0325

## Training

In [5]:
def train(model, train_dataloader, val_dataloader, device, test_size, epochs=10, patience=3):
    """Train ``model`` with Adam and cross-entropy, early-stopping on validation accuracy.

    Each batch is indexed as ``data[0]`` image, ``data[1]`` answer tokens,
    ``data[2]`` question tokens and ``data[3]`` index of the correct answer.
    The output of ``model(image, question_tokens, answer_tokens)`` is used as
    the logits of ``torch.nn.CrossEntropyLoss``.

    Args:
        model: ``torch.nn.Module`` to optimise in place.
        train_dataloader: batches used for the optimisation steps; must expose
            a ``batch_size`` attribute.
        val_dataloader: batches scored with ``evaluate`` after every epoch.
        device: device the batch tensors are moved to.
        test_size: sample count handed to ``evaluate`` only when the size of
            the validation set cannot be read from ``val_dataloader.dataset``.
            The name is kept for backward compatibility.
        epochs: maximum number of passes over ``train_dataloader``.
        patience: number of consecutive epochs without a new best validation
            accuracy after which training stops.

    Returns:
        None. The model is updated in place.
    """
    print(device)
    n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable parameters: {n_trainable}")
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    loss_fn = torch.nn.CrossEntropyLoss()

    # Validation accuracy must be normalised by the validation-set size.
    try:
        val_count = len(val_dataloader.dataset)
    except (AttributeError, TypeError):
        val_count = test_size

    pbar = tqdm.tqdm(range(epochs))
    best_acc = float("-inf")
    stale_epochs = 0
    for epoch in pbar:
        # Re-enter training mode every epoch: the validation pass at the end
        # of the previous epoch may have switched the model to eval mode.
        model.train()
        loss = None
        for data in train_dataloader:
            image = data[0].to(device)
            answer_tokens = data[1].squeeze(0).to(device)
            if train_dataloader.batch_size == 1:
                # NOTE(review): this builds a 0-dim label while evaluate() builds
                # a 1-element tensor for the same case -- confirm which shape the
                # model output requires.
                correct_answer = torch.tensor(int(data[3])).to(device)
                question_tokens = data[2].squeeze(0).to(device)
            else:
                correct_answer = torch.tensor([int(x) for x in data[3]]).to(device)
                question_tokens = data[2].squeeze(1).to(device)

            optimizer.zero_grad()
            similarity = model(image, question_tokens, answer_tokens)

            # CrossEntropyLoss takes the class index directly; no one-hot target needed.
            loss = loss_fn(similarity, correct_answer)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=4.0, error_if_nonfinite=True)
            optimizer.step()

        acc = evaluate(model, val_dataloader, device, val_count)
        if acc > best_acc:
            best_acc = acc
            stale_epochs = 0
        else:
            stale_epochs += 1

        # An empty train loader leaves no loss to report.
        last_loss = loss.item() if loss is not None else float("nan")
        pbar.set_description(f"Epoch {epoch} loss: {last_loss}, acc: {acc}")
        if stale_epochs >= patience:
            print("early stopping")
            break

In [6]:
import time

trained_model = VQA_Model2(model, device)
# freeze the clip model so that only parameters outside `trained_model.model`
# can receive gradients
for param in trained_model.model.parameters():
    param.requires_grad = False

# Test accuracy before training, for comparison with the result below.
evaluate(trained_model, test_dataloader, device, test_size)
# Pass the validation-set size: this argument is the sample count for the
# validation passes inside train(), not for the test split.
train(trained_model, train_dataloader, val_dataloader, device, val_size, epochs=200)
# evaluate the model
evaluate(trained_model, test_dataloader, device, test_size)

Correct: 66, Total: 1200
cuda
<generator object Module.parameters at 0x000002094CF9D1C0>


  0%|          | 0/200 [01:20<?, ?it/s]


RuntimeError: CUDA error: invalid device ordinal
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Write the model's state_dict (not the module object itself) to
# "trained_model.pt", resolved against the kernel's working directory.
torch.save(obj=trained_model.state_dict(), f="trained_model.pt")

In [None]:
# Disabled scratch experiment: the code is wrapped in a bare string literal so
# it never executes (the cell merely echoes the string as its output).
# If re-enabled it would load every model listed by clip.available_models(),
# encode one tokenized text and the image "CLIP.jpg", and print the dtypes of
# the inputs and of the resulting feature tensors.
"""
import clip
from PIL import Image
import torch

for model_name in clip.available_models():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {device} device and model: {model_name}")
    model, preprocess = clip.load(model_name, device=device)
    model.to(torch.float32)
    text_tokens = clip.tokenize(["question_text"]).to(device)
    image = preprocess(Image.open("CLIP.jpg")).unsqueeze(0).to(device)
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_tokens)
    print(f"text_t {text_tokens.dtype}, text_f {text_features.dtype}, image_t {image.dtype}, image_f {image_features.dtype}")
    """


'\nimport clip\nfrom PIL import Image\nimport torch\n\nfor model_name in clip.available_models():\n    device = "cuda" if torch.cuda.is_available() else "cpu"\n    print(f"Using {device} device and model: {model_name}")\n    model, preprocess = clip.load(model_name, device=device)\n    model.to(torch.float32)\n    text_tokens = clip.tokenize(["question_text"]).to(device)\n    image = preprocess(Image.open("CLIP.jpg")).unsqueeze(0).to(device)\n    image_features = model.encode_image(image)\n    text_features = model.encode_text(text_tokens)\n    print(f"text_t {text_tokens.dtype}, text_f {text_features.dtype}, image_t {image.dtype}, image_f {image_features.dtype}")\n    '