## Load the dataset


In [1]:
from VQA_Datasetv2 import VQA_Dataset
import clip
import torch
from torch.utils.data import DataLoader
from torch.utils.data import random_split

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# Load the CLIP backbone together with its image preprocessing transform and
# cast the model to float32.
model, preprocess = clip.load("ViT-B/32", device=device)
model.to(torch.float32)

dataset = VQA_Dataset()

# Alternative ways of populating the dataset, kept for reference:
#dataset.load_preprocess(preprocess, device, length=6000)
#dataset.load_encode(preprocess, device, model, length=100)
#dataset.save("")
#dataset.load_saved("")
dataset.load(preprocess, device, length=30000)

# 80/20 split into (train + val)/test, then 80/20 again into train/val.
# The three sizes are derived from each other so they always sum to
# len(dataset), which random_split requires.
train_val_size = int(len(dataset)*0.8)
train_size = int(len(dataset)*0.8*0.8)
val_size = train_val_size - train_size
test_size = len(dataset) - train_val_size
print("Train size: ", train_size)
print("Test size: ", test_size)
print("Val size: ", val_size)

# Fixed seed so the same samples land in each split on every run.
generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset, val_dataset = random_split(
    dataset, [train_size, test_size, val_size], generator=generator
)

batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Only the training loader is shuffled: shuffling the evaluation loaders would
# make their batch order differ between runs and draw from the global RNG
# without any benefit for the evaluation.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

Using cuda device


Loading Images: 100%|██████████| 30000/30000 [00:38<00:00, 782.59it/s]


Train size:  19200
Test size:  6000
Val size:  4800


## Simple model architecture


In [2]:
from models import VQA_Model3, VQA_Model2, VQA_Model_Precalc, VQA_Model4



## Evaluate

In [3]:
import tqdm

def evaluate(model, dataloader, device, test_size, show_progress=False):
    """Measure how often ``model`` picks the correct answer over ``dataloader``.

    The model is put into eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards, so calling this from a
    training loop does not leave the model in eval mode.

    Args:
        model: Module called as ``model(image, question_tokens, answer_tokens)``.
            The argmax over the last dimension of its output is taken as the
            predicted answer index.
        dataloader: Iterable of batches that exposes a ``batch_size``
            attribute. Each batch is indexed as
            ``(image, answer_tokens, question_tokens, correct_answer)``.
        device: Device the batch tensors are moved to.
        test_size: Denominator of the returned accuracy; the caller is
            expected to pass the number of samples served by ``dataloader``.
        show_progress: When true, wrap the dataloader in a tqdm progress bar.

    Returns:
        ``correct / test_size`` as a float.
    """
    was_training = model.training
    model.eval()
    correct = 0
    batches = tqdm.tqdm(dataloader) if show_progress else dataloader
    try:
        with torch.no_grad():
            for data in batches:
                image = data[0].to(device)
                answer_tokens = data[1].squeeze(0).to(device)
                if dataloader.batch_size == 1:
                    # A single sample: wrap the label in a 1-element tensor
                    # and drop the leading dimension of the question tokens.
                    correct_answer = torch.tensor([int(data[3])]).to(device)
                    question_tokens = data[2].squeeze(0).to(device)
                else:
                    correct_answer = torch.tensor([int(x) for x in data[3]]).to(device)
                    question_tokens = data[2].squeeze(1).to(device)

                similarity = model(image, question_tokens, answer_tokens)
                pred = similarity.argmax(dim=-1)
                # Count the predictions that match the labels in this batch.
                correct += (pred == correct_answer).sum().item()
    finally:
        # Restore the mode the caller had, even if evaluation is interrupted.
        model.train(was_training)
    print(f"Correct: {correct}, Total: {test_size}")
    return correct/(test_size)


In [4]:
# Evaluate a freshly constructed VQA_Model4 on the test split, with a progress bar.
combined_model = VQA_Model4(model, device)
evaluate(
    model=combined_model,
    dataloader=test_dataloader,
    device=device,
    test_size=test_size,
    show_progress=True,
)

  0%|          | 0/188 [00:00<?, ?it/s]

 90%|████████▉ | 169/188 [02:45<00:18,  1.02it/s]


KeyboardInterrupt: 

## Training

In [4]:
def train(model, train_dataloader, val_dataloader, device, test_size, epochs=10, patience=2, precalculated=False):
    """Train ``model`` with Adam and cross-entropy, validating after every epoch.

    After each epoch the model is scored with ``evaluate`` on
    ``val_dataloader``. Whenever the validation accuracy exceeds the best one
    seen so far, the model's state dict is written to ``best_model.pt``.
    Training stops early once the accuracy has dropped below the previous
    epoch's accuracy ``patience`` times in a row.

    Args:
        model: Module called as ``model(image, question_tokens, answer_tokens)``;
            its output is fed to ``CrossEntropyLoss`` together with the labels.
        train_dataloader: Iterable of training batches exposing ``batch_size``.
            Each batch is indexed as
            ``(image, answer_tokens, question_tokens, correct_answer)``.
        val_dataloader: Loader passed to ``evaluate`` after every epoch.
        device: Device the batch tensors are moved to.
        test_size: Forwarded to ``evaluate`` as the accuracy denominator, i.e.
            the number of samples in ``val_dataloader``.
        epochs: Maximum number of epochs.
        patience: Number of consecutive accuracy drops that triggers early
            stopping.
        precalculated: Unused; kept for interface compatibility.

    Returns:
        None. The best weights are only available through ``best_model.pt``;
        ``model`` itself holds the weights of the last epoch that ran.
    """
    print(device)
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    loss_fn = torch.nn.CrossEntropyLoss()

    pbar = tqdm.tqdm(range(epochs))
    prev_acc = 0.0
    best_acc = 0.0
    count = 0
    for epoch in pbar:
        # evaluate() puts the model into eval mode while it runs, so training
        # mode is re-enabled at the start of every epoch; otherwise every epoch
        # after the first would train with eval-mode layer behaviour.
        model.train()
        loss = None
        for data in train_dataloader:
            image = data[0].to(device)
            answer_tokens = data[1].squeeze(0).to(device)
            if train_dataloader.batch_size == 1:
                # NOTE(review): this builds a 0-d label tensor while evaluate()
                # builds a 1-element tensor for the same case; confirm which
                # shape matches the model output for single-sample batches.
                correct_answer = torch.tensor(int(data[3])).to(device)
                question_tokens = data[2].squeeze(0).to(device)
            else:
                correct_answer = torch.tensor([int(x) for x in data[3]]).to(device)
                question_tokens = data[2].squeeze(1).to(device)

            optimizer.zero_grad()
            similarity = model(image, question_tokens, answer_tokens)

            # CrossEntropyLoss takes the class indices directly, so the answer
            # does not need to be one-hot encoded to the shape of similarity.
            loss = loss_fn(similarity, correct_answer)
            loss.backward()
            # Clip the gradient norm and fail loudly on NaN/inf gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=4.0, error_if_nonfinite=True)
            optimizer.step()

        acc = evaluate(model, val_dataloader, device, test_size)
        if acc > best_acc:
            best_acc = acc
            # Checkpoint the best-scoring weights seen so far.
            torch.save(model.state_dict(), "best_model.pt")
        # Early-stopping counter: consecutive epochs whose accuracy is lower
        # than the previous epoch's accuracy.
        if acc < prev_acc:
            count += 1
        else:
            count = 0
        prev_acc = acc
        # loss is the last batch's loss; it stays None if the loader was empty.
        last_loss = loss.item() if loss is not None else float("nan")
        pbar.set_description(f"Epoch {epoch} loss: {last_loss}, acc: {acc}", refresh=True)
        if count == patience:
            print("early stopping")
            break

In [5]:
import time

trained_model = VQA_Model4(model, device)

# Freeze the wrapped CLIP model: none of its parameters require gradients.
trained_model.model.requires_grad_(False)

# Test accuracy before training.
evaluate(trained_model, test_dataloader, device, test_size)

# Train with the validation split driving checkpointing and early stopping.
train(
    trained_model,
    train_dataloader,
    val_dataloader,
    device,
    test_size=val_size,
    epochs=200,
)

# Test accuracy after training; shown as the cell's result.
evaluate(trained_model, test_dataloader, device, test_size)

Correct: 132, Total: 6000
cuda
<generator object Module.parameters at 0x00000200FE7E2CE0>


  0%|          | 0/200 [00:00<?, ?it/s]

Correct: 2227, Total: 4800


Epoch 0 loss: 1.1892313957214355, acc: 0.4639583333333333:   0%|          | 1/200 [11:20<37:35:43, 680.12s/it]

Correct: 2413, Total: 4800


Epoch 1 loss: 1.3080430030822754, acc: 0.5027083333333333:   1%|          | 2/200 [22:38<37:21:30, 679.25s/it]

Correct: 2492, Total: 4800


Epoch 2 loss: 1.3111917972564697, acc: 0.5191666666666667:   2%|▏         | 3/200 [33:57<37:08:47, 678.82s/it]

Correct: 2563, Total: 4800


Epoch 3 loss: 1.0613690614700317, acc: 0.5339583333333333:   2%|▏         | 4/200 [45:15<36:57:18, 678.77s/it]

Correct: 2593, Total: 4800


Epoch 4 loss: 0.9139178395271301, acc: 0.5402083333333333:   2%|▎         | 5/200 [56:35<36:46:34, 678.95s/it]

Correct: 2599, Total: 4800


Epoch 5 loss: 1.2033259868621826, acc: 0.5414583333333334:   3%|▎         | 6/200 [1:07:52<36:33:58, 678.55s/it]

Correct: 2652, Total: 4800


Epoch 6 loss: 0.7361686825752258, acc: 0.5525:   4%|▎         | 7/200 [1:19:10<36:21:57, 678.33s/it]            

Correct: 2716, Total: 4800


Epoch 7 loss: 0.8997942805290222, acc: 0.5658333333333333:   4%|▍         | 8/200 [1:30:28<36:10:16, 678.21s/it]

Correct: 2718, Total: 4800


Epoch 8 loss: 0.7721854448318481, acc: 0.56625:   4%|▍         | 9/200 [1:41:46<35:58:55, 678.20s/it]           

Correct: 2719, Total: 4800


Epoch 9 loss: 0.5771684646606445, acc: 0.5664583333333333:   5%|▌         | 10/200 [1:53:04<35:47:27, 678.15s/it]

Correct: 2734, Total: 4800


Epoch 10 loss: 1.103007197380066, acc: 0.5695833333333333:   6%|▌         | 11/200 [2:04:57<36:09:12, 688.64s/it]

Correct: 2747, Total: 4800


Epoch 12 loss: 0.545444130897522, acc: 0.5654166666666667:   6%|▋         | 13/200 [2:29:46<37:13:21, 716.59s/it] 

Correct: 2714, Total: 4800
Correct: 2751, Total: 4800


Epoch 13 loss: 0.7346630692481995, acc: 0.573125:   7%|▋         | 14/200 [2:41:04<36:25:27, 704.99s/it]         

Correct: 2754, Total: 4800


Epoch 14 loss: 0.5985968708992004, acc: 0.57375:   8%|▊         | 15/200 [2:52:23<35:49:05, 697.00s/it] 

Correct: 2778, Total: 4800


Epoch 16 loss: 0.6445966362953186, acc: 0.575625:   8%|▊         | 17/200 [3:14:59<34:56:43, 687.45s/it]

Correct: 2763, Total: 4800


Epoch 17 loss: 0.5132622718811035, acc: 0.57625:   9%|▉         | 18/200 [3:26:17<34:36:17, 684.49s/it] 

Correct: 2766, Total: 4800


Epoch 18 loss: 0.514853835105896, acc: 0.576875:  10%|▉         | 19/200 [3:37:34<34:18:29, 682.37s/it]

Correct: 2769, Total: 4800


Epoch 19 loss: 0.5648410320281982, acc: 0.5714583333333333:  10%|█         | 20/200 [3:48:52<34:02:44, 680.92s/it]

Correct: 2743, Total: 4800


Epoch 20 loss: 0.3461906313896179, acc: 0.5647916666666667:  10%|█         | 21/200 [4:00:09<33:48:16, 679.87s/it]

Correct: 2711, Total: 4800


Epoch 21 loss: 0.33812612295150757, acc: 0.5720833333333334:  11%|█         | 22/200 [4:11:27<33:35:11, 679.28s/it]

Correct: 2746, Total: 4800


Epoch 22 loss: 0.2942090332508087, acc: 0.575:  12%|█▏        | 23/200 [4:22:46<33:23:16, 679.07s/it]              

Correct: 2760, Total: 4800


Epoch 23 loss: 0.27120259404182434, acc: 0.5775:  12%|█▏        | 24/200 [4:34:04<33:11:14, 678.83s/it]

Correct: 2772, Total: 4800
Correct: 2782, Total: 4800


Epoch 25 loss: 0.2707204222679138, acc: 0.5770833333333333:  13%|█▎        | 26/200 [4:56:41<32:48:06, 678.66s/it]

Correct: 2770, Total: 4800


Epoch 26 loss: 0.16216380894184113, acc: 0.5722916666666666:  14%|█▎        | 27/200 [5:08:00<32:36:40, 678.61s/it]

Correct: 2747, Total: 4800


Epoch 27 loss: 0.24704235792160034, acc: 0.5702083333333333:  14%|█▎        | 27/200 [5:19:18<34:05:54, 709.56s/it]

Correct: 2737, Total: 4800
early stopping





Correct: 3369, Total: 6000


0.5615

## Save the model!

In [2]:
# Write the trained model's weights (its state dict) to trained_model.pt.
# This cell needs `trained_model` from the training cell, so run that one first.
import torch

torch.save(obj=trained_model.state_dict(), f="trained_model.pt")

NameError: name 'trained_model' is not defined

## Load and evaluate


In [6]:
#load and evaluate the model
from models import VQA_Model4

# The checkpoint only holds weights, so the architecture has to be rebuilt
# first and must be the same one that produced the checkpoint.
trained_model = VQA_Model4(model, device) # need to choose the same model
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU machine still loads when only the CPU is available.
trained_model.load_state_dict(
    torch.load("Trained_Models/best_model-v4-60.pt", map_location=device)
)
evaluate(trained_model, test_dataloader, device, test_size, show_progress=True)

VQA_Model4(
  (model): CLIP(
    (visual): VisionTransformer(
      (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
      (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (transformer): Transformer(
        (resblocks): Sequential(
          (0): ResidualAttentionBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): Sequential(
              (c_fc): Linear(in_features=768, out_features=3072, bias=True)
              (gelu): QuickGELU()
              (c_proj): Linear(in_features=3072, out_features=768, bias=True)
            )
            (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          )
          (1): ResidualAttentionBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizable

In [None]:
"""
import clip
from PIL import Image
import torch

for model_name in clip.available_models():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {device} device and model: {model_name}")
    model, preprocess = clip.load(model_name, device=device)
    model.to(torch.float32)
    text_tokens = clip.tokenize(["question_text"]).to(device)
    image = preprocess(Image.open("CLIP.jpg")).unsqueeze(0).to(device)
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_tokens)
    print(f"text_t {text_tokens.dtype}, text_f {text_features.dtype}, image_t {image.dtype}, image_f {image_features.dtype}")
    """


'\nimport clip\nfrom PIL import Image\nimport torch\n\nfor model_name in clip.available_models():\n    device = "cuda" if torch.cuda.is_available() else "cpu"\n    print(f"Using {device} device and model: {model_name}")\n    model, preprocess = clip.load(model_name, device=device)\n    model.to(torch.float32)\n    text_tokens = clip.tokenize(["question_text"]).to(device)\n    image = preprocess(Image.open("CLIP.jpg")).unsqueeze(0).to(device)\n    image_features = model.encode_image(image)\n    text_features = model.encode_text(text_tokens)\n    print(f"text_t {text_tokens.dtype}, text_f {text_features.dtype}, image_t {image.dtype}, image_f {image_features.dtype}")\n    '