**First approach uses CLIP and T5 to encode text and image, after that a traditional FFNN is trained**

## Load the dataset


In [1]:
from VQA_Dataset_CLIP import VQA_Dataset, VQA_Dataset_preloaded, VQA_Dataset_Sentences
import clip
import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import random_split

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
model, preprocess = clip.load("ViT-B/32", device=device)
print(preprocess.__class__.__name__)
model.to(torch.float32)
print("clip model loaded")

Using cuda device
Compose
clip model loaded


In [2]:
print(model)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

### 2.Precalc clip embedding to train faster

In [3]:
dataset_real = VQA_Dataset_preloaded()
#dataset_real.compute_store(preprocess, model, device, "dataset_real_75k", name="val", length=75000, mode="scale", real=True)

In [4]:
dataset_abstract_train = VQA_Dataset_preloaded()
#dataset_abstract_train.compute_store(preprocess, model, device, "dataset_abstract_train_60k", name="train", length=60000, mode="scale", real=False)

dataset_abstract_test = VQA_Dataset_preloaded()
#dataset_abstract_test.compute_store(preprocess, model, device, "dataset_abstract_test_15k", name="val", length=15000, mode="scale", real=False)

### 3.Only load

In [5]:
from torch.utils.data import ConcatDataset
sentences=False
adding=""
if sentences:
    adding="_sentences_"

dataset_real.load("dataset_real_75k"+adding, device, name="val", length=75000)
dataset_abstract_train.load("dataset_abstract_train_60k"+adding, device, name="train", length=60000)
dataset_abstract_test.load("dataset_abstract_test_15k"+adding, device, name="val", length=15000)
dataset_abstract = ConcatDataset([dataset_abstract_train, dataset_abstract_test])

In [6]:
#Combine abstract and real datasets
class CustomDataset(Dataset):
    def __init__(self, dataset, real=1):
        self.existing_dataset = dataset
        self.real = real

    def __len__(self):
        return len(self.existing_dataset)

    def __getitem__(self, index):
        return  self.existing_dataset[index]+(self.real, )
    
dataset_real_extended = CustomDataset(dataset_real, real=1)
dataset_abstract_extended = CustomDataset(dataset_abstract, real=0)

In [7]:
dataset_type = "combined"
if dataset_type == "real":
    dataset = dataset_real
elif dataset_type == "abstract":
    dataset = dataset_abstract
elif dataset_type == "combined":
    dataset = dataset = ConcatDataset([dataset_real_extended, dataset_abstract_extended])

### Create Dataloader

## New with both Validation and Training Datasets from MSCOCO

In [8]:
batch_size=64

train_size = int(len(dataset)*0.8*0.8)
val_size = int(len(dataset)*0.8*0.2)
test_size = int(len(dataset))-val_size-train_size
print("Train size: ", train_size)
print("Val size: ", val_size)
print("Test size: ", test_size)
generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size], generator=generator)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

Train size:  96000
Val size:  24000
Test size:  30000


## Simple model architectue


In [9]:
from models import VQA_Model_Precalc_Zero, VQA_Model1_Precalc, VQA_Model_Precalc

## Evaluate

In [10]:
import numpy as np
class performanceAnalysis:
    def __init__(self):
        self.counters = [0, 0, 0]
        self.total = [0, 0, 0]
    def it(self, correct, answer_types):#both numpy arrays
        for i in range(len(self.counters)):
            self.counters[i]+=np.sum(np.logical_and(correct, answer_types==i))
            self.total[i]+=np.sum(answer_types==i)
    def get_accuracies(self):
        dictionary = {"yes/no": 100*self.counters[0]/self.total[0], "number": 100*self.counters[1]/self.total[1], "other": 100*self.counters[2]/self.total[2], "total": 100*sum(self.counters)/sum(self.total)}
        return dictionary

In [11]:
class performanceAnalysis:
    def __init__(self, dataset_type="combined"):
        self.combined = dataset_type=="combined"
        self.dataset_type = dataset_type
        self.counters = np.zeros((self.combined+1, 3)) 
        self.total = np.zeros((self.combined+1, 3))
    def it(self, correct, answer_types, real_types=None):#both numpy arrays
        if not self.combined:
            for i in range(len(self.counters[0])):
                self.counters[0][i]+=np.sum(np.logical_and(correct, answer_types==i))
                self.total[0][i]+=np.sum(answer_types==i)
        else:
            is_real = (real_types==1).reshape(len(real_types), -1)
            is_abstract = np.logical_not(is_real)
            for i in range(len(self.counters[0])):
                self.counters[0][i]+=np.sum(np.logical_and(np.logical_and(correct, answer_types==i), is_abstract))
                self.total[0][i]+=np.sum(np.logical_and(answer_types==i, is_abstract))

                self.counters[1][i]+=np.sum(np.logical_and(np.logical_and(correct, answer_types==i), is_real))
                self.total[1][i]+=np.sum(np.logical_and(answer_types==i, is_real))

    def get_accuracies(self):
        if not self.combined:
            dictionary = {self.dataset_type: {"yes/no": 100*self.counters[0][0]/self.total[0][0], "number": 100*self.counters[0][1]/self.total[0][1], "other": 100*self.counters[0][2]/self.total[0][2], "total": 100*sum(self.counters[0])/sum(self.total[0])}}
        else:
            dictionary_abstract = {"yes/no": 100*self.counters[0][0]/self.total[0][0], "number": 100*self.counters[0][1]/self.total[0][1], "other": 100*self.counters[0][2]/self.total[0][2], "total": 100*sum(self.counters[0])/sum(self.total[0])}
            dictionary_real = {"yes/no": 100*self.counters[1][0]/self.total[1][0], "number": 100*self.counters[1][1]/self.total[1][1], "other": 100*self.counters[1][2]/self.total[1][2], "total": 100*sum(self.counters[1])/sum(self.total[1])}
            dictionary = {"abstract": dictionary_abstract, "real": dictionary_real} 
        return dictionary
    
def printResults(dict_performance, performAnalysis):
    info = ""
    for i, element in enumerate(dict_performance):
        info += (element+" - ")
        info += f"total: {dict_performance[element]['total']:.2f}, yes/no: {dict_performance[element]['yes/no']:.2f}({performAnalysis.counters[i][0]}/{performAnalysis.total[i][0]}), number: {dict_performance[element]['number']:.2f}({performAnalysis.counters[i][1]}/{performAnalysis.total[i][1]}), other: {dict_performance[element]['other']:.2f}({performAnalysis.counters[i][2]}/{performAnalysis.total[i][2]})"
        if dataset_type=="combined" and i==0:
            info += "\n"
    print(info)

In [12]:
import tqdm

def evaluate(model, dataloader, device, show_progress=False, score_each=False):
    model.eval()
    performAnalysis = performanceAnalysis(dataset_type)
    correct = 0
    if show_progress:
        pbar = tqdm.tqdm(dataloader)
    else:
        pbar = dataloader

    for i, data in enumerate(pbar):
        image = data[0].to(device)
        answer_tokens = data[1].squeeze(0).to(device)
        if dataloader.batch_size == 1:
            correct_answer = torch.tensor([int(data[3])]).to(device)
            question_tokens = data[2].squeeze(0).to(device)
        else:
            correct_answer = torch.tensor([int(x) for x in data[3]]).to(device)
            question_tokens = data[2].squeeze(1).to(device)

        with torch.no_grad():
            if score_each:
                # get similarity for each answer, not for each pair of batch
                if len(answer_tokens.shape) == 2:
                    answer_tokens = answer_tokens.unsqueeze(0)
                similarity = torch.zeros((answer_tokens.shape[0], answer_tokens.shape[1])).to(device)
                
                for answer in range(answer_tokens.shape[1]):    
                    similarity[:,answer] = model(image, question_tokens, answer_tokens[:, answer]).squeeze(1)
                    
                pred = similarity.argmax(dim=-1)
                # get sum of correct answers
                correct += (pred == correct_answer).sum().item()
            else:
                similarity = model(image, question_tokens, answer_tokens)
                pred = similarity.argmax(dim=-1)
                # get sum of correct answers
                correct += (pred == correct_answer).sum().item()
                if dataset_type != "combined":
                    performAnalysis.it((pred==correct_answer).cpu().numpy().reshape(len(pred), -1), data[4].cpu().numpy())
                else:
                    performAnalysis.it((pred==correct_answer).cpu().numpy().reshape(len(pred), -1), data[4].cpu().numpy(), data[5].cpu().numpy())
    dict_performance = performAnalysis.get_accuracies()
    printResults(dict_performance, performAnalysis)
    return dict_performance

In [14]:
combined_model = VQA_Model_Precalc_Zero(model, device)
evaluate(combined_model, test_dataloader, device, show_progress=True);

100%|██████████| 469/469 [00:13<00:00, 35.33it/s]

abstract - total: 18.63, yes/no: 41.79(2560.0/6126.0), number: 5.19(117.0/2253.0), other: 2.13(144.0/6764.0)
real - total: 15.26, yes/no: 35.75(1999.0/5591.0), number: 3.40(63.0/1852.0), other: 2.77(205.0/7414.0)





## Training

In [14]:

from torch.optim import lr_scheduler

def train(model, train_dataloader, val_dataloader, device, epochs=10, patience=3, precalculated=False, save_name="model", loss_fn = torch.nn.CrossEntropyLoss()):
    print(device)
    print(model.parameters())
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)#, weight_decay=1e-4)

    prev_acc = 0.0
    best_acc = 0.0
    count = 0
    
    for epoch in range(epochs):
        inner_bar = tqdm.tqdm(train_dataloader, desc='Batch', colour='green')
        loss_accum = 0
        
        model.train()
        for data in inner_bar:
            
            image = data[0].to(device)
            answer_tokens = data[1].squeeze(0).to(device)
            
            correct_answer = torch.tensor([int(x) for x in data[3]]).to(device)
            question_tokens = data[2].squeeze(1).to(device)
        
            similarity = model(image, question_tokens, answer_tokens)
            loss = loss_fn(similarity, correct_answer)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=4.0, error_if_nonfinite=True)
            optimizer.step()

            loss_accum += loss.item()
        
        if dataset_type=="combined":
            dic = evaluate(model, val_dataloader, device)
            acc = (dic["abstract"]["total"]+dic["real"]["total"])/2
        else:
            acc = evaluate(model, val_dataloader, device)[dataset_type]["total"]

        if acc > best_acc:
            best_acc = acc
            # save model
            model.save(save_name)
        if acc < prev_acc:
            count += 1
        else:
            count = 0
        prev_acc = acc
        print(f"Epoch {epoch} train_loss: {loss_accum/len(train_dataloader.dataset)}, patience: {count}")
        if count == patience:
            print("early stopping")
            break 
        inner_bar.close()

In [15]:
trained_model = VQA_Model_Precalc(model, device)
# freeze the clip model
for param in trained_model.model.parameters():
    param.requires_grad = False

#evaluate(trained_model, test_dataloader, device, test_size, show_progress=True)
improvement="_dropout2e-1"
filename_save = trained_model.__class__.__name__+"_"+dataset_type+improvement
train(trained_model, train_dataloader, val_dataloader, device, epochs=50, save_name=filename_save)
# evaluate the model
evaluate(trained_model, test_dataloader, device);

cuda
<generator object Module.parameters at 0x0000019AD7CA3D60>


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 36.98it/s]


abstract - total: 43.09, yes/no: 50.87(2490.0/4895.0), number: 39.55(664.0/1679.0), other: 37.12(1997.0/5380.0)
real - total: 37.92, yes/no: 53.35(2407.0/4512.0), number: 25.31(387.0/1529.0), other: 29.54(1774.0/6005.0)
Epoch 0 train_loss: 0.03169110848878821, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 37.14it/s]


abstract - total: 46.08, yes/no: 53.42(2615.0/4895.0), number: 40.38(678.0/1679.0), other: 41.19(2216.0/5380.0)
real - total: 39.55, yes/no: 50.55(2281.0/4512.0), number: 30.80(471.0/1529.0), other: 33.51(2012.0/6005.0)
Epoch 1 train_loss: 0.0261961359915634, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 37.00it/s]


abstract - total: 45.43, yes/no: 50.44(2469.0/4895.0), number: 41.99(705.0/1679.0), other: 41.95(2257.0/5380.0)
real - total: 36.32, yes/no: 42.22(1905.0/4512.0), number: 30.41(465.0/1529.0), other: 33.39(2005.0/6005.0)
Epoch 2 train_loss: 0.025215979812045892, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 36.89it/s]


abstract - total: 48.92, yes/no: 56.61(2771.0/4895.0), number: 40.32(677.0/1679.0), other: 44.61(2400.0/5380.0)
real - total: 38.69, yes/no: 49.69(2242.0/4512.0), number: 27.93(427.0/1529.0), other: 33.16(1991.0/6005.0)
Epoch 3 train_loss: 0.02460371733084321, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 36.93it/s]


abstract - total: 49.45, yes/no: 57.45(2812.0/4895.0), number: 37.34(627.0/1679.0), other: 45.95(2472.0/5380.0)
real - total: 41.17, yes/no: 56.47(2548.0/4512.0), number: 23.41(358.0/1529.0), other: 34.19(2053.0/6005.0)
Epoch 4 train_loss: 0.024219246556361516, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 36.81it/s]


abstract - total: 50.87, yes/no: 59.88(2931.0/4895.0), number: 40.14(674.0/1679.0), other: 46.02(2476.0/5380.0)
real - total: 42.56, yes/no: 58.84(2655.0/4512.0), number: 26.95(412.0/1529.0), other: 34.30(2060.0/6005.0)
Epoch 5 train_loss: 0.023652951436738172, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 36.65it/s]


abstract - total: 49.93, yes/no: 58.69(2873.0/4895.0), number: 42.23(709.0/1679.0), other: 44.37(2387.0/5380.0)
real - total: 42.11, yes/no: 57.78(2607.0/4512.0), number: 31.07(475.0/1529.0), other: 33.14(1990.0/6005.0)
Epoch 6 train_loss: 0.023340201104059814, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 36.76it/s]


abstract - total: 51.45, yes/no: 60.69(2971.0/4895.0), number: 42.64(716.0/1679.0), other: 45.78(2463.0/5380.0)
real - total: 43.56, yes/no: 57.87(2611.0/4512.0), number: 32.37(495.0/1529.0), other: 35.65(2141.0/6005.0)
Epoch 7 train_loss: 0.02314716814085841, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 36.99it/s]


abstract - total: 46.08, yes/no: 47.84(2342.0/4895.0), number: 40.02(672.0/1679.0), other: 46.38(2495.0/5380.0)
real - total: 36.57, yes/no: 40.78(1840.0/4512.0), number: 23.87(365.0/1529.0), other: 36.64(2200.0/6005.0)
Epoch 8 train_loss: 0.022980787717426815, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 37.35it/s]


abstract - total: 51.15, yes/no: 59.18(2897.0/4895.0), number: 42.58(715.0/1679.0), other: 46.52(2503.0/5380.0)
real - total: 43.58, yes/no: 58.80(2653.0/4512.0), number: 29.43(450.0/1529.0), other: 35.75(2147.0/6005.0)
Epoch 9 train_loss: 0.02274270424246788, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 37.04it/s]


abstract - total: 51.41, yes/no: 57.65(2822.0/4895.0), number: 41.51(697.0/1679.0), other: 48.81(2626.0/5380.0)
real - total: 42.92, yes/no: 54.39(2454.0/4512.0), number: 28.58(437.0/1529.0), other: 37.95(2279.0/6005.0)
Epoch 10 train_loss: 0.022475666747118036, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 36.97it/s]


abstract - total: 47.49, yes/no: 50.54(2474.0/4895.0), number: 43.00(722.0/1679.0), other: 46.12(2481.0/5380.0)
real - total: 41.88, yes/no: 51.73(2334.0/4512.0), number: 31.92(488.0/1529.0), other: 37.02(2223.0/6005.0)
Epoch 11 train_loss: 0.02224301761450867, patience: 2


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 37.22it/s]


abstract - total: 51.25, yes/no: 60.82(2977.0/4895.0), number: 43.18(725.0/1679.0), other: 45.06(2424.0/5380.0)
real - total: 44.08, yes/no: 58.75(2651.0/4512.0), number: 30.02(459.0/1529.0), other: 36.64(2200.0/6005.0)
Epoch 12 train_loss: 0.022083560163776078, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 36.59it/s]


abstract - total: 52.68, yes/no: 60.55(2964.0/4895.0), number: 43.12(724.0/1679.0), other: 48.49(2609.0/5380.0)
real - total: 45.33, yes/no: 58.27(2629.0/4512.0), number: 31.85(487.0/1529.0), other: 39.05(2345.0/6005.0)
Epoch 13 train_loss: 0.021970425662895043, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.01it/s]


abstract - total: 52.51, yes/no: 61.80(3025.0/4895.0), number: 39.79(668.0/1679.0), other: 48.03(2584.0/5380.0)
real - total: 44.50, yes/no: 58.64(2646.0/4512.0), number: 30.35(464.0/1529.0), other: 37.49(2251.0/6005.0)
Epoch 14 train_loss: 0.021886961438382667, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:38<00:00, 38.54it/s]


abstract - total: 50.08, yes/no: 53.50(2619.0/4895.0), number: 44.91(754.0/1679.0), other: 48.59(2614.0/5380.0)
real - total: 38.93, yes/no: 44.90(2026.0/4512.0), number: 30.74(470.0/1529.0), other: 36.52(2193.0/6005.0)
Epoch 15 train_loss: 0.021735043473541736, patience: 2


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.22it/s]


abstract - total: 53.48, yes/no: 63.51(3109.0/4895.0), number: 35.50(596.0/1679.0), other: 49.96(2688.0/5380.0)
real - total: 44.60, yes/no: 60.46(2728.0/4512.0), number: 25.83(395.0/1529.0), other: 37.47(2250.0/6005.0)
Epoch 16 train_loss: 0.021601038705557584, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.30it/s]


abstract - total: 52.14, yes/no: 63.25(3096.0/4895.0), number: 36.09(606.0/1679.0), other: 47.04(2531.0/5380.0)
real - total: 44.83, yes/no: 60.31(2721.0/4512.0), number: 25.90(396.0/1529.0), other: 38.02(2283.0/6005.0)
Epoch 17 train_loss: 0.021509517557298144, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.29it/s]


abstract - total: 53.24, yes/no: 61.78(3024.0/4895.0), number: 42.11(707.0/1679.0), other: 48.94(2633.0/5380.0)
real - total: 44.65, yes/no: 58.11(2622.0/4512.0), number: 28.19(431.0/1529.0), other: 38.72(2325.0/6005.0)
Epoch 18 train_loss: 0.02143276438675821, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:40<00:00, 36.69it/s]


abstract - total: 50.59, yes/no: 54.77(2681.0/4895.0), number: 43.90(737.0/1679.0), other: 48.87(2629.0/5380.0)
real - total: 43.61, yes/no: 54.21(2446.0/4512.0), number: 33.03(505.0/1529.0), other: 38.33(2302.0/6005.0)
Epoch 19 train_loss: 0.021319886981820066, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:38<00:00, 38.46it/s]


abstract - total: 52.88, yes/no: 60.82(2977.0/4895.0), number: 41.57(698.0/1679.0), other: 49.18(2646.0/5380.0)
real - total: 45.67, yes/no: 57.76(2606.0/4512.0), number: 30.74(470.0/1529.0), other: 40.38(2425.0/6005.0)
Epoch 20 train_loss: 0.021220025924344858, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.21it/s]


abstract - total: 52.01, yes/no: 60.65(2969.0/4895.0), number: 38.53(647.0/1679.0), other: 48.35(2601.0/5380.0)
real - total: 44.96, yes/no: 59.40(2680.0/4512.0), number: 30.48(466.0/1529.0), other: 37.80(2270.0/6005.0)
Epoch 21 train_loss: 0.02123267859344681, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.28it/s]


abstract - total: 53.26, yes/no: 63.45(3106.0/4895.0), number: 37.05(622.0/1679.0), other: 49.05(2639.0/5380.0)
real - total: 44.96, yes/no: 60.57(2733.0/4512.0), number: 27.93(427.0/1529.0), other: 37.57(2256.0/6005.0)
Epoch 22 train_loss: 0.02110851137402157, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.41it/s]


abstract - total: 50.91, yes/no: 53.40(2614.0/4895.0), number: 40.74(684.0/1679.0), other: 51.82(2788.0/5380.0)
real - total: 43.44, yes/no: 51.51(2324.0/4512.0), number: 33.09(506.0/1529.0), other: 40.02(2403.0/6005.0)
Epoch 23 train_loss: 0.021054163949564098, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:38<00:00, 38.51it/s]


abstract - total: 49.86, yes/no: 52.77(2583.0/4895.0), number: 41.81(702.0/1679.0), other: 49.72(2675.0/5380.0)
real - total: 40.96, yes/no: 47.78(2156.0/4512.0), number: 28.71(439.0/1529.0), other: 38.95(2339.0/6005.0)
Epoch 24 train_loss: 0.020919770452504358, patience: 2


Batch: 100%|[32m██████████[0m| 1500/1500 [00:38<00:00, 38.52it/s]


abstract - total: 51.56, yes/no: 60.61(2967.0/4895.0), number: 28.53(479.0/1679.0), other: 50.50(2717.0/5380.0)
real - total: 44.07, yes/no: 57.89(2612.0/4512.0), number: 22.96(351.0/1529.0), other: 39.07(2346.0/6005.0)
Epoch 25 train_loss: 0.020723029558236402, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:38<00:00, 38.55it/s]


abstract - total: 53.34, yes/no: 61.37(3004.0/4895.0), number: 41.99(705.0/1679.0), other: 49.57(2667.0/5380.0)
real - total: 45.95, yes/no: 59.60(2689.0/4512.0), number: 31.00(474.0/1529.0), other: 39.50(2372.0/6005.0)
Epoch 26 train_loss: 0.020609913604333997, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:51<00:00, 29.20it/s]


abstract - total: 53.45, yes/no: 63.80(3123.0/4895.0), number: 37.28(626.0/1679.0), other: 49.09(2641.0/5380.0)
real - total: 44.69, yes/no: 59.44(2682.0/4512.0), number: 25.51(390.0/1529.0), other: 38.48(2311.0/6005.0)
Epoch 27 train_loss: 0.020686091057335338, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:49<00:00, 30.01it/s]


abstract - total: 53.45, yes/no: 60.12(2943.0/4895.0), number: 45.74(768.0/1679.0), other: 49.80(2679.0/5380.0)
real - total: 44.90, yes/no: 60.57(2733.0/4512.0), number: 32.18(492.0/1529.0), other: 36.37(2184.0/6005.0)
Epoch 28 train_loss: 0.020579366167386374, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:45<00:00, 32.80it/s]


abstract - total: 53.35, yes/no: 59.69(2922.0/4895.0), number: 43.48(730.0/1679.0), other: 50.65(2725.0/5380.0)
real - total: 44.74, yes/no: 55.98(2526.0/4512.0), number: 32.57(498.0/1529.0), other: 39.38(2365.0/6005.0)
Epoch 29 train_loss: 0.020576574767008422, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:38<00:00, 38.54it/s]


abstract - total: 53.45, yes/no: 60.90(2981.0/4895.0), number: 39.96(671.0/1679.0), other: 50.89(2738.0/5380.0)
real - total: 45.85, yes/no: 59.04(2664.0/4512.0), number: 27.99(428.0/1529.0), other: 40.48(2431.0/6005.0)
Epoch 30 train_loss: 0.020471263483166694, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.45it/s]


abstract - total: 53.03, yes/no: 59.18(2897.0/4895.0), number: 43.48(730.0/1679.0), other: 50.41(2712.0/5380.0)
real - total: 46.57, yes/no: 57.23(2582.0/4512.0), number: 32.57(498.0/1529.0), other: 42.13(2530.0/6005.0)
Epoch 31 train_loss: 0.020357923549289506, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:38<00:00, 38.60it/s]


abstract - total: 51.37, yes/no: 58.77(2877.0/4895.0), number: 36.51(613.0/1679.0), other: 49.28(2651.0/5380.0)
real - total: 44.94, yes/no: 57.89(2612.0/4512.0), number: 27.27(417.0/1529.0), other: 39.72(2385.0/6005.0)
Epoch 32 train_loss: 0.020339789257074397, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:38<00:00, 38.74it/s]


abstract - total: 51.10, yes/no: 61.59(3015.0/4895.0), number: 25.31(425.0/1679.0), other: 49.61(2669.0/5380.0)
real - total: 44.75, yes/no: 59.24(2673.0/4512.0), number: 22.17(339.0/1529.0), other: 39.60(2378.0/6005.0)
Epoch 33 train_loss: 0.02030470875153939, patience: 2


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.10it/s]


abstract - total: 53.92, yes/no: 63.37(3102.0/4895.0), number: 41.69(700.0/1679.0), other: 49.14(2644.0/5380.0)
real - total: 45.45, yes/no: 59.29(2675.0/4512.0), number: 31.79(486.0/1529.0), other: 38.53(2314.0/6005.0)
Epoch 34 train_loss: 0.020252415052304667, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.17it/s]


abstract - total: 55.04, yes/no: 62.72(3070.0/4895.0), number: 40.20(675.0/1679.0), other: 52.68(2834.0/5380.0)
real - total: 45.06, yes/no: 58.93(2659.0/4512.0), number: 25.11(384.0/1529.0), other: 39.72(2385.0/6005.0)
Epoch 35 train_loss: 0.020121688252935806, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.44it/s]


abstract - total: 54.63, yes/no: 62.76(3072.0/4895.0), number: 43.78(735.0/1679.0), other: 50.63(2724.0/5380.0)
real - total: 46.11, yes/no: 59.69(2693.0/4512.0), number: 29.82(456.0/1529.0), other: 40.05(2405.0/6005.0)
Epoch 36 train_loss: 0.020267752868433794, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:38<00:00, 38.47it/s]


abstract - total: 55.00, yes/no: 62.98(3083.0/4895.0), number: 44.31(744.0/1679.0), other: 51.08(2748.0/5380.0)
real - total: 46.47, yes/no: 60.64(2736.0/4512.0), number: 32.18(492.0/1529.0), other: 39.47(2370.0/6005.0)
Epoch 37 train_loss: 0.02000300668552518, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:38<00:00, 38.56it/s]


abstract - total: 52.92, yes/no: 59.20(2898.0/4895.0), number: 43.84(736.0/1679.0), other: 50.04(2692.0/5380.0)
real - total: 45.29, yes/no: 58.78(2652.0/4512.0), number: 32.96(504.0/1529.0), other: 38.30(2300.0/6005.0)
Epoch 38 train_loss: 0.0199553440498809, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.23it/s]


abstract - total: 51.91, yes/no: 59.82(2928.0/4895.0), number: 36.27(609.0/1679.0), other: 49.59(2668.0/5380.0)
real - total: 43.06, yes/no: 54.48(2458.0/4512.0), number: 27.53(421.0/1529.0), other: 38.43(2308.0/6005.0)
Epoch 39 train_loss: 0.019888405011345943, patience: 2


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.44it/s]


abstract - total: 55.40, yes/no: 63.19(3093.0/4895.0), number: 43.54(731.0/1679.0), other: 52.03(2799.0/5380.0)
real - total: 46.03, yes/no: 60.35(2723.0/4512.0), number: 31.13(476.0/1529.0), other: 39.07(2346.0/6005.0)
Epoch 40 train_loss: 0.019893219674626986, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.35it/s]


abstract - total: 47.43, yes/no: 49.72(2434.0/4895.0), number: 38.24(642.0/1679.0), other: 48.22(2594.0/5380.0)
real - total: 39.12, yes/no: 44.37(2002.0/4512.0), number: 27.67(423.0/1529.0), other: 38.08(2287.0/6005.0)
Epoch 41 train_loss: 0.019893553449461858, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.28it/s]


abstract - total: 54.04, yes/no: 62.94(3081.0/4895.0), number: 44.73(751.0/1679.0), other: 48.85(2628.0/5380.0)
real - total: 45.43, yes/no: 60.82(2744.0/4512.0), number: 29.76(455.0/1529.0), other: 37.87(2274.0/6005.0)
Epoch 42 train_loss: 0.01987553517272075, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:38<00:00, 38.55it/s]


abstract - total: 50.11, yes/no: 58.06(2842.0/4895.0), number: 29.06(488.0/1679.0), other: 49.44(2660.0/5380.0)
real - total: 43.04, yes/no: 55.12(2487.0/4512.0), number: 24.07(368.0/1529.0), other: 38.80(2330.0/6005.0)
Epoch 43 train_loss: 0.01982358392390112, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.39it/s]


abstract - total: 52.25, yes/no: 60.82(2977.0/4895.0), number: 40.02(672.0/1679.0), other: 48.27(2597.0/5380.0)
real - total: 44.80, yes/no: 58.27(2629.0/4512.0), number: 29.04(444.0/1529.0), other: 38.70(2324.0/6005.0)
Epoch 44 train_loss: 0.019824027887855967, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:38<00:00, 38.79it/s]


abstract - total: 52.64, yes/no: 64.23(3144.0/4895.0), number: 28.83(484.0/1679.0), other: 49.52(2664.0/5380.0)
real - total: 44.40, yes/no: 59.60(2689.0/4512.0), number: 19.82(303.0/1529.0), other: 39.23(2356.0/6005.0)
Epoch 45 train_loss: 0.019707053136701385, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.34it/s]


abstract - total: 52.97, yes/no: 59.22(2899.0/4895.0), number: 42.64(716.0/1679.0), other: 50.50(2717.0/5380.0)
real - total: 44.84, yes/no: 55.98(2526.0/4512.0), number: 31.98(489.0/1529.0), other: 39.73(2386.0/6005.0)
Epoch 46 train_loss: 0.019568767715245485, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:38<00:00, 38.53it/s]


abstract - total: 54.04, yes/no: 64.74(3169.0/4895.0), number: 40.50(680.0/1679.0), other: 48.53(2611.0/5380.0)
real - total: 45.30, yes/no: 60.46(2728.0/4512.0), number: 28.71(439.0/1529.0), other: 38.13(2290.0/6005.0)
Epoch 47 train_loss: 0.019645010529085995, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:38<00:00, 38.50it/s]


abstract - total: 55.06, yes/no: 63.80(3123.0/4895.0), number: 44.13(741.0/1679.0), other: 50.52(2718.0/5380.0)
real - total: 46.16, yes/no: 59.49(2684.0/4512.0), number: 30.09(460.0/1529.0), other: 40.25(2417.0/6005.0)
Epoch 48 train_loss: 0.01946604947807888, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:39<00:00, 38.43it/s]


abstract - total: 53.66, yes/no: 59.80(2927.0/4895.0), number: 45.98(772.0/1679.0), other: 50.46(2715.0/5380.0)
real - total: 45.07, yes/no: 57.03(2573.0/4512.0), number: 30.35(464.0/1529.0), other: 39.83(2392.0/6005.0)
Epoch 49 train_loss: 0.019484972908472023, patience: 1
abstract - total: 52.88, yes/no: 59.39(3638.0/6126.0), number: 44.25(997.0/2253.0), other: 49.85(3372.0/6764.0)
real - total: 44.80, yes/no: 56.36(3151.0/5591.0), number: 29.59(548.0/1852.0), other: 39.88(2957.0/7414.0)


In [16]:
trained_model = VQA_Model1_Precalc(model, device)
# freeze the clip model
for param in trained_model.model.parameters():
    param.requires_grad = False

#evaluate(trained_model, test_dataloader, device, test_size, show_progress=True)
filename_save = trained_model.__class__.__name__+"_"+dataset_type
train(trained_model, train_dataloader, val_dataloader, device, epochs=50, save_name=filename_save)
# evaluate the model
evaluate(trained_model, test_dataloader, device);

cuda
<generator object Module.parameters at 0x00000237D7CE1CF0>


Batch: 100%|[32m██████████[0m| 1500/1500 [00:42<00:00, 35.09it/s]


abstract - total: 48.98, yes/no: 54.71(2678.0/4895.0), number: 33.11(556.0/1679.0), other: 48.72(2621.0/5380.0)
real - total: 44.05, yes/no: 46.10(2080.0/4512.0), number: 25.18(385.0/1529.0), other: 47.31(2841.0/6005.0)
Epoch 0 train_loss: 0.12611973020186026, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:43<00:00, 34.88it/s]


abstract - total: 51.50, yes/no: 55.02(2693.0/4895.0), number: 39.55(664.0/1679.0), other: 52.03(2799.0/5380.0)
real - total: 45.83, yes/no: 47.94(2163.0/4512.0), number: 25.83(395.0/1529.0), other: 49.34(2963.0/6005.0)
Epoch 1 train_loss: 0.0976900674564143, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:43<00:00, 34.37it/s]


abstract - total: 51.73, yes/no: 56.28(2755.0/4895.0), number: 32.16(540.0/1679.0), other: 53.70(2889.0/5380.0)
real - total: 46.37, yes/no: 49.49(2233.0/4512.0), number: 25.83(395.0/1529.0), other: 49.26(2958.0/6005.0)
Epoch 2 train_loss: 0.08593888439858953, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:43<00:00, 34.36it/s]


abstract - total: 53.91, yes/no: 54.34(2660.0/4895.0), number: 40.56(681.0/1679.0), other: 57.68(3103.0/5380.0)
real - total: 47.09, yes/no: 49.40(2229.0/4512.0), number: 25.96(397.0/1529.0), other: 50.74(3047.0/6005.0)
Epoch 3 train_loss: 0.0791211490308245, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:43<00:00, 34.39it/s]


abstract - total: 50.78, yes/no: 53.85(2636.0/4895.0), number: 16.50(277.0/1679.0), other: 58.68(3157.0/5380.0)
real - total: 46.14, yes/no: 47.32(2135.0/4512.0), number: 20.67(316.0/1529.0), other: 51.74(3107.0/6005.0)
Epoch 4 train_loss: 0.07502671575918794, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:43<00:00, 34.41it/s]


abstract - total: 52.81, yes/no: 55.61(2722.0/4895.0), number: 36.99(621.0/1679.0), other: 55.20(2970.0/5380.0)
real - total: 46.87, yes/no: 46.59(2102.0/4512.0), number: 31.20(477.0/1529.0), other: 51.07(3067.0/6005.0)
Epoch 5 train_loss: 0.07304213851938646, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:43<00:00, 34.39it/s]


abstract - total: 51.77, yes/no: 52.77(2583.0/4895.0), number: 35.14(590.0/1679.0), other: 56.06(3016.0/5380.0)
real - total: 47.97, yes/no: 51.44(2321.0/4512.0), number: 25.05(383.0/1529.0), other: 51.19(3074.0/6005.0)
Epoch 6 train_loss: 0.06949281831085682, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:43<00:00, 34.59it/s]


abstract - total: 51.51, yes/no: 50.34(2464.0/4895.0), number: 31.27(525.0/1679.0), other: 58.90(3169.0/5380.0)
real - total: 48.01, yes/no: 51.77(2336.0/4512.0), number: 23.81(364.0/1529.0), other: 51.34(3083.0/6005.0)
Epoch 7 train_loss: 0.06740236510212223, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:43<00:00, 34.46it/s]


abstract - total: 52.02, yes/no: 48.52(2375.0/4895.0), number: 42.47(713.0/1679.0), other: 58.20(3131.0/5380.0)
real - total: 47.60, yes/no: 49.71(2243.0/4512.0), number: 28.12(430.0/1529.0), other: 50.97(3061.0/6005.0)
Epoch 8 train_loss: 0.06696939444666107, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:43<00:00, 34.44it/s]


abstract - total: 55.24, yes/no: 57.85(2832.0/4895.0), number: 38.42(645.0/1679.0), other: 58.10(3126.0/5380.0)
real - total: 50.02, yes/no: 57.47(2593.0/4512.0), number: 28.32(433.0/1529.0), other: 49.94(2999.0/6005.0)
Epoch 9 train_loss: 0.06432202647191783, patience: 0


Batch: 100%|[32m██████████[0m| 1500/1500 [00:43<00:00, 34.27it/s]


abstract - total: 54.00, yes/no: 59.51(2913.0/4895.0), number: 30.49(512.0/1679.0), other: 56.32(3030.0/5380.0)
real - total: 49.48, yes/no: 56.07(2530.0/4512.0), number: 28.32(433.0/1529.0), other: 49.91(2997.0/6005.0)
Epoch 10 train_loss: 0.06319272099683682, patience: 1


Batch: 100%|[32m██████████[0m| 1500/1500 [00:43<00:00, 34.49it/s]


abstract - total: 55.49, yes/no: 58.20(2849.0/4895.0), number: 37.22(625.0/1679.0), other: 58.72(3159.0/5380.0)
real - total: 47.58, yes/no: 51.53(2325.0/4512.0), number: 25.77(394.0/1529.0), other: 50.16(3012.0/6005.0)
Epoch 11 train_loss: 0.0626291747558862, patience: 2


Batch: 100%|[32m██████████[0m| 1500/1500 [00:43<00:00, 34.32it/s]


abstract - total: 52.93, yes/no: 55.53(2718.0/4895.0), number: 28.65(481.0/1679.0), other: 58.14(3128.0/5380.0)
real - total: 47.48, yes/no: 50.64(2285.0/4512.0), number: 27.60(422.0/1529.0), other: 50.17(3013.0/6005.0)
Epoch 12 train_loss: 0.061971628256142136, patience: 3
early stopping
abstract - total: 53.29, yes/no: 55.24(3384.0/6126.0), number: 30.14(679.0/2253.0), other: 59.24(4007.0/6764.0)
real - total: 47.22, yes/no: 49.44(2764.0/5591.0), number: 26.62(493.0/1852.0), other: 50.69(3758.0/7414.0)


## Save the model!

In [None]:
# save the model
trained_model.save("trained_model_")

## Load and evaluate


In [17]:
trained_model = VQA_Model_Precalc(model, device) # need to choose the same model!
dataset_type="combined"
improvement="_dropout2e-1"
filename_load = trained_model.__class__.__name__+"_"+dataset_type+improvement
trained_model.load(filename_load)
#print(trained_model)
evaluate(trained_model, test_dataloader, device, show_progress=True);

100%|██████████| 469/469 [00:12<00:00, 36.59it/s]

abstract - total: 54.31, yes/no: 62.00(3798.0/6126.0), number: 43.41(978.0/2253.0), other: 50.98(3448.0/6764.0)
real - total: 46.01, yes/no: 59.79(3343.0/5591.0), number: 29.54(547.0/1852.0), other: 39.74(2946.0/7414.0)



