## Load the dataset


In [1]:
from VQA_Dataset_CLIP import VQA_Dataset, VQA_Dataset_preloaded, VQA_Dataset_Sentences
import clip
import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import random_split

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
model, preprocess = clip.load("ViT-B/32", device=device)
print(preprocess.__class__.__name__)
model.to(torch.float32)
print("clip model loaded")

Using cuda device
Compose
clip model loaded


In [2]:
print(model)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

### 2.Precalc clip embedding to train faster

In [3]:
dataset_abstract_train = VQA_Dataset_preloaded()
#dataset_abstract_train.compute_store(preprocess, model, device, "dataset_abstract_train_60k_sentencesv2", name="train", length=60000, mode="scale", real=False, sentences=True)

In [4]:
dataset_real = VQA_Dataset_preloaded()
dataset_abstract_test = VQA_Dataset_preloaded()

### 3.Only load

In [5]:
from torch.utils.data import ConcatDataset
#dataset_train = VQA_Dataset_preloaded()
#dataset_train.load("full_", device, length=32755)# 248348)
sentences=True
adding=""
if sentences:
    adding="_sentences_"

dataset_real.load("dataset_real_75kv2"+adding, device, name="val", length=75000)
dataset_abstract_train.load("dataset_abstract_train_60kv2"+adding, device, name="train", length=60000)
dataset_abstract_test.load("dataset_abstract_test_15kv2"+adding, device, name="val", length=15000)
dataset_abstract = ConcatDataset([dataset_abstract_train, dataset_abstract_test])

In [6]:
#Combine abstract and real datasets
class CustomDataset(Dataset):
    def __init__(self, dataset, real=1):
        self.existing_dataset = dataset
        self.real = real

    def __len__(self):
        return len(self.existing_dataset)

    def __getitem__(self, index):
        return  self.existing_dataset[index]+(self.real, )
    
dataset_real_extended = CustomDataset(dataset_real, real=1)
dataset_abstract_extended = CustomDataset(dataset_abstract, real=0)

In [7]:
dataset_type = "combined"
if dataset_type == "real":
    dataset = dataset_real
elif dataset_type == "abstract":
    dataset = dataset_abstract
elif dataset_type == "combined":
    dataset_combined = ConcatDataset([dataset_real_extended, dataset_abstract_extended])

In [8]:
import tqdm
import numpy as np

#Contrastive approach
class ContrastiveDataset(Dataset):
    def __init__(self, dataset, relation_imbalaced):
        self.new_dataset = []
        for element in tqdm.tqdm(dataset):
            samples = []
            image, mc_answers, question, index_answer, answer_type, real_type = element

            excluded_index = int(index_answer.cpu().item())
            random_indexs_keep = np.random.choice([i for i in range(0, 18) if i != excluded_index], size=relation_imbalaced, replace=False)
              
            samples.append((image, mc_answers[excluded_index], question, 1, answer_type, real_type))
            
            for keep_index in random_indexs_keep:
                samples.append((image, mc_answers[keep_index], question, 0, answer_type, real_type))
            
            self.new_dataset.extend(samples)
        
    def __len__(self):
        return len(self.new_dataset)

    def __getitem__(self, index):
        return  self.new_dataset[index]

ratio_positive_negative = 4    
#data_combined_contrastive = ContrastiveDataset(dataset_combined, relation_imbalaced = ratio_positive_negative)

### Create Dataloader

## New with both Validation and Training Datasets from MSCOCO

In [10]:
batch_size=128

train_size = int(len(dataset_combined)*0.8*0.8)
val_size = int(len(dataset_combined)*0.8*0.2)
test_size = int(len(dataset_combined))-val_size-train_size
print("Train size: ", train_size)
print("Val size: ", val_size)
print("Test size: ", test_size)
generator = torch.Generator().manual_seed(42)
train_dataset_combined, val_dataset_combined, test_dataset_combined = random_split(dataset_combined, [train_size, val_size, test_size], generator=generator)

train_dataloader_combined = DataLoader(train_dataset_combined, batch_size=batch_size, shuffle=True)
test_dataloader_combined = DataLoader(test_dataset_combined, batch_size=batch_size, shuffle=True)
val_dataloader_combined = DataLoader(val_dataset_combined, batch_size=batch_size, shuffle=True)

Train size:  96000
Val size:  24000
Test size:  30000


In [11]:
train_dataset_constrastive = ContrastiveDataset(train_dataset_combined, relation_imbalaced = ratio_positive_negative)
val_dataset_constrastive = ContrastiveDataset(val_dataset_combined, relation_imbalaced = ratio_positive_negative)
test_dataset_constrastive = ContrastiveDataset(test_dataset_combined, relation_imbalaced = ratio_positive_negative)

train_dataloader = DataLoader(train_dataset_constrastive, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset_constrastive, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset_constrastive, batch_size=batch_size, shuffle=True)

100%|██████████| 96000/96000 [01:02<00:00, 1523.81it/s]
100%|██████████| 24000/24000 [00:20<00:00, 1186.74it/s]
100%|██████████| 30000/30000 [01:16<00:00, 390.14it/s] 


In [12]:
next(iter(train_dataloader))[3]

tensor([0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
        0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
        1, 1, 0, 0, 1, 1, 0, 1])

## Simple model architectue


In [49]:
from models import VQA_Model_Precalc_Zero, VQA_Model1_Precalc, VQA_Model_Precalc, VQA_Model4_Precalc, VQA_Model_Discr, VQA_Model_Discr_Siamese

## Evaluate

In [50]:
class performanceAnalysis:
    def __init__(self, dataset_type="combined"):
        self.combined = dataset_type=="combined"
        self.dataset_type = dataset_type
        self.counters = np.zeros((self.combined+1, 3)) 
        self.total = np.zeros((self.combined+1, 3))
    def it(self, correct, answer_types, real_types=None):#both numpy arrays
        if not self.combined:
            for i in range(len(self.counters[0])):
                self.counters[0][i]+=np.sum(np.logical_and(correct, answer_types==i))
                self.total[0][i]+=np.sum(answer_types==i)
        else:
            is_real = (real_types==1).reshape(len(real_types), -1)
            is_abstract = np.logical_not(is_real)
            for i in range(len(self.counters[0])):
                self.counters[0][i]+=np.sum(np.logical_and(np.logical_and(correct, answer_types==i), is_abstract))
                self.total[0][i]+=np.sum(np.logical_and(answer_types==i, is_abstract))

                self.counters[1][i]+=np.sum(np.logical_and(np.logical_and(correct, answer_types==i), is_real))
                self.total[1][i]+=np.sum(np.logical_and(answer_types==i, is_real))

    def get_accuracies(self):
        if not self.combined:
            dictionary = {self.dataset_type: {"yes/no": 100*self.counters[0][0]/self.total[0][0], "number": 100*self.counters[0][1]/self.total[0][1], "other": 100*self.counters[0][2]/self.total[0][2], "total": 100*sum(self.counters[0])/sum(self.total[0])}}
        else:
            dictionary_abstract = {"yes/no": 100*self.counters[0][0]/self.total[0][0], "number": 100*self.counters[0][1]/self.total[0][1], "other": 100*self.counters[0][2]/self.total[0][2], "total": 100*sum(self.counters[0])/sum(self.total[0])}
            dictionary_real = {"yes/no": 100*self.counters[1][0]/self.total[1][0], "number": 100*self.counters[1][1]/self.total[1][1], "other": 100*self.counters[1][2]/self.total[1][2], "total": 100*sum(self.counters[1])/sum(self.total[1])}
            dictionary = {"abstract": dictionary_abstract, "real": dictionary_real} 
        return dictionary
    
def printResults(dict_performance, performAnalysis):
    info = ""
    for i, element in enumerate(dict_performance):
        info += (element+" - ")
        info += f"total: {dict_performance[element]['total']:.2f}, yes/no: {dict_performance[element]['yes/no']:.2f}({performAnalysis.counters[i][0]}/{performAnalysis.total[i][0]}), number: {dict_performance[element]['number']:.2f}({performAnalysis.counters[i][1]}/{performAnalysis.total[i][1]}), other: {dict_performance[element]['other']:.2f}({performAnalysis.counters[i][2]}/{performAnalysis.total[i][2]})"
        if dataset_type=="combined" and i==0:
            info += "\n"
    print(info)

In [52]:
import tqdm

def evaluate(model, dataloader, device, show_progress=False, score_each=False):
    model.eval()
    performAnalysis = performanceAnalysis(dataset_type)
    correct = 0
    sig = torch.nn.Sigmoid()
    if show_progress:
        pbar = tqdm.tqdm(dataloader)
    else:
        pbar = dataloader

    for i, data in enumerate(pbar):
        image = data[0].to(device)
        answer_tokens = data[1].squeeze(0).to(device)
        
        correct_answer = torch.tensor([int(x) for x in data[3]]).to(device)
        question_tokens = data[2].squeeze(1).to(device)

        outputs = torch.zeros((answer_tokens.shape[0], answer_tokens.shape[1])).cuda()
        with torch.no_grad():
            
            for mc_answer_index in range(answer_tokens.shape[1]):
                outputs[:, mc_answer_index] = sig(model(image, question_tokens, answer_tokens[:, mc_answer_index, :]).squeeze(1))

            pred = outputs.argmax(dim=-1)
            # get sum of correct answers
            correct += (pred == correct_answer).sum().item()
            if dataset_type != "combined":
                performAnalysis.it((pred==correct_answer).cpu().numpy().reshape(len(pred), -1), data[4].cpu().numpy())
            else:
                performAnalysis.it((pred==correct_answer).cpu().numpy().reshape(len(pred), -1), data[4].cpu().numpy(), data[5].cpu().numpy())
    dict_performance = performAnalysis.get_accuracies()
    printResults(dict_performance, performAnalysis)
    return dict_performance

In [53]:
def evaluateSiamese(model, dataloader, device, show_progress=False, score_each=False):
    model.eval()
    performAnalysis = performanceAnalysis(dataset_type)

    if show_progress:
        pbar = tqdm.tqdm(dataloader)
    else:
        pbar = dataloader

    for i, data in enumerate(pbar):
        image = data[0].to(device)
        target = data[1].to(device)
        question_tokens = data[2].squeeze(1).to(device)

        with torch.no_grad():
            sig = torch.nn.Sigmoid()
            output = sig(model(image, question_tokens, target).squeeze(1))
            
            label = data[3].to(device)

            if dataset_type != "combined":
                performAnalysis.it(((output > 0.5)*label).cpu().numpy().reshape(len(output), -1), data[4].cpu().numpy())
            else:
                performAnalysis.it(((output > 0.5)*label).cpu().numpy().reshape(len(output), -1), data[4].cpu().numpy(), data[5].cpu().numpy())
    dict_performance = performAnalysis.get_accuracies()
    printResults(dict_performance, performAnalysis)
    return dict_performance


## Training

In [54]:
#Contrastive learning
def train_siamese(model, train_dataloader, val_dataloader, device, epochs=10, patience=3, precalculated=False, save_name="model", loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([ratio_positive_negative]).cuda())):
    print(device)
    print(model.parameters())
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)#, weight_decay=1e-4)

    prev_acc = 0.0
    best_acc = 0.0
    count = 0
    
    for epoch in range(epochs):
        inner_bar = tqdm.tqdm(train_dataloader, desc='Batch', colour='green')
        loss_accum = 0
        
        model.train()
        for data in inner_bar:
            
            image = data[0].to(device)
            target = data[1].to(device)
            question_tokens = data[2].squeeze(1).to(device)
        
            output = model(image, question_tokens, target)
            
            label = data[3].to(device)
            loss = loss_fn(output.squeeze(1), label.float())


            optimizer.zero_grad()
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=4.0, error_if_nonfinite=True)
            optimizer.step()

            loss_accum += loss.item()
        
        dic = evaluateSiamese(model, val_dataloader, device)
        acc = (dic["abstract"]["total"]+dic["real"]["total"])/2

        if acc > best_acc:
            best_acc = acc
            # save model
            model.save(save_name)
        if acc < prev_acc:
            count += 1
        else:
            count = 0
        prev_acc = acc
        print(f"Epoch {epoch} train_loss: {loss_accum/len(train_dataloader)}, patience: {count}")
        if count == patience:
            print("early stopping")
            break 
        inner_bar.close()

In [55]:
trained_model = VQA_Model_Discr_Siamese(model, device)
# freeze the clip model
for param in trained_model.model.parameters():
    param.requires_grad = False

#evaluate(trained_model, test_dataloader, device, test_size, show_progress=True)
improvement=adding+"v2"+"_contrastive"#+"_dropout2e-1"
filename_save = trained_model.__class__.__name__+"_"+dataset_type+improvement
train_siamese(trained_model, train_dataloader, val_dataloader, device, epochs=100, save_name=filename_save)
# evaluate the model
evaluateSiamese(trained_model, test_dataloader, device);

cuda
<generator object Module.parameters at 0x00000177A7379900>


Batch:   0%|[32m          [0m| 0/3750 [00:00<?, ?it/s]

Batch: 100%|[32m██████████[0m| 3750/3750 [00:26<00:00, 142.99it/s]


abstract - total: 16.60, yes/no: 19.12(4680.0/24475.0), number: 16.81(1411.0/8395.0), other: 14.25(3832.0/26900.0)
real - total: 16.13, yes/no: 19.35(4365.0/22560.0), number: 15.95(1219.0/7645.0), other: 13.77(4133.0/30025.0)
Epoch 0 train_loss: 0.7512066379944483, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.76it/s]


abstract - total: 17.30, yes/no: 19.16(4689.0/24475.0), number: 17.53(1472.0/8395.0), other: 15.53(4178.0/26900.0)
real - total: 16.62, yes/no: 19.14(4319.0/22560.0), number: 16.56(1266.0/7645.0), other: 14.74(4426.0/30025.0)
Epoch 1 train_loss: 0.6690324909845988, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.88it/s]


abstract - total: 17.90, yes/no: 19.55(4784.0/24475.0), number: 17.89(1502.0/8395.0), other: 16.40(4412.0/26900.0)
real - total: 17.06, yes/no: 19.50(4400.0/22560.0), number: 17.20(1315.0/7645.0), other: 15.19(4562.0/30025.0)
Epoch 2 train_loss: 0.638984479268392, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.27it/s]


abstract - total: 17.83, yes/no: 19.33(4731.0/24475.0), number: 18.63(1564.0/8395.0), other: 16.22(4364.0/26900.0)
real - total: 17.20, yes/no: 19.35(4365.0/22560.0), number: 18.02(1378.0/7645.0), other: 15.37(4615.0/30025.0)
Epoch 3 train_loss: 0.6186385079542795, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 151.51it/s]


abstract - total: 17.83, yes/no: 19.21(4702.0/24475.0), number: 18.27(1534.0/8395.0), other: 16.43(4420.0/26900.0)
real - total: 16.85, yes/no: 19.30(4354.0/22560.0), number: 17.23(1317.0/7645.0), other: 14.91(4477.0/30025.0)
Epoch 4 train_loss: 0.601695081226031, patience: 1


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 151.27it/s]


abstract - total: 17.98, yes/no: 19.64(4807.0/24475.0), number: 18.31(1537.0/8395.0), other: 16.38(4405.0/26900.0)
real - total: 16.91, yes/no: 19.55(4411.0/22560.0), number: 16.60(1269.0/7645.0), other: 14.99(4502.0/30025.0)
Epoch 5 train_loss: 0.5879533090909322, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 151.49it/s]


abstract - total: 18.34, yes/no: 19.73(4829.0/24475.0), number: 18.50(1553.0/8395.0), other: 17.02(4578.0/26900.0)
real - total: 17.37, yes/no: 19.72(4449.0/22560.0), number: 17.31(1323.0/7645.0), other: 15.63(4692.0/30025.0)
Epoch 6 train_loss: 0.5762225297451019, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 151.36it/s]


abstract - total: 17.89, yes/no: 19.62(4801.0/24475.0), number: 18.34(1540.0/8395.0), other: 16.17(4349.0/26900.0)
real - total: 16.43, yes/no: 19.32(4359.0/22560.0), number: 16.98(1298.0/7645.0), other: 14.11(4237.0/30025.0)
Epoch 7 train_loss: 0.565629390390714, patience: 1


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 152.23it/s]


abstract - total: 18.30, yes/no: 19.68(4817.0/24475.0), number: 18.78(1577.0/8395.0), other: 16.90(4546.0/26900.0)
real - total: 17.30, yes/no: 19.55(4410.0/22560.0), number: 17.67(1351.0/7645.0), other: 15.52(4660.0/30025.0)
Epoch 8 train_loss: 0.5560427691141764, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 151.87it/s]


abstract - total: 17.56, yes/no: 19.51(4776.0/24475.0), number: 17.59(1477.0/8395.0), other: 15.78(4245.0/26900.0)
real - total: 16.31, yes/no: 19.34(4364.0/22560.0), number: 16.70(1277.0/7645.0), other: 13.94(4184.0/30025.0)
Epoch 9 train_loss: 0.5476855791012446, patience: 1


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 151.36it/s]


abstract - total: 18.41, yes/no: 19.68(4816.0/24475.0), number: 18.12(1521.0/8395.0), other: 17.35(4667.0/26900.0)
real - total: 17.63, yes/no: 19.56(4413.0/22560.0), number: 17.36(1327.0/7645.0), other: 16.25(4880.0/30025.0)
Epoch 10 train_loss: 0.5391001469771067, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 151.38it/s]


abstract - total: 18.35, yes/no: 19.68(4816.0/24475.0), number: 18.45(1549.0/8395.0), other: 17.12(4605.0/26900.0)
real - total: 17.11, yes/no: 19.52(4403.0/22560.0), number: 17.25(1319.0/7645.0), other: 15.26(4582.0/30025.0)
Epoch 11 train_loss: 0.5320007536967596, patience: 1


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 151.28it/s]


abstract - total: 17.42, yes/no: 19.68(4817.0/24475.0), number: 17.26(1449.0/8395.0), other: 15.42(4147.0/26900.0)
real - total: 16.22, yes/no: 19.43(4384.0/22560.0), number: 15.92(1217.0/7645.0), other: 13.88(4167.0/30025.0)
Epoch 12 train_loss: 0.5244380723158518, patience: 2


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.80it/s]


abstract - total: 18.01, yes/no: 19.70(4821.0/24475.0), number: 17.89(1502.0/8395.0), other: 16.51(4440.0/26900.0)
real - total: 16.77, yes/no: 19.51(4402.0/22560.0), number: 17.23(1317.0/7645.0), other: 14.58(4379.0/30025.0)
Epoch 13 train_loss: 0.5177175584634145, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 151.44it/s]


abstract - total: 17.89, yes/no: 19.69(4818.0/24475.0), number: 17.57(1475.0/8395.0), other: 16.35(4398.0/26900.0)
real - total: 16.59, yes/no: 19.44(4385.0/22560.0), number: 16.39(1253.0/7645.0), other: 14.51(4356.0/30025.0)
Epoch 14 train_loss: 0.5107682067950566, patience: 1


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 151.57it/s]


abstract - total: 17.69, yes/no: 19.58(4793.0/24475.0), number: 17.97(1509.0/8395.0), other: 15.88(4271.0/26900.0)
real - total: 16.65, yes/no: 19.41(4378.0/22560.0), number: 16.89(1291.0/7645.0), other: 14.52(4359.0/30025.0)
Epoch 15 train_loss: 0.506152428038915, patience: 2


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.92it/s]


abstract - total: 17.72, yes/no: 19.53(4780.0/24475.0), number: 17.59(1477.0/8395.0), other: 16.12(4337.0/26900.0)
real - total: 16.83, yes/no: 19.47(4392.0/22560.0), number: 16.76(1281.0/7645.0), other: 14.86(4462.0/30025.0)
Epoch 16 train_loss: 0.499810741297404, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 151.07it/s]


abstract - total: 17.72, yes/no: 19.55(4784.0/24475.0), number: 18.05(1515.0/8395.0), other: 15.96(4292.0/26900.0)
real - total: 16.23, yes/no: 19.33(4361.0/22560.0), number: 16.10(1231.0/7645.0), other: 13.94(4184.0/30025.0)
Epoch 17 train_loss: 0.4943122979402542, patience: 1


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.30it/s]


abstract - total: 18.13, yes/no: 19.60(4797.0/24475.0), number: 18.37(1542.0/8395.0), other: 16.72(4498.0/26900.0)
real - total: 16.97, yes/no: 19.33(4361.0/22560.0), number: 17.19(1314.0/7645.0), other: 15.13(4544.0/30025.0)
Epoch 18 train_loss: 0.4887692974249522, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.62it/s]


abstract - total: 17.73, yes/no: 19.55(4786.0/24475.0), number: 18.28(1535.0/8395.0), other: 15.89(4275.0/26900.0)
real - total: 16.57, yes/no: 19.31(4356.0/22560.0), number: 16.86(1289.0/7645.0), other: 14.44(4336.0/30025.0)
Epoch 19 train_loss: 0.48428134436209996, patience: 1


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.24it/s]


abstract - total: 17.18, yes/no: 19.47(4765.0/24475.0), number: 17.55(1473.0/8395.0), other: 14.99(4033.0/26900.0)
real - total: 15.96, yes/no: 19.10(4309.0/22560.0), number: 16.30(1246.0/7645.0), other: 13.52(4058.0/30025.0)
Epoch 20 train_loss: 0.4796847851713498, patience: 2


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.78it/s]


abstract - total: 18.07, yes/no: 19.65(4810.0/24475.0), number: 17.32(1454.0/8395.0), other: 16.86(4535.0/26900.0)
real - total: 16.74, yes/no: 19.45(4387.0/22560.0), number: 16.34(1249.0/7645.0), other: 14.81(4448.0/30025.0)
Epoch 21 train_loss: 0.4736431425333023, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.44it/s]


abstract - total: 17.17, yes/no: 19.35(4737.0/24475.0), number: 17.11(1436.0/8395.0), other: 15.20(4089.0/26900.0)
real - total: 16.00, yes/no: 18.99(4284.0/22560.0), number: 16.09(1230.0/7645.0), other: 13.73(4121.0/30025.0)
Epoch 22 train_loss: 0.46983724038998287, patience: 1


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.41it/s]


abstract - total: 17.75, yes/no: 19.60(4796.0/24475.0), number: 17.77(1492.0/8395.0), other: 16.07(4324.0/26900.0)
real - total: 16.25, yes/no: 19.31(4357.0/22560.0), number: 16.19(1238.0/7645.0), other: 13.96(4190.0/30025.0)
Epoch 23 train_loss: 0.46603906942208606, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.50it/s]


abstract - total: 17.62, yes/no: 19.51(4775.0/24475.0), number: 17.50(1469.0/8395.0), other: 15.94(4288.0/26900.0)
real - total: 16.36, yes/no: 19.24(4341.0/22560.0), number: 16.39(1253.0/7645.0), other: 14.19(4261.0/30025.0)
Epoch 24 train_loss: 0.46140211900870004, patience: 1


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.40it/s]


abstract - total: 17.27, yes/no: 19.35(4736.0/24475.0), number: 17.50(1469.0/8395.0), other: 15.32(4120.0/26900.0)
real - total: 16.00, yes/no: 19.10(4309.0/22560.0), number: 16.23(1241.0/7645.0), other: 13.62(4089.0/30025.0)
Epoch 25 train_loss: 0.4580522923707962, patience: 2


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.74it/s]


abstract - total: 17.34, yes/no: 19.44(4757.0/24475.0), number: 16.83(1413.0/8395.0), other: 15.59(4193.0/26900.0)
real - total: 15.96, yes/no: 19.08(4305.0/22560.0), number: 15.67(1198.0/7645.0), other: 13.69(4111.0/30025.0)
Epoch 26 train_loss: 0.4537635591228803, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.47it/s]


abstract - total: 17.63, yes/no: 19.58(4793.0/24475.0), number: 17.39(1460.0/8395.0), other: 15.92(4283.0/26900.0)
real - total: 16.41, yes/no: 19.32(4359.0/22560.0), number: 16.43(1256.0/7645.0), other: 14.22(4269.0/30025.0)
Epoch 27 train_loss: 0.45024709061781565, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.72it/s]


abstract - total: 17.37, yes/no: 19.56(4788.0/24475.0), number: 16.46(1382.0/8395.0), other: 15.65(4211.0/26900.0)
real - total: 16.32, yes/no: 19.30(4355.0/22560.0), number: 15.88(1214.0/7645.0), other: 14.19(4261.0/30025.0)
Epoch 28 train_loss: 0.44735114761193595, patience: 1


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.18it/s]


abstract - total: 17.85, yes/no: 19.50(4772.0/24475.0), number: 18.20(1528.0/8395.0), other: 16.23(4366.0/26900.0)
real - total: 16.49, yes/no: 19.28(4350.0/22560.0), number: 16.76(1281.0/7645.0), other: 14.33(4303.0/30025.0)
Epoch 29 train_loss: 0.44416854898134867, patience: 0


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.01it/s]


abstract - total: 17.64, yes/no: 19.61(4799.0/24475.0), number: 17.53(1472.0/8395.0), other: 15.89(4274.0/26900.0)
real - total: 16.22, yes/no: 19.39(4375.0/22560.0), number: 16.06(1228.0/7645.0), other: 13.87(4165.0/30025.0)
Epoch 30 train_loss: 0.44030265264908475, patience: 1


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.48it/s]


abstract - total: 17.52, yes/no: 19.35(4736.0/24475.0), number: 17.37(1458.0/8395.0), other: 15.90(4277.0/26900.0)
real - total: 15.99, yes/no: 19.02(4291.0/22560.0), number: 15.63(1195.0/7645.0), other: 13.81(4146.0/30025.0)
Epoch 31 train_loss: 0.4379663781881332, patience: 2


Batch: 100%|[32m██████████[0m| 3750/3750 [00:24<00:00, 150.67it/s]


abstract - total: 17.55, yes/no: 19.54(4783.0/24475.0), number: 17.44(1464.0/8395.0), other: 15.78(4245.0/26900.0)
real - total: 15.85, yes/no: 19.24(4341.0/22560.0), number: 16.26(1243.0/7645.0), other: 13.20(3964.0/30025.0)
Epoch 32 train_loss: 0.43359036070108414, patience: 3
early stopping
abstract - total: 17.44, yes/no: 19.50(5972.0/30630.0), number: 17.53(1975.0/11265.0), other: 15.55(5260.0/33820.0)
real - total: 15.81, yes/no: 19.27(5388.0/27955.0), number: 15.77(1460.0/9260.0), other: 13.20(4893.0/37070.0)


In [56]:
evaluate(trained_model, test_dataloader_combined, device)

abstract - total: 42.48, yes/no: 51.93(3181.0/6126.0), number: 36.71(827.0/2253.0), other: 35.85(2425.0/6764.0)
real - total: 33.29, yes/no: 53.35(2983.0/5591.0), number: 19.71(365.0/1852.0), other: 21.55(1598.0/7414.0)


{'abstract': {'yes/no': 51.9262161279791,
  'number': 36.70661340434975,
  'other': 35.85156712004731,
  'total': 42.481674701182065},
 'real': {'yes/no': 53.35360400643892,
  'number': 19.70842332613391,
  'other': 21.553817102778527,
  'total': 33.2907047183146}}

## Save the model!

In [None]:
# save the model
trained_model.save("trained_model_")

## Load and evaluate


In [16]:
trained_model = VQA_Model_Precalc(model, device)
dataset_type="combined"
improvement=adding+"v2"#"_dropout2e-1"
filename_load = trained_model.__class__.__name__+"_"+dataset_type+improvement
trained_model.load(filename_load)
#print(trained_model)
evaluate(trained_model, test_dataloader, device, show_progress=True);

ga


100%|██████████| 469/469 [00:23<00:00, 19.76it/s]

abstract - total: 55.24, yes/no: 62.99(3859.0/6126.0), number: 43.81(987.0/2253.0), other: 52.03(3519.0/6764.0)
real - total: 47.01, yes/no: 59.26(3313.0/5591.0), number: 27.65(512.0/1852.0), other: 42.62(3160.0/7414.0)



