In [9]:
import torch
import wandb
import numpy as np
import pandas as pd
import transformers
import os
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from PIL import Image
from torch import nn
from tqdm import tqdm
from datasets import load_dataset
from torchvision import transforms, models
from transformers import BertTokenizer

In [9]:
pip install jupyterlab_widgets==2.0.0a0


Collecting jupyterlab_widgets==2.0.0a0
  Downloading jupyterlab_widgets-2.0.0a0-py3-none-any.whl.metadata (3.5 kB)
Downloading jupyterlab_widgets-2.0.0a0-py3-none-any.whl (259 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.2/259.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: jupyterlab_widgets
  Attempting uninstall: jupyterlab_widgets
    Found existing installation: jupyterlab-widgets 3.0.9
    Uninstalling jupyterlab-widgets-3.0.9:
      Successfully uninstalled jupyterlab-widgets-3.0.9
Successfully installed jupyterlab_widgets-2.0.0a0
Note: you may need to restart the kernel to use updated packages.


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [5]:
from kaggle_secrets import UserSecretsClient
api_key = "wandb-api-key"
wandb.login(key=UserSecretsClient().get_secret(api_key), relogin=True)

BackendError: Unexpected response from the service. Response: {'errors': ['No user secrets exist for kernel id 72318224 and label wandb-api-key.'], 'error': {'code': 5, 'details': []}, 'wasSuccessful': False}.

In [21]:
with open(os.path.join("/kaggle/input/visual-question-answering-computer-vision-nlp/dataset", "answer_space.txt")) as f:
            # print(f.read())
            answer_space = f.read().splitlines()
            
print(len(answer_space))      

582


In [21]:
class VQADataset(Dataset):
    def __init__(self, csv_file, image_folder, transform=None):
        # Load dataset from CSV
        self.data = pd.read_csv(csv_file)
        self.image_folder = image_folder
        self.transform = transform
        # Initialize the tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        # Preprocess the dataset
        self.preprocess_dataset()

    def preprocess_dataset(self):
        dataset = load_dataset(
            "csv",
            data_files={
                "train": os.path.join("/kaggle/input/visual-question-answering-computer-vision-nlp/dataset", "data_train.csv"),
                "test": os.path.join("/kaggle/input/visual-question-answering-computer-vision-nlp/dataset", "data_eval.csv")
            }
        )
        # print(dataset)

        with open(os.path.join("/kaggle/input/visual-question-answering-computer-vision-nlp/dataset", "answer_space.txt")) as f:
            answer_space = f.read().splitlines()

        self.data = dataset.map(
            lambda examples: {
                'label': [
                    answer_space.index(ans.replace(" ", "").split(",")[0])  
                    for ans in examples['answer']
                ]
            },
            batched=True
        )

        # Convert dataset to DataFrame for easy indexing
        self.data = pd.DataFrame(self.data['train'])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_id = row['image_id']
        question = row['question']
        label = row['label']

        image_path = os.path.join(self.image_folder, f"{image_id}.png")
        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        # Tokenize the question
        inputs = self.tokenizer(question, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        # print(f'inputs: {inputs}')
        
        # Ensure the tensors are in the correct format for the DataLoader
        input_ids = inputs['input_ids'].squeeze(0)  # Remove batch dimension
        # print(input_ids.shape)
        attention_mask = inputs['attention_mask'].squeeze(0)  # Remove batch dimension
        # print(f'attention_mask: {attention_mask.shape}')

        return image, input_ids, attention_mask, label

In [17]:
image, input_ids, attention_mask, label = train_dataset[0]

# Print the outputs
print(f"Image: {image.shape}")
print(f"Input IDs: {input_ids.shape}")
print(f"Attention Mask: {attention_mask.shape}")
print(f"Label: {label}")

inputs: {'input_ids': tensor([[  101,  2054,  2003,  1996,  4874,  2006,  1996, 15475,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0, 

In [22]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


train_dataset = VQADataset('/kaggle/input/visual-question-answering-computer-vision-nlp/dataset/data_train.csv', '/kaggle/input/visual-question-answering-computer-vision-nlp/dataset/images', transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4, pin_memory=True)
val_dataset = VQADataset('/kaggle/input/visual-question-answering-computer-vision-nlp/dataset/data_eval.csv', '/kaggle/input/visual-question-answering-computer-vision-nlp/dataset/images', transform)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)

In [28]:
import torch
import torch.nn as nn
from transformers import BertModel
from torchvision import models
from torch.cuda.amp import autocast, GradScaler
from torch.utils.checkpoint import checkpoint
from tqdm import tqdm
import matplotlib.pyplot as plt
import json

class VQAModel(nn.Module):
    def __init__(self, num_answers):
        super(VQAModel, self).__init__()
        # Image feature extractor
        self.cnn = models.resnet50(pretrained=True)
        self.cnn.fc = nn.Identity()  # Remove the final classification layer

        # Question feature extractor
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Fusion and final classification
        self.fc1 = nn.Linear(2048 + 768, 1024)
        self.fc2 = nn.Linear(1024, num_answers)
        self.dropout = nn.Dropout(0.5)

    def forward(self, images, input_ids, attention_mask):
        # Extract image features
        image_features = checkpoint(self.cnn, images)  # Use checkpointing for ResNet
        # print(f'image_features {image_features}')
        # print(f'shape of image feature is {image_features.shape}') #([64, 2048])
        # Extract question features
        outputs = checkpoint(self.bert, input_ids, attention_mask)
        question_features = outputs.last_hidden_state[:, 0, :]  # shape: (batch_size, 768) #CLS TOKEN
        # print(f'question feature shape {question_features.shape}')
        # Concatenate features
        combined_features = torch.cat((image_features, question_features), dim=1)
        # print(f'combined_features shape {combined_features.shape}') #([64, 2816])
        # Classification
        x = self.fc1(combined_features)
        # print(f' 1 *************** {x.shape}') #([64, 1024])
        x = self.dropout(x)
        # print(f'dropout {x.shape} ')#([64, 1024])
        x = self.fc2(x)
        # print(f'final layer {x.shape}') #([64, 582])
        
        return x

In [29]:
import torch.optim as optim
from torch.nn import CrossEntropyLoss

# Initialize the model
model = VQAModel(num_answers=582)  #print(len(answer_space)) length of answer space is 582
model.to(device)

# Training parameters
num_epochs = 100
lr = 0.0005 
weight_decay = 1e-4
best_loss = float('inf')
best_model_state = None
patience = 10  # Number of epochs to wait for improvement before stopping
early_stopping_counter = 0

# Initialize the optimizer and GradScaler for mixed precision training
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
scaler = GradScaler()

# Define the loss function
criterion = nn.CrossEntropyLoss()

# Define scheduler
scheduler_step_size = int(num_epochs * 0.25)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=scheduler_step_size)

In [13]:
pip install --upgrade ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.5-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.4/214.4 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets


In [None]:
# def calculate_accuracy(outputs, labels):
#     _, preds = torch.max(outputs, 1)
#     correct = (preds == labels).sum().item()
#     total = labels.size(0)
#     return correct / total

In [30]:
def calculate_accuracy(outputs, labels):
        _, preds = torch.max(outputs, 1)
        correct = (preds == labels).float().sum()
        accuracy = correct / labels.size(0)
        return accuracy.item()

In [31]:
import torch
from tqdm import tqdm
import wandb

def train_model(model, train_loader, val_loader, criterion, optimizer, scaler, device, num_epochs, patience, save_path, project_name):
    # Initialize wandb
    wandb.init(project=project_name)
    wandb.watch(model, log="all")

    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []
    best_loss = float('inf')
    early_stopping_counter = 0
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        running_accuracy = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")

        for images, input_ids, attention_mask, labels in progress_bar:
            images = images.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            with torch.cuda.amp.autocast():
                outputs = model(images, input_ids, attention_mask)
                loss = criterion(outputs, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item()
            accuracy = calculate_accuracy(outputs, labels)
            running_accuracy += accuracy

        epoch_loss = running_loss / len(train_loader)
        epoch_accuracy = running_accuracy / len(train_loader)

        train_losses.append(epoch_loss)
        train_accuracies.append(epoch_accuracy)

        wandb.log({"train_loss": epoch_loss, "train_accuracy": epoch_accuracy, "epoch": epoch+1})

        model.eval()
        val_running_loss = 0.0
        val_running_accuracy = 0.0

        with torch.no_grad():
            val_progress_bar = tqdm(val_loader, desc="Validating", unit="batch")
            for images, input_ids, attention_mask, labels in val_progress_bar:
                images = images.to(device)
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

                with torch.cuda.amp.autocast():
                    outputs = model(images, input_ids, attention_mask)
                    loss = criterion(outputs, labels)

                val_running_loss += loss.item()
                accuracy = calculate_accuracy(outputs, labels)
                val_running_accuracy += accuracy

        val_loss = val_running_loss / len(val_loader)
        val_accuracy = val_running_accuracy / len(val_loader)

        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        wandb.log({"val_loss": val_loss, "val_accuracy": val_accuracy, "epoch": epoch+1})

        print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss}, Training Accuracy: {epoch_accuracy}")
        print(f"Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}")

        # Check if the validation loss improved
        if val_loss < best_loss:
            best_loss = val_loss
            best_model_state = model.state_dict()
            early_stopping_counter = 0  # Reset counter if we get a new best loss
            print(f"Saving model with lowest validation loss: {best_loss:.4f}")
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scaler_state_dict': scaler.state_dict(),
                'best_loss': best_loss,
                'train_losses': train_losses,
                'val_losses': val_losses,
                'train_accuracies': train_accuracies,
                'val_accuracies': val_accuracies
            }, save_path)
        else:
            early_stopping_counter += 1
            print(f"No improvement in validation loss for {early_stopping_counter} epochs.")

        # Check for early stopping
        if early_stopping_counter >= patience:
            print("Early stopping triggered.")
            break

    # Save the final metrics
    metrics = {
        "train_losses": train_losses,
        "val_losses": val_losses,
        "train_accuracies": train_accuracies,
        "val_accuracies": val_accuracies
    }

    wandb.finish()
    
    return metrics

def evaluate_model(model, data_loader, criterion, device):
    model.eval()
    running_loss = 0.0
    running_accuracy = 0.0

    def calculate_accuracy(outputs, labels):
        _, preds = torch.max(outputs, 1)
        correct = (preds == labels).float().sum()
        accuracy = correct / labels.size(0)
        return accuracy.item()

    with torch.no_grad():
        progress_bar = tqdm(data_loader, desc="Evaluating", unit="batch")
        for images, input_ids, attention_mask, labels in progress_bar:
            images = images.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            with torch.cuda.amp.autocast():
                outputs = model(images, input_ids, attention_mask)
                loss = criterion(outputs, labels)

            running_loss += loss.item()
            accuracy = calculate_accuracy(outputs, labels)
            running_accuracy += accuracy

    loss = running_loss / len(data_loader)
    accuracy = running_accuracy / len(data_loader)

    print(f"Loss: {loss}, Accuracy: {accuracy}")

    return loss, accuracy

In [None]:
import json

metrics = train_model(model, train_loader, val_loader, criterion, optimizer, scaler, device, num_epochs, patience, "/kaggle/working/best_model.pth", "VQA_BASELINE(ResNet50-Bert)")
with open("/kaggle/working/metrics.json", "w") as f:
    json.dump(metrics, f)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Epoch 1/100: 100%|██████████| 156/156 [03:00<00:00,  1.16s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 1/100, Training Loss: 4.9186168511708575, Training Accuracy: 0.0853105709911921
Validation Loss: 3.894217917552361, Validation Accuracy: 0.1962362001530635
Saving model with lowest validation loss: 3.8942


Epoch 2/100: 100%|██████████| 156/156 [03:01<00:00,  1.16s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 2/100, Training Loss: 4.0538977323434295, Training Accuracy: 0.16583199789508796
Validation Loss: 3.426269933199271, Validation Accuracy: 0.22587621670502883
Saving model with lowest validation loss: 3.4263


Epoch 3/100: 100%|██████████| 156/156 [03:01<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.12s/batch]


Epoch 3/100, Training Loss: 3.690166189120366, Training Accuracy: 0.19864004630690965
Validation Loss: 3.030592001401461, Validation Accuracy: 0.27243218781092227
Saving model with lowest validation loss: 3.0306


Epoch 4/100: 100%|██████████| 156/156 [03:03<00:00,  1.18s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 4/100, Training Loss: 3.4121843729263697, Training Accuracy: 0.22630282529653648
Validation Loss: 2.7697571974534254, Validation Accuracy: 0.3249458393607384
Saving model with lowest validation loss: 2.7698


Epoch 5/100: 100%|██████████| 156/156 [03:01<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 5/100, Training Loss: 3.1857179754819627, Training Accuracy: 0.24784840929966706
Validation Loss: 2.466833336231036, Validation Accuracy: 0.35424827879820114
Saving model with lowest validation loss: 2.4668


Epoch 6/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 6/100, Training Loss: 3.044602938187428, Training Accuracy: 0.267342562858875
Validation Loss: 2.333891448302147, Validation Accuracy: 0.37851302822430927
Saving model with lowest validation loss: 2.3339


Epoch 7/100: 100%|██████████| 156/156 [03:03<00:00,  1.18s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 7/100, Training Loss: 2.8677846988042197, Training Accuracy: 0.28325691475318027
Validation Loss: 2.2026509520335074, Validation Accuracy: 0.392754332950482
Saving model with lowest validation loss: 2.2027


Epoch 8/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 8/100, Training Loss: 2.7836144062188954, Training Accuracy: 0.2871112298124876
Validation Loss: 2.0997103796555447, Validation Accuracy: 0.4107943077882131
Saving model with lowest validation loss: 2.0997


Epoch 9/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 9/100, Training Loss: 2.673574462915078, Training Accuracy: 0.30102237657858777
Validation Loss: 2.0257035356301527, Validation Accuracy: 0.44162511863769627
Saving model with lowest validation loss: 2.0257


Epoch 10/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:55<00:00,  1.12s/batch]


Epoch 10/100, Training Loss: 2.6357464347130213, Training Accuracy: 0.31089372627246076
Validation Loss: 1.979168564845354, Validation Accuracy: 0.42656398392640626
Saving model with lowest validation loss: 1.9792


Epoch 11/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 11/100, Training Loss: 2.5432940553396177, Training Accuracy: 0.3300614316876118
Validation Loss: 1.838409682114919, Validation Accuracy: 0.46291473775337905
Saving model with lowest validation loss: 1.8384


Epoch 12/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 12/100, Training Loss: 2.5085060680523896, Training Accuracy: 0.328677736222744
Validation Loss: 1.8763201152667022, Validation Accuracy: 0.45969106142337507
No improvement in validation loss for 1 epochs.


Epoch 13/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:55<00:00,  1.12s/batch]


Epoch 13/100, Training Loss: 2.4663032583701305, Training Accuracy: 0.33683523268271715
Validation Loss: 1.6805950433779986, Validation Accuracy: 0.5054272019710296
Saving model with lowest validation loss: 1.6806


Epoch 14/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 14/100, Training Loss: 2.4263244767983756, Training Accuracy: 0.34370919393423277
Validation Loss: 1.7365112740259905, Validation Accuracy: 0.4757723468236434
No improvement in validation loss for 1 epochs.


Epoch 15/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 15/100, Training Loss: 2.3793153869800077, Training Accuracy: 0.35622180673556453
Validation Loss: 1.6607557076674242, Validation Accuracy: 0.5089328109453886
Saving model with lowest validation loss: 1.6608


Epoch 16/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 16/100, Training Loss: 2.3444651395846634, Training Accuracy: 0.35435585830456173
Validation Loss: 1.592415147102796, Validation Accuracy: 0.5263161800610714
Saving model with lowest validation loss: 1.5924


Epoch 17/100: 100%|██████████| 156/156 [03:04<00:00,  1.18s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 17/100, Training Loss: 2.29237614151759, Training Accuracy: 0.36762152784145796
Validation Loss: 1.5985524891278682, Validation Accuracy: 0.5148348468236434
No improvement in validation loss for 1 epochs.


Epoch 18/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 18/100, Training Loss: 2.30149396107747, Training Accuracy: 0.3703444028894107
Validation Loss: 1.508853404185711, Validation Accuracy: 0.5473053180254422
Saving model with lowest validation loss: 1.5089


Epoch 19/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 19/100, Training Loss: 2.265436128163949, Training Accuracy: 0.3852423137197128
Validation Loss: 1.504692799005753, Validation Accuracy: 0.5447196998657324
Saving model with lowest validation loss: 1.5047


Epoch 20/100: 100%|██████████| 156/156 [03:04<00:00,  1.18s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 20/100, Training Loss: 2.2438136606644363, Training Accuracy: 0.38435941953689623
Validation Loss: 1.5145688779078996, Validation Accuracy: 0.5423232729618366
No improvement in validation loss for 1 epochs.


Epoch 21/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 21/100, Training Loss: 2.2252867030791745, Training Accuracy: 0.3802676875239763
Validation Loss: 1.4523618454352403, Validation Accuracy: 0.5546541133752236
Saving model with lowest validation loss: 1.4524


Epoch 22/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 22/100, Training Loss: 2.1951326391635795, Training Accuracy: 0.3921051460963029
Validation Loss: 1.3869920189564044, Validation Accuracy: 0.567956879352912
Saving model with lowest validation loss: 1.3870


Epoch 23/100: 100%|██████████| 156/156 [03:04<00:00,  1.18s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 23/100, Training Loss: 2.1697482810570645, Training Accuracy: 0.3955922067547456
Validation Loss: 1.4940255819222865, Validation Accuracy: 0.5454467890354303
No improvement in validation loss for 1 epochs.


Epoch 24/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 24/100, Training Loss: 2.1646610101064048, Training Accuracy: 0.39952813394558734
Validation Loss: 1.4380224263056731, Validation Accuracy: 0.5539604108303021
No improvement in validation loss for 2 epochs.


Epoch 25/100: 100%|██████████| 156/156 [03:02<00:00,  1.17s/batch]
Validating: 100%|██████████| 156/156 [02:53<00:00,  1.11s/batch]


Epoch 25/100, Training Loss: 2.14766075137334, Training Accuracy: 0.3965233262532797
Validation Loss: 1.395607822598555, Validation Accuracy: 0.5529328408913735
No improvement in validation loss for 3 epochs.


Epoch 26/100:   6%|▋         | 10/156 [00:13<02:52,  1.18s/batch]

In [None]:
# Plot the result
# Load the saved metrics
with open("/kaggle/working/metrics.json", "r") as f:
    metrics = json.load(f)

train_losses = metrics["train_losses"]
val_losses = metrics["val_losses"]
train_accuracies = metrics["train_accuracies"]
val_accuracies = metrics["val_accuracies"]

# Plot the metrics
epochs = range(1, len(train_losses) + 1)

plt.figure(figsize=(14, 5))

plt.subplot(1, 2, 1)
plt.plot(epochs, train_losses, label='Training Loss')
plt.plot(epochs, val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')

plt.subplot(1, 2, 2)
plt.plot(epochs, train_accuracies, label='Training Accuracy')
plt.plot(epochs, val_accuracies, label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Training and Validation Accuracy')

plt.tight_layout()
plt.show()

In [None]:
class VQAModel_trained(nn.Module):
    def __init__(self, num_answers):
        super(VQAModel_trained, self).__init__()
        # Image feature extractor
        self.cnn = models.resnet50(pretrained=True)
        self.cnn.fc = nn.Identity()  # Remove the final classification layer

        # Question feature extractor
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Fusion and final classification
        self.fc1 = nn.Linear(2048 + 768, 1024)
        self.fc2 = nn.Linear(1024, num_answers)
        self.dropout = nn.Dropout(0.5)

    def forward(self, images, input_ids, attention_mask):
        # Extract image features
        image_features = self.cnn(images)

        # Extract question features
        outputs = self.bert(input_ids, attention_mask)
        question_features = outputs.last_hidden_state[:, 0, :]  # shape: (batch_size, 768)

        # Concatenate features
        combined_features = torch.cat((image_features, question_features), dim=1)

        # Classification
        x = self.fc1(combined_features)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VQAModel_trained(num_answers=582)  # Adjust num_answers based on your dataset
model.to(device)
# Load the best model checkpoint
checkpoint_path = "/kaggle/working/best_model.pth"
checkpoint = torch.load(checkpoint_path)
model.load_state_dict(checkpoint['model_state_dict'])

print("Best model loaded successfully.")

In [None]:
from torch.utils.data import DataLoader
from torchvision import transforms

# Example usage
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_dataset = VQADataset('/kaggle/input/visual-question-answering-computer-vision-nlp/dataset/data_eval.csv', '/kaggle/input/visual-question-answering-computer-vision-nlp/dataset/images', transform)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

print("Test data loader prepared successfully.")

In [None]:
test_loss, test_accuracy = evaluate_model(model, test_loader, criterion, device)

In [None]:
!pip -q install sentence-transformers

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer, util

# Load Sentence-BERT model
model_sbert = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to calculate similarity using Sentence-BERT
def sbert_similarity(sentence1, sentence2):
    embeddings1 = model_sbert.encode(sentence1, convert_to_tensor=True)
    embeddings2 = model_sbert.encode(sentence2, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
    return cosine_scores.item()

In [None]:
def preprocess_answer(answer):
    # Remove underscores from the answer
    return answer.replace('_', ' ')

In [None]:
# Load answer space for mapping
with open(os.path.join("/kaggle/input/visual-question-answering-computer-vision-nlp/dataset/", "answer_space.txt")) as f:
    answer_space = f.read().splitlines()

model.eval()
test_losses = []
test_accuracies = []
similarities = []

criterion = nn.CrossEntropyLoss()

In [None]:
with torch.no_grad():
    for idx, (images, input_ids, attention_mask, labels) in enumerate(test_loader):
        images = images.to(device)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(images, input_ids, attention_mask)
        loss = criterion(outputs, labels)

        test_losses.append(loss.item())

        _, preds = torch.max(outputs, 1)
        accuracy = calculate_accuracy(outputs, labels)
        test_accuracies.append(accuracy)

        predicted_answer = preprocess_answer(answer_space[preds.item()])
        actual_answer = preprocess_answer(answer_space[labels.item()])

        similarity = sbert_similarity(predicted_answer, actual_answer)
        similarities.append(similarity)

        # Plotting the image with question and answers
        image = images.cpu().squeeze().permute(1, 2, 0).numpy()
        image = image * np.array([0.229, 0.224, 0.225]) + np.array([0.485, 0.456, 0.406])
        image = np.clip(image, 0, 1)

        plt.imshow(image)
        plt.title(f"Question: {test_dataset.data.iloc[idx]['question']}\nPredicted: {predicted_answer}\nActual: {actual_answer}\nSBERT Similarity: {similarity:.4f}")
        plt.axis('off')
        plt.show()

        if idx >= 49:  # Display only 50 samples
            break

# Print average similarity score
average_similarity = np.mean(similarities)
print(f"Average SBERT Similarity: {average_similarity:.4f}")

In [None]:
# Plot test accuracy and test loss
plt.figure(figsize=(14, 6))

# Test Loss
plt.subplot(1, 2, 1)
plt.plot(test_losses, label='Test Loss')
plt.xlabel('Batch')
plt.ylabel('Loss')
plt.title('Test Loss')
plt.legend()

# Test Accuracy
plt.subplot(1, 2, 2)
plt.plot(test_accuracies, label='Test Accuracy')
plt.xlabel('Batch')
plt.ylabel('Accuracy')
plt.title('Test Accuracy')
plt.legend()

plt.show()

In [None]:
# Similarity
plt.figure(figsize=(14, 6))
plt.subplot(1, 1, 1)
plt.plot(similarities, label='SBERT Similarity')
plt.xlabel('Batch')
plt.ylabel('Similarity')
plt.title('SBERT Similarity')
plt.legend()
plt.show()