### importing required libraries

In [1]:
import json
import requests
import numpy as np

### Loading the images and annotations

In [2]:
# Directory for the images
image_directory = 'https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/'

# Directory for the annotations
annotation_directory= 'https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations/'

train_annotation_path = '{}{}'.format(annotation_directory, 'train.json')
val_annotation_path = '{}{}'.format(annotation_directory, 'val.json')
test_annotation_path = '{}{}'.format(annotation_directory, 'test.json')

In [3]:
# Train annotations
train_data = requests.get(train_annotation_path, allow_redirects=True)
train_data_n = train_data.json()

# Validation annotations
val_data = requests.get(val_annotation_path, allow_redirects=True)
val_data_n = val_data.json()

# Test annotations
test_data = requests.get(test_annotation_path, allow_redirects=True)
test_data_n = test_data.json()

print('Train set size:', len(train_data_n))
print('Validation set size:', len(val_data_n))
print('Test set size:', len(test_data_n))

Train set size: 20523
Validation set size: 4319
Test set size: 8000


In [None]:
top_100_image_urls = [
    image_directory + sample["image"]
    for sample in test_data_n[:100]
]

In [11]:
import random
# splitting the data 
def train_val_split(data,size):
    data = list(data)

    class_1 = [sample for sample in data if sample['answerable'] == 1]
    class_0 = [sample for sample in data if sample['answerable'] == 0]

    available_1, available_0 = len(class_1), len(class_0)

    num_samples_per_class = min(size // 2, available_1, available_0)

    sampled_class_1 = random.sample(class_1,num_samples_per_class)
    sampled_class_0 = random.sample(class_0, num_samples_per_class)

    balanced_data = sampled_class_1 + sampled_class_0
    random.shuffle(balanced_data)

    return balanced_data

In [12]:
# Taking 6000 samples from the train data
train_data = train_data_n[:2000]

# Taking 300 samples from the val data
val_data = val_data_n[:300]

# Taking 100 samples from the test data
test_data = test_data_n[:100]

In [13]:
# Taking 6000 samples from the train data
train_data_ac = train_val_split(train_data_n,2000)

# Taking 300 samples from the val data
val_data_ac = train_val_split(val_data_n,300)

# Taking 100 samples from the test data
test_data_ac = test_data_n[:100]

## Data Preorocessing

### Preprocessing the images

In [19]:
import requests
from PIL import Image
from io import BytesIO
import torch 
from torchvision import transforms

def Image_tensors(data):
    transform = transforms.Compose([
    # Resizing the image
    transforms.Resize((128,128)),
    # Converting the images to tensor 
    transforms.ToTensor(),
    # Applying the Normalization and Standardization
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Applying the above defined transformations to all the images

    # All the transformed images are stored in image_tensors list
    image_tensors = []
    for images in data:
        image_url = image_directory + images["image"]
        # making the HTTP get request to fetch the image which returns the binary image data
        response = requests.get(image_url)
        # converting the binary image data into the image
        img = Image.open(BytesIO(response.content))
        img_preprocessed = transform(img)
        image_tensors.append(img_preprocessed)
    image_tensors = torch.stack(image_tensors)
    return image_tensors



### Preprocessing the questions

In [14]:
import spacy
import re
import contractions
import torch

# Loading the  Spacy tokenizer
nlp = spacy.load("en_core_web_sm")

def preprocess_question(question):
    # converting the questions to lower case
    question = question.lower()
    # expanding the contractions : what's -> what is
    question = contractions.fix(question)
    # removing the special characters 
    question = re.sub(r"[^a-zA-Z\s]", "", question)
    doc = nlp(question)
    # applying the lemmatization and removing the stop words and punctuations
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return tokens

def get_glove_embedding_for_question(question, glove_dict, embedding_dim=50):
    tokens = preprocess_question(question)
    vectors = [glove_dict[token] for token in tokens if token in glove_dict]

    if len(vectors) == 0:
        return torch.zeros(embedding_dim)
    else:
        return torch.mean(torch.stack(vectors), dim=0)

def Question_Tensors(data, glove_dict, embedding_dim=50):
    questions = [sample["question"] for sample in data]
    embeddings = [get_glove_embedding_for_question(q, glove_dict, embedding_dim) for q in questions]
    question_tensor = torch.stack(embeddings)
    return question_tensor



In [15]:
import os
import torch

def load_glove_embeddings(path='glove.6B.50d.txt'):
    if not os.path.exists(path):
        raise FileNotFoundError(f"GloVe file is not found at the path: {path}")
    
    embeddings = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            try:
                vector = torch.tensor([float(v) for v in values[1:]], dtype=torch.float32)
                embeddings[word] = vector
            except ValueError:
                print(f"Skipping the corrupted line: {line[:50]}...")
                continue
    return embeddings

In [37]:
# choosing the top 300 answers, assigning the id's to them and treating remaning answers as other_categories 
from collections import Counter
top_n = 300
def choosen_answers(data):
    chosen_answers = []

    # Choosing the most common answer from the ten lables for each sample
    for sample in data:
        answers = [entry['answer'] for entry in sample['answers']]
        answer_counts = Counter(answers) 
        top_answer, _ = answer_counts.most_common(1)[0] 
        chosen_answers.append(top_answer)


    answer_counts = Counter(chosen_answers)
    top_answers = answer_counts.most_common(top_n) 

 

    # Create categories for top n
    category_name2id = {answer:ind for ind, (answer, _) in enumerate(top_answers)}

    category_id2name = {ind:answer for ind, (answer, _) in enumerate(top_answers)}
    

    
    category_id2name[top_n] = 'other_categories'
    return chosen_answers,category_id2name,category_name2id


In [17]:

def Target_tensors(chosen_answers,category_name2id):
    targets = []

    for ans in chosen_answers:
        if ans in category_name2id.keys():
            targets.append(category_name2id[ans])
        else:
            targets.append(top_n) 

    targets_tensor = torch.tensor(targets)
    return targets_tensor



In [140]:
import json
import torch
from torch.utils.data import TensorDataset, DataLoader

#train_image_tensors_ac = Image_tensors(train_data_ac)
#val_image_tensors_ac = Image_tensors(val_data_ac)
#test_image_tensors_ac = Image_tensors(test_data_ac)

#torch.save(train_image_tensors_ac, "train_image_tensors_ac.pt")
#torch.save(val_image_tensors_ac, "val_image_tensors_ac.pt")
#torch.save(test_image_tensors_ac, "test_image_tensors_ac.pt")



glove_dict = load_glove_embeddings("glove.6B.50d.txt")
embedding_dim = 50


train_question_tensors_ac = Question_Tensors(train_data_ac, glove_dict, embedding_dim)
val_question_tensors_ac = Question_Tensors(val_data_ac, glove_dict, embedding_dim)
test_question_tensors_ac = Question_Tensors(test_data_ac, glove_dict, embedding_dim)

torch.save(train_question_tensors_ac, "train_question_tensors_ac.pt")
torch.save(val_question_tensors_ac, "val_question_tensors_ac.pt")
torch.save(test_question_tensors_ac, "test_question_tensors_ac.pt")


train_image_tensors_ac = torch.load("train_image_tensors_ac.pt")
val_image_tensors_ac= torch.load("val_image_tensors_ac.pt")
test_image_tensors_ac= torch.load("test_image_tensors_ac.pt")

train_question_tensors_ac = torch.load("train_question_tensors_ac.pt")
val_question_tensors_ac= torch.load("val_question_tensors_ac.pt")
test_question_tensors_ac= torch.load("test_question_tensors_ac.pt")

def get_answerability_labels(data):
    return torch.tensor([sample["answerable"] for sample in data], dtype=torch.float32)


train_labels = get_answerability_labels(train_data_ac)
val_labels = get_answerability_labels(val_data_ac)


train_dataset_ac = TensorDataset(train_image_tensors_ac,train_question_tensors_ac,train_labels)
val_dataset_ac = TensorDataset(val_image_tensors_ac,val_question_tensors_ac,val_labels)
test_dataset_ac = TensorDataset(test_image_tensors_ac,test_question_tensors_ac)

##  Answerable Classifier 

In [39]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(CrossAttention, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, batch_first=True)

    def forward(self, query, key, value):
        attn_output, _ = self.attention(query, key, value)
        return attn_output

class answerclassifier(nn.Module):
    def __init__(self,num_classes=1, hidden_dim=512, dropout_prob=0.1, num_heads=4):
        super(answerclassifier, self).__init__()

        # Image feature extractor
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(128)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(256)
        self.conv4 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm2d(512)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        self.fc_image = nn.Linear(512 * 8 * 8, hidden_dim)
        self.dropout = nn.Dropout(dropout_prob)

        # Question feature extractor
        self.fc_question = nn.Linear(50, hidden_dim)
  

        # Cross attention
        self.cross_attention = CrossAttention(embed_dim=hidden_dim, num_heads=num_heads)

        # Fusion + classification
        self.fc_fusion = nn.Linear(hidden_dim, hidden_dim)
        self.fc_output = nn.Linear(hidden_dim, num_classes)

    def forward(self, image, question_embedding):
        # Image pipeline
        x = self.pool(F.relu(self.bn1(self.conv1(image))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        x = self.pool(F.relu(self.bn4(self.conv4(x))))

        x = x.view(x.size(0), -1)
        image_features = self.dropout(F.relu(self.fc_image(x)))

        # Question pipeline (assuming question is already an embedding)
        question_features = self.dropout(F.relu(self.fc_question(question_embedding)))

        # Adding the sequence dim for cross-attention
        image_seq = image_features.unsqueeze(1)  
        question_seq = question_features.unsqueeze(1)  

        attended_features = self.cross_attention(query=image_seq, key=question_seq, value=question_seq)
        attended_features = attended_features.squeeze(1)

        # Fusion and classification
        fused = self.dropout(F.relu(self.fc_fusion(attended_features)))
        output = self.fc_output(fused)
        return output.squeeze(-1)


## Training the Model

In [142]:
# Trained the model and performing the hyperparameter tuning using validation data

import optuna
import torch.optim as optim
# Initializing the  Model, Loss, Optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

global_best_model_state = None
global_best_accuracy = 0  

def optimization(trial):
    global global_best_model_state, global_best_accuracy 
    print(f"Running Trail {trial.number} \n")
    # defining different hyperparameters
    learning_rate = trial.suggest_loguniform("learning_rate",1e-3, 5e-3)
    batch_size = trial.suggest_categorical("batch_size",[32,64])
    hidden_dimension= trial.suggest_categorical("hidden_dimension",[128,256,512])
    optimizers = trial.suggest_categorical("optimizer",["Adam","SGD","AdamW"])
    
    print(f"Selected Hyperparameters: lr={learning_rate}, batch_size={batch_size}, hidden_dim={hidden_dimension}, optimizer={optimizers}")


    # defining the model
    model = answerclassifier(1,hidden_dim=hidden_dimension,dropout_prob=0.1,num_heads=4).to(device)

    criterion = nn.BCEWithLogitsLoss()

    # initializing the selected optimizer
    if optimizers =="Adam":
        optimizer = optim.Adam(model.parameters(),lr=learning_rate,weight_decay=1e-4)
    elif optimizers =="SGD":
        optimizer = optim.SGD(model.parameters(),lr=learning_rate,momentum=0.9)
    elif optimizers =="AdamW":
        optimizer = optim.AdamW(model.parameters(),lr=learning_rate,weight_decay=1e-4)
    
    # loading data for different batch sizes
    train_loader_ac = DataLoader(train_dataset_ac, batch_size=batch_size, shuffle=True)
    val_loader_ac = DataLoader(val_dataset_ac, batch_size=batch_size, shuffle=False)

    best_validation_accuracy = 0
    best_model_state = None
    # Training for five epochs
    for epoch in range(5):
        model.train()
        train_loss =0

        # training by batch wise
        for images,questions,labels in train_loader_ac:
            images,questions,labels = images.to(device), questions.to(device), labels.to(device)

            outputs = model(images, questions).squeeze(dim=-1)
            loss = criterion(outputs,labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss+=loss.item()

        # Validation
        model.eval()
        val_loss =0
        correct, total = 0,0

        with torch.no_grad():
            for images,questions,labels in val_loader_ac:
                images,questions,labels = images.to(device), questions.to(device), labels.to(device)
                outputs = model(images, questions).squeeze(dim=-1)
                loss = criterion(outputs,labels)

                val_loss+=loss.item()
                # coverting the output logits into 0's and 1's 
                predicted = torch.sigmoid(outputs).round()
           
                correct+= (predicted==labels).sum().item()
               
                total+= labels.size(0)
            

        # calculating the training and validation loss per each batch 
        average_train_loss = train_loss / len(train_loader_ac)
        average_val_loss = val_loss / len(val_loader_ac)

        # calculating the validation accuracy
        validation_accuracy = correct/total
        print(f"Epoch [{epoch+1}/{5}] - Train Loss: {average_train_loss:.4f}, Val Loss: {average_val_loss:.4f}, Val Acc: {validation_accuracy:.4f}")

        if validation_accuracy > best_validation_accuracy:
            best_validation_accuracy = validation_accuracy
            best_model_state = model.state_dict()

        # Early Stopping if Validation accuracy is Not Improving
        # reporting the loss for pruning
        trial.report(validation_accuracy,epoch)

        # Stopping the epoch loop early if the validation is not improving much
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    print(f"Trial {trial.number} Completed!  Val Acc: {best_validation_accuracy:.4f}\n")
    if best_validation_accuracy > global_best_accuracy:
        global_best_accuracy = best_validation_accuracy
        global_best_model_state = best_model_state
    # returning the validation loss
    return best_validation_accuracy

# Running the hyperparameter tuning with optuna
# aim is to minimize the validation loss
study = optuna.create_study(direction="maximize")
# Running 10 hyperparameter trials
study.optimize(optimization,n_trials=10)

print("Best Hyperparameters:",study.best_params)


# Saving the Best Model
if global_best_model_state is not None:
    torch.save(global_best_model_state, "best_multimodal_classifier.pth")
    print("Best model saved successfully!")
else:
    print("No best model found to save.")


[I 2025-03-20 22:32:09,174] A new study created in memory with name: no-name-82016b18-0385-4c92-b303-5229be17a7a9
  learning_rate = trial.suggest_loguniform("learning_rate",1e-3, 5e-3)


Running Trail 0 

Selected Hyperparameters: lr=0.002523507274395066, batch_size=32, hidden_dim=512, optimizer=AdamW
Epoch [1/5] - Train Loss: 0.6784, Val Loss: 0.6746, Val Acc: 0.5867
Epoch [2/5] - Train Loss: 0.6727, Val Loss: 0.6619, Val Acc: 0.6000
Epoch [3/5] - Train Loss: 0.6509, Val Loss: 0.6727, Val Acc: 0.5867
Epoch [4/5] - Train Loss: 0.6556, Val Loss: 0.6691, Val Acc: 0.5867


[I 2025-03-20 22:35:25,370] Trial 0 finished with value: 0.6 and parameters: {'learning_rate': 0.002523507274395066, 'batch_size': 32, 'hidden_dimension': 512, 'optimizer': 'AdamW'}. Best is trial 0 with value: 0.6.


Epoch [5/5] - Train Loss: 0.6341, Val Loss: 0.6604, Val Acc: 0.6000
Trial 0 Completed!  Val Acc: 0.6000

Running Trail 1 

Selected Hyperparameters: lr=0.0016384618842067798, batch_size=32, hidden_dim=256, optimizer=Adam
Epoch [1/5] - Train Loss: 0.6684, Val Loss: 0.6564, Val Acc: 0.5833
Epoch [2/5] - Train Loss: 0.6461, Val Loss: 0.6629, Val Acc: 0.5900
Epoch [3/5] - Train Loss: 0.6393, Val Loss: 0.6796, Val Acc: 0.5967
Epoch [4/5] - Train Loss: 0.6339, Val Loss: 0.6774, Val Acc: 0.5667


[I 2025-03-20 22:38:35,292] Trial 1 finished with value: 0.5966666666666667 and parameters: {'learning_rate': 0.0016384618842067798, 'batch_size': 32, 'hidden_dimension': 256, 'optimizer': 'Adam'}. Best is trial 0 with value: 0.6.


Epoch [5/5] - Train Loss: 0.6318, Val Loss: 0.6796, Val Acc: 0.5933
Trial 1 Completed!  Val Acc: 0.5967

Running Trail 2 

Selected Hyperparameters: lr=0.00483184599739595, batch_size=32, hidden_dim=512, optimizer=Adam
Epoch [1/5] - Train Loss: 0.7360, Val Loss: 0.7365, Val Acc: 0.4467
Epoch [2/5] - Train Loss: 0.6959, Val Loss: 0.6743, Val Acc: 0.5900
Epoch [3/5] - Train Loss: 0.6728, Val Loss: 0.6802, Val Acc: 0.5367
Epoch [4/5] - Train Loss: 0.6507, Val Loss: 0.6390, Val Acc: 0.6167


[I 2025-03-20 22:41:51,995] Trial 2 finished with value: 0.6166666666666667 and parameters: {'learning_rate': 0.00483184599739595, 'batch_size': 32, 'hidden_dimension': 512, 'optimizer': 'Adam'}. Best is trial 2 with value: 0.6166666666666667.


Epoch [5/5] - Train Loss: 0.6663, Val Loss: 0.6749, Val Acc: 0.5900
Trial 2 Completed!  Val Acc: 0.6167

Running Trail 3 

Selected Hyperparameters: lr=0.0016390249952238879, batch_size=64, hidden_dim=256, optimizer=AdamW
Epoch [1/5] - Train Loss: 0.6746, Val Loss: 0.6531, Val Acc: 0.6067
Epoch [2/5] - Train Loss: 0.6476, Val Loss: 0.6579, Val Acc: 0.6033
Epoch [3/5] - Train Loss: 0.6403, Val Loss: 0.6505, Val Acc: 0.5900
Epoch [4/5] - Train Loss: 0.6326, Val Loss: 0.6515, Val Acc: 0.6133


[I 2025-03-20 22:44:59,773] Trial 3 finished with value: 0.6133333333333333 and parameters: {'learning_rate': 0.0016390249952238879, 'batch_size': 64, 'hidden_dimension': 256, 'optimizer': 'AdamW'}. Best is trial 2 with value: 0.6166666666666667.


Epoch [5/5] - Train Loss: 0.6211, Val Loss: 0.6707, Val Acc: 0.5433
Trial 3 Completed!  Val Acc: 0.6133

Running Trail 4 

Selected Hyperparameters: lr=0.001195410810510843, batch_size=32, hidden_dim=512, optimizer=Adam
Epoch [1/5] - Train Loss: 0.6785, Val Loss: 0.6517, Val Acc: 0.6067
Epoch [2/5] - Train Loss: 0.6495, Val Loss: 0.6533, Val Acc: 0.6033
Epoch [3/5] - Train Loss: 0.6389, Val Loss: 0.6680, Val Acc: 0.6100
Epoch [4/5] - Train Loss: 0.6260, Val Loss: 0.6561, Val Acc: 0.6167


[I 2025-03-20 22:48:12,745] Trial 4 finished with value: 0.6166666666666667 and parameters: {'learning_rate': 0.001195410810510843, 'batch_size': 32, 'hidden_dimension': 512, 'optimizer': 'Adam'}. Best is trial 2 with value: 0.6166666666666667.


Epoch [5/5] - Train Loss: 0.6141, Val Loss: 0.7073, Val Acc: 0.5933
Trial 4 Completed!  Val Acc: 0.6167

Running Trail 5 

Selected Hyperparameters: lr=0.004202118479167341, batch_size=64, hidden_dim=512, optimizer=SGD


[I 2025-03-20 22:48:49,710] Trial 5 pruned. 


Epoch [1/5] - Train Loss: 0.6933, Val Loss: 0.6925, Val Acc: 0.5000
Running Trail 6 

Selected Hyperparameters: lr=0.0037993259355611733, batch_size=32, hidden_dim=256, optimizer=SGD


[I 2025-03-20 22:49:26,489] Trial 6 pruned. 


Epoch [1/5] - Train Loss: 0.6933, Val Loss: 0.6918, Val Acc: 0.4967
Running Trail 7 

Selected Hyperparameters: lr=0.00470294949561668, batch_size=32, hidden_dim=256, optimizer=Adam


[I 2025-03-20 22:50:04,915] Trial 7 pruned. 


Epoch [1/5] - Train Loss: 0.6761, Val Loss: 0.6889, Val Acc: 0.5600
Running Trail 8 

Selected Hyperparameters: lr=0.0035005803719305026, batch_size=32, hidden_dim=512, optimizer=Adam


[I 2025-03-20 22:50:44,264] Trial 8 pruned. 


Epoch [1/5] - Train Loss: 0.6936, Val Loss: 0.6737, Val Acc: 0.5733
Running Trail 9 

Selected Hyperparameters: lr=0.002426924100418565, batch_size=64, hidden_dim=512, optimizer=Adam
Epoch [1/5] - Train Loss: 0.6773, Val Loss: 0.6560, Val Acc: 0.5967


[I 2025-03-20 22:51:57,940] Trial 9 pruned. 


Epoch [2/5] - Train Loss: 0.6557, Val Loss: 0.6612, Val Acc: 0.5967
Best Hyperparameters: {'learning_rate': 0.00483184599739595, 'batch_size': 32, 'hidden_dimension': 512, 'optimizer': 'Adam'}
Best model saved successfully!


## Testing with the best Model

In [144]:
best_model = answerclassifier(
    num_classes=1,
    hidden_dim=study.best_params["hidden_dimension"],
    dropout_prob=0.1,
    num_heads=4
).to(device)

best_model.load_state_dict(torch.load("best_multimodal_classifier.pth"))
best_model.eval()

test_loader_ac = DataLoader(test_dataset_ac, study.best_params["batch_size"], shuffle=False)

predictions = []

with torch.no_grad():
    for images, questions in test_loader_ac:
        images, questions= images.to(device), questions.to(device)

        outputs = best_model(images, questions).squeeze(dim=-1)
        preds = torch.sigmoid(outputs).round().int()
        predictions.append(preds.cpu()) 

       
final_preds = torch.cat(predictions)


In [146]:
torch.save(final_preds, "Manidatta_Anumandla_challenge1.pkl")

## Answer Prediction

In [22]:
from torch.utils.data import TensorDataset, DataLoader
train_choosen_answers,train_category_id2name,train_category_name2id = choosen_answers(train_data)
val_choosen_answers,val_category_id2name,val_category_name2id = choosen_answers(val_data)

train_target_tensors = Target_tensors(train_choosen_answers,train_category_name2id)
val_target_tensors = Target_tensors(val_choosen_answers,val_category_name2id)
torch.save(train_target_tensors, "train_target_tensors.pt")
torch.save(val_target_tensors, "val_target_tensors.pt")

glove_dict = load_glove_embeddings("glove.6B.50d.txt")
embedding_dim = 50

train_question_tensors = Question_Tensors(train_data, glove_dict, embedding_dim)
val_question_tensors = Question_Tensors(val_data, glove_dict, embedding_dim)
test_question_tensors = Question_Tensors(test_data, glove_dict, embedding_dim)

torch.save(train_question_tensors, "train_question_tensors.pt")
torch.save(val_question_tensors, "val_question_tensors.pt")#torch.save(test_question_tensors, "test_question_tensors.pt")

#train_image_tensors = Image_tensors(train_data)
#val_image_tensors = Image_tensors(val_data)
#test_image_tensors = Image_tensors(test_data)

#torch.save(train_image_tensors, "train_image_tensors.pt")
#torch.save(val_image_tensors, "val_image_tensors.pt")
#torch.save(test_image_tensors, "test_image_tensors.pt")


train_image_tensors=torch.load("train_image_tensors.pt")
val_image_tensors=torch.load("val_image_tensors.pt")
test_image_tensors=torch.load("test_image_tensors.pt")

import json

# Create a dictionary with all required variables
data_to_save = {
    "train_choosen_answers": train_choosen_answers,
    "train_category_id2name": train_category_id2name,
    "train_category_name2id": train_category_name2id,
    "val_choosen_answers": val_choosen_answers,
    "val_category_id2name": val_category_id2name,
    "val_category_name2id": val_category_name2id,
}

# Save to a JSON file
with open("category_mappings.json", "w") as f:
    json.dump(data_to_save, f)

In [23]:
train_dataset = TensorDataset(train_image_tensors,train_question_tensors,train_target_tensors)
val_dataset = TensorDataset(val_image_tensors,val_question_tensors,val_target_tensors)
test_dataset = TensorDataset(test_image_tensors,test_question_tensors)

In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

class CrossAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(CrossAttention, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, batch_first=True)

    def forward(self, query, key, value):
        attn_output, _ = self.attention(query, key, value)
        return attn_output

class answergeneration(nn.Module):
    def __init__(self, num_classes=301, hidden_dim=512, dropout_prob=0.1, num_heads=4):
        super(answergeneration, self).__init__()

        # Image feature extractor using ResNet18
        self.resnet = models.resnet18(pretrained=True)
        for param in list(self.resnet.parameters())[:-4]:  # freezing the earlier layers
            param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, hidden_dim)

        self.fc_image1 = nn.Linear(hidden_dim, hidden_dim * 2)
        self.fc_image2 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.dropout = nn.Dropout(dropout_prob)

        # Question feature extractor (GloVe embedding dim is 50)
        self.fc_question1 = nn.Linear(50, hidden_dim * 2)
        self.fc_question2 = nn.Linear(hidden_dim * 2, hidden_dim)

        # Cross attention
        self.cross_attention = CrossAttention(embed_dim=hidden_dim, num_heads=num_heads)

        # Fusion + classification
        self.fc_fusion1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.fc_fusion2 = nn.Linear(hidden_dim, hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.fc_output = nn.Linear(hidden_dim, num_classes)

    def forward(self, image, question_embedding):
        # Image pipeline
        x = self.resnet(image)  # output shape: [batch_size, hidden_dim]
        image_features = self.dropout(F.relu(self.fc_image1(x)))
        image_features = self.dropout(F.relu(self.fc_image2(image_features)))

        # Question pipeline (mean of GloVe embeddings per question)
        question_features = self.dropout(F.relu(self.fc_question1(question_embedding)))
        question_features = self.dropout(F.relu(self.fc_question2(question_features)))

        # Add sequence dim for cross-attention
        image_seq = image_features.unsqueeze(1)  
        question_seq = question_features.unsqueeze(1)  

        attended_features = self.cross_attention(query=image_seq, key=question_seq, value=question_seq)
        attended_features = attended_features.squeeze(1)

        # Fusion and classification
        fusion_input = torch.cat((attended_features, image_features), dim=1)
        fused = self.dropout(F.relu(self.norm1(self.fc_fusion1(fusion_input))))
        fused = self.dropout(F.relu(self.norm2(self.fc_fusion2(fused))))
        output = self.fc_output(fused)
        return output.squeeze(-1)

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
def initialize_weights(m):
    if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
        nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)


In [27]:
from collections import Counter
import numpy as np
def evaluate_predictions(predicted_answers,batch_true_answers):
    accuracies = []
    for predicted_answer, true_answers in zip(predicted_answers, batch_true_answers):
        predicted_answer = predicted_answer.lower()
        extract_true_answers = [answer['answer'].lower() for answer in true_answers['answers']]
        answer_counts = Counter(extract_true_answers)
        answer_match= answer_counts.get(predicted_answer, 0)
        

        accuracy = min(answer_match / 3, 1)
        accuracies.append(accuracy)
    return np.mean(accuracies)  if accuracies else 0.0

## Training the Model

In [32]:
import optuna
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.utils.class_weight import compute_class_weight


global_best_model_state = None
global_best_accuracy = 0 

def optimizer(trial):
    global global_best_model_state, global_best_accuracy
    print(f"\n Running Trial {trial.number + 1} ")

    # Suggesting the optimizer
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-3, 5e-3)
    batch_size = trial.suggest_categorical("batch_size", [16,32])
    num_heads = trial.suggest_categorical("num_heads", [2,4])
    hidden_size = trial.suggest_categorical("hidden_dimension", [256, 512])
    optimizers = trial.suggest_categorical("optimizers", ["Adam","SGD","AdamW"])

    print(f"Selected Hyperparameters: lr={learning_rate}, batch_size={batch_size}, heads={num_heads}, hidden_dim={hidden_size}, optimizer={optimizers}")

    # Initializing the Model
    model = answergeneration(301,hidden_size,dropout_prob=0.1,num_heads=num_heads).to(device)

    model.apply(initialize_weights)
    #class_weights = get_class_weights(train_target_tensors).to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    # Choosing the Optimizer
    if optimizers == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=learning_rate,weight_decay=1e-4)
    elif optimizers == "AdamW":
        optimizer = optim.AdamW(model.parameters(), lr=learning_rate,weight_decay=1e-4)
    else:
        optimizer = optim.SGD(model.parameters(), lr=learning_rate,momentum=0.9)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    best_validation_accuracy = 0
    best_model_state = None 
    # Training loop for 5 epochs in each hyperparameter tuning trial
    num_epochs = 5
    for epoch in range(num_epochs):
        # passing the model into training mode
        model.train()
        train_loss = 0
        train_accuracy =0
        for batch_idx, (images, questions, answers) in enumerate(train_loader):
            images, questions, answers = images.to(device), questions.to(device), answers.to(device)
            optimizer.zero_grad()
            output = model(images, questions)
           
            loss = criterion(output, answers.long())
            train_loss+= loss.item() 
            loss.backward()
            optimizer.step()

            _, predicted = torch.max(output, dim=1)  
            predicted = predicted.cpu().numpy()
            predicted_answers = [train_category_id2name[pred] for pred in predicted if pred in train_category_id2name]
            train_batch_accuracy = evaluate_predictions(predicted_answers,train_data[batch_idx * answers.size(0) : (batch_idx + 1) * answers.size(0)])
            train_accuracy += train_batch_accuracy 
            

        train_loss /= len(train_loader)
        train_accuracy = train_accuracy / len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}] - Training Loss: {train_loss:.4f} - Train accuracy:{train_accuracy:.2f}")

        # Validating the model using validation data
        # passing the model into evaluation mode
        model.eval()
        val_loss = 0
        val_accuracy =0
        # running the loop with any gradients calcualtion
        with torch.no_grad():
            for batch_idx, (images, questions, answers) in enumerate(val_loader):
                images, questions, answers = images.to(device),questions.to(device),answers.to(device)
                output = model(images, questions)
                loss = criterion(output, answers.long())
                val_loss += loss.item()

                _,predicted_labels = torch.max(output, dim=1)
                predicted_labels = predicted_labels.cpu().numpy()
                predicted_answers = [val_category_id2name[pred] for pred in predicted_labels if pred in val_category_id2name]
                val_batch_accuracy = evaluate_predictions(predicted_answers,val_data[batch_idx * answers.size(0) : (batch_idx + 1) * answers.size(0)])
                val_accuracy += val_batch_accuracy 
        val_loss /= len(val_loader)
        val_accuracy = val_accuracy /len(val_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}] - validation Loss: {val_loss:.4f} - validation accuracy:{val_accuracy:.2f}")
        if val_accuracy > best_validation_accuracy:
            best_validation_accuracy = val_accuracy
            best_model_state = model.state_dict()
        # Early Stopping & Pruning
        trial.report(val_loss, epoch)
        if trial.should_prune():
            print(" Trial Pruned Due to No Improvement!")
            raise optuna.exceptions.TrialPruned()

    print(f"Trial {trial.number + 1} Completed! Final Validation Accuracy: {val_accuracy :.4f}\n")
    if best_validation_accuracy > global_best_accuracy:
        global_best_accuracy = best_validation_accuracy
        global_best_model_state = best_model_state
    
    return best_validation_accuracy

# Running the Optuna Hyperparameter Tuning aiming to minimize the validation loss
study = optuna.create_study(direction="maximize") 
# running the tuning for ten 10 which takes different combination of hyperparameters each time
study.optimize(optimizer, n_trials=5) 

# Best Hyperparameters
print("\n Best Hyperparameters Found:")
print(study.best_params)

# Saving the Best Model
if global_best_model_state is not None:
    torch.save(global_best_model_state, "best_multimodal_answerable.pth")
    print("Best model saved successfully!")
else:
    print("No best model found to save.")

[I 2025-03-21 12:06:56,585] A new study created in memory with name: no-name-4ae57487-cb44-4399-af47-5afee1706e45
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-3, 5e-3)



 Running Trial 1 
Selected Hyperparameters: lr=0.004113372990308278, batch_size=32, heads=2, hidden_dim=256, optimizer=Adam
Epoch [1/5] - Training Loss: 3.5780 - Train accuracy:0.12
Epoch [1/5] - validation Loss: 3.5729 - validation accuracy:0.00
Epoch [2/5] - Training Loss: 3.1932 - Train accuracy:0.10
Epoch [2/5] - validation Loss: 3.5217 - validation accuracy:0.00
Epoch [3/5] - Training Loss: 3.0315 - Train accuracy:0.09
Epoch [3/5] - validation Loss: 3.4593 - validation accuracy:0.00
Epoch [4/5] - Training Loss: 2.9580 - Train accuracy:0.08
Epoch [4/5] - validation Loss: 3.3497 - validation accuracy:0.40
Epoch [5/5] - Training Loss: 2.9272 - Train accuracy:0.10


[I 2025-03-21 12:08:30,014] Trial 0 finished with value: 0.4034722222222221 and parameters: {'learning_rate': 0.004113372990308278, 'batch_size': 32, 'num_heads': 2, 'hidden_dimension': 256, 'optimizers': 'Adam'}. Best is trial 0 with value: 0.4034722222222221.


Epoch [5/5] - validation Loss: 3.5391 - validation accuracy:0.20
Trial 1 Completed! Final Validation Accuracy: 0.1983


 Running Trial 2 
Selected Hyperparameters: lr=0.0024903564969056846, batch_size=16, heads=4, hidden_dim=512, optimizer=Adam
Epoch [1/5] - Training Loss: 3.7087 - Train accuracy:0.13
Epoch [1/5] - validation Loss: 3.3204 - validation accuracy:0.55
Epoch [2/5] - Training Loss: 3.2267 - Train accuracy:0.09
Epoch [2/5] - validation Loss: 3.5874 - validation accuracy:0.00
Epoch [3/5] - Training Loss: 3.0710 - Train accuracy:0.09
Epoch [3/5] - validation Loss: 3.4517 - validation accuracy:0.00
Epoch [4/5] - Training Loss: 2.9890 - Train accuracy:0.08
Epoch [4/5] - validation Loss: 3.4728 - validation accuracy:0.01
Epoch [5/5] - Training Loss: 2.9412 - Train accuracy:0.08


[I 2025-03-21 12:10:21,003] Trial 1 finished with value: 0.5533625730994153 and parameters: {'learning_rate': 0.0024903564969056846, 'batch_size': 16, 'num_heads': 4, 'hidden_dimension': 512, 'optimizers': 'Adam'}. Best is trial 1 with value: 0.5533625730994153.


Epoch [5/5] - validation Loss: 3.6216 - validation accuracy:0.32
Trial 2 Completed! Final Validation Accuracy: 0.3194


 Running Trial 3 
Selected Hyperparameters: lr=0.0010586066394518549, batch_size=32, heads=2, hidden_dim=512, optimizer=Adam
Epoch [1/5] - Training Loss: 3.5113 - Train accuracy:0.14
Epoch [1/5] - validation Loss: 3.3674 - validation accuracy:0.39
Epoch [2/5] - Training Loss: 3.1281 - Train accuracy:0.10
Epoch [2/5] - validation Loss: 3.5858 - validation accuracy:0.00
Epoch [3/5] - Training Loss: 3.0250 - Train accuracy:0.09
Epoch [3/5] - validation Loss: 3.3818 - validation accuracy:0.01
Epoch [4/5] - Training Loss: 2.9491 - Train accuracy:0.11
Epoch [4/5] - validation Loss: 3.4187 - validation accuracy:0.29
Epoch [5/5] - Training Loss: 2.8881 - Train accuracy:0.11


[I 2025-03-21 12:11:53,822] Trial 2 finished with value: 0.3868055555555555 and parameters: {'learning_rate': 0.0010586066394518549, 'batch_size': 32, 'num_heads': 2, 'hidden_dimension': 512, 'optimizers': 'Adam'}. Best is trial 1 with value: 0.5533625730994153.


Epoch [5/5] - validation Loss: 3.4606 - validation accuracy:0.28
Trial 3 Completed! Final Validation Accuracy: 0.2774


 Running Trial 4 
Selected Hyperparameters: lr=0.0011997741872213214, batch_size=16, heads=4, hidden_dim=256, optimizer=SGD
Epoch [1/5] - Training Loss: 3.7736 - Train accuracy:0.12
Epoch [1/5] - validation Loss: 3.5471 - validation accuracy:0.41
Epoch [2/5] - Training Loss: 3.2711 - Train accuracy:0.11
Epoch [2/5] - validation Loss: 3.5703 - validation accuracy:0.14
Epoch [3/5] - Training Loss: 3.1864 - Train accuracy:0.10
Epoch [3/5] - validation Loss: 3.4069 - validation accuracy:0.23
Epoch [4/5] - Training Loss: 3.1367 - Train accuracy:0.10
Epoch [4/5] - validation Loss: 3.5588 - validation accuracy:0.01
Epoch [5/5] - Training Loss: 3.0922 - Train accuracy:0.12


[I 2025-03-21 12:13:40,923] Trial 3 finished with value: 0.4097222222222222 and parameters: {'learning_rate': 0.0011997741872213214, 'batch_size': 16, 'num_heads': 4, 'hidden_dimension': 256, 'optimizers': 'SGD'}. Best is trial 1 with value: 0.5533625730994153.


Epoch [5/5] - validation Loss: 3.5043 - validation accuracy:0.09
Trial 4 Completed! Final Validation Accuracy: 0.0881


 Running Trial 5 
Selected Hyperparameters: lr=0.0012262424348634532, batch_size=32, heads=2, hidden_dim=256, optimizer=AdamW
Epoch [1/5] - Training Loss: 3.4675 - Train accuracy:0.12
Epoch [1/5] - validation Loss: 3.4534 - validation accuracy:0.11
Epoch [2/5] - Training Loss: 3.1218 - Train accuracy:0.09
Epoch [2/5] - validation Loss: 3.3755 - validation accuracy:0.30
Epoch [3/5] - Training Loss: 2.9927 - Train accuracy:0.12
Epoch [3/5] - validation Loss: 3.4871 - validation accuracy:0.18
Epoch [4/5] - Training Loss: 2.9112 - Train accuracy:0.10
Epoch [4/5] - validation Loss: 3.5424 - validation accuracy:0.24
Epoch [5/5] - Training Loss: 2.8397 - Train accuracy:0.11


[I 2025-03-21 12:15:13,917] Trial 4 finished with value: 0.3024305555555556 and parameters: {'learning_rate': 0.0012262424348634532, 'batch_size': 32, 'num_heads': 2, 'hidden_dimension': 256, 'optimizers': 'AdamW'}. Best is trial 1 with value: 0.5533625730994153.


Epoch [5/5] - validation Loss: 3.6709 - validation accuracy:0.23
Trial 5 Completed! Final Validation Accuracy: 0.2337


 Best Hyperparameters Found:
{'learning_rate': 0.0024903564969056846, 'batch_size': 16, 'num_heads': 4, 'hidden_dimension': 512, 'optimizers': 'Adam'}
Best model saved successfully!


## Testing the Model

In [33]:
best_model = answergeneration(
   num_classes=301,
   hidden_dim=study.best_params["hidden_dimension"],
   dropout_prob=0.1,
   num_heads=study.best_params["num_heads"]
).to(device)

best_model.load_state_dict(torch.load("best_multimodal_answerable.pth"))
best_model.eval()

test_loader = DataLoader(test_dataset, study.best_params["batch_size"], shuffle=False)

predictions = []

with torch.no_grad():
    for images, questions in test_loader:
        images, questions= images.to(device), questions.to(device)

        outputs = best_model(images, questions).squeeze(dim=-1)
        _,predicted_labels = torch.max(outputs, dim=1)
        predicted_labels = predicted_labels.cpu().numpy()
        predicted_answers = [train_category_id2name[pred] for pred in predicted_labels if pred in train_category_id2name]
        predictions.append(predicted_answers)

print(predictions)

flat_predictions = [answer for batch in predictions for answer in batch]

# Ensuring the lengths match
assert len(top_100_image_urls) == len(flat_predictions), "Mismatch in image URLs and predictions count!"

# Creating the  JSON structure
submission_data = [{"image": img_url, "answer": pred} for img_url, pred in zip(top_100_image_urls, flat_predictions)]

# Saving to JSON file
submission_filename = "Manidatta_Anumandla_challenge2.json"
with open(submission_filename, "w") as json_file:
    json.dump(submission_data, json_file, indent=4)

print(f"Submission file saved as: {submission_filename}")


[['grey', 'other_categories', 'grey', 'other_categories', 'other_categories', 'unanswerable', 'other_categories', 'other_categories', 'other_categories', 'grey', 'unanswerable', 'other_categories', 'unanswerable', 'unanswerable', 'unanswerable', 'unanswerable'], ['other_categories', 'unanswerable', 'unanswerable', 'grey', 'unanswerable', 'other_categories', 'unanswerable', 'unanswerable', 'unanswerable', 'unanswerable', 'unanswerable', 'other_categories', 'unanswerable', 'unanswerable', 'unanswerable', 'grey'], ['unanswerable', 'unanswerable', 'unanswerable', 'unanswerable', 'unanswerable', 'other_categories', 'other_categories', 'other_categories', 'unanswerable', 'unanswerable', 'unanswerable', 'other_categories', 'other_categories', 'unanswerable', 'grey', 'unanswerable'], ['other_categories', 'other_categories', 'unanswerable', 'other_categories', 'unanswerable', 'grey', 'unanswerable', 'other_categories', 'grey', 'unanswerable', 'other_categories', 'unanswerable', 'unanswerable', 

In [45]:
import json

# Load JSON file
submission_filename = "Manidatta_Anumandla_challenge2.json"

with open(submission_filename, "r") as json_file:
    submission_data = json.load(json_file)

# Display first few entries
print("First 5 predictions from the JSON file:")
for entry in submission_data[:]:
    print(entry)


First 5 predictions from the JSON file:
{'image': 'https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_test_00000000.jpg', 'answer': 'grey'}
{'image': 'https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_test_00000001.jpg', 'answer': 'other_categories'}
{'image': 'https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_test_00000002.jpg', 'answer': 'grey'}
{'image': 'https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_test_00000003.jpg', 'answer': 'other_categories'}
{'image': 'https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_test_00000004.jpg', 'answer': 'other_categories'}
{'image': 'https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_test_00000005.jpg', 'answer': 'unanswerable'}
{'image': 'https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_test_00000006.jpg', 'answer': 'other_categories'}
{'image': 'https://vizwiz.cs.colorado.edu/VizWiz_visualization_img/VizWiz_test_00000007.jpg', 'answer': 'other_categorie

In [46]:
torch.load("Manidatta_Anumandla_challenge1.pkl")

tensor([1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
        1, 0, 1, 1], dtype=torch.int32)