In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os

# Custom Dataset for loading images and corresponding questions
class VQADataset(Dataset):
    """Dataset pairing each image file in a directory with one question string.

    Indexing returns an ``(image, question)`` tuple: the image is opened from
    ``img_dir``, converted to RGB and passed through ``transform`` when one is
    given; the question is the text stored for that image filename.
    """

    def __init__(self, img_dir, questions, transform=None):
        """
        Args:
            img_dir (str): Path to the directory with images.
            questions (dict): A dictionary of {image_filename: question} pairs.
            transform (callable, optional): Optional transform to be applied on an image.
        """
        self.img_dir = img_dir
        self.questions = questions
        self.transform = transform
        # Snapshot the filenames once so index lookups are O(1) instead of
        # rebuilding the key list on every __getitem__ call. Keys added to
        # `questions` after construction are therefore not picked up.
        self._img_names = list(questions)

    def __len__(self):
        """Return the number of image/question pairs."""
        return len(self._img_names)

    def __getitem__(self, idx):
        """Load the idx-th image (in dict insertion order) and its question.

        Raises:
            IndexError: If ``idx`` is outside the range of stored pairs.
        """
        img_name = self._img_names[idx]
        image_path = os.path.join(self.img_dir, img_name)

        # convert() returns a fully loaded copy, so the underlying file can be
        # closed as soon as the conversion is done instead of lingering until GC.
        with Image.open(image_path) as img_file:
            image = img_file.convert("RGB")

        if self.transform:
            image = self.transform(image)

        question = self.questions[img_name]
        return image, question


# Preprocessing constants
RESIZE_SIZE = 256   # passed to transforms.Resize
CROP_SIZE = 224     # side length of the centre crop fed to the network
# Per-channel normalisation statistics (the standard ImageNet values used with
# torchvision's pre-trained models)
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Image transformations: resize -> centre crop -> tensor -> per-channel normalisation
_preprocess_steps = [
    transforms.Resize(RESIZE_SIZE),
    transforms.CenterCrop(CROP_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
]
transform = transforms.Compose(_preprocess_steps)

# Example questions, keyed by the image filename inside IMG_DIR
questions = {
    "image1.jpg": "What color is the hearing aid?",
    "image2.jpg": "Where is the volume control?",
    # Add more image-question pairs here...
}

# Dataset / DataLoader configuration
IMG_DIR = "images"
BATCH_SIZE = 4

# Batches are drawn in a shuffled order, so the order differs between runs
dataset = VQADataset(img_dir=IMG_DIR, questions=questions, transform=transform)
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)


In [2]:
import torch.nn as nn
from torchvision.models import resnet50

# Load pre-trained ResNet50 model (the checkpoint is downloaded on first use and cached locally).
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in favour of the
# `weights=` argument — confirm against the installed version before upgrading.
resnet_model = resnet50(pretrained=True)

# Modify ResNet to output features before the final fully connected layer
class FeatureExtractor(nn.Module):
    """Wrap a model so that it runs every top-level child except the last one.

    The children of ``original_model`` are chained in an ``nn.Sequential`` with
    the final child dropped (for the ResNet passed in below, that is its fully
    connected classification layer), so the forward pass returns the activations
    that would have been fed to that last layer.
    """

    def __init__(self, original_model):
        """
        Args:
            original_model (nn.Module): Model whose last top-level child is dropped.
        """
        super().__init__()
        child_layers = list(original_model.children())
        # Keep everything but the final child
        self.features = nn.Sequential(*child_layers[:-1])

    def forward(self, x):
        """Run ``x`` through the retained layers and return their output."""
        return self.features(x)

# Build the feature extractor around the pre-trained ResNet and put it in
# evaluation mode right away (nn.Module.eval() returns the module itself)
feature_extractor = FeatureExtractor(resnet_model).eval()

# Function to extract features for a batch of images
@torch.no_grad()
def extract_visual_features(batch_images):
    """Run a batch of images through the feature extractor without tracking gradients.

    Args:
        batch_images (torch.Tensor): Batch of preprocessed images; the first
            dimension is the batch dimension.

    Returns:
        torch.Tensor: One flat feature vector per image, shape (batch_size, -1).
        The original note gives this as (batch_size, 2048) for the ResNet50
        extractor defined in this notebook.
    """
    pooled = feature_extractor(batch_images)
    # Collapse every dimension after the batch axis into a single feature axis
    return pooled.view(pooled.size(0), -1)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:02<00:00, 46.6MB/s]


In [4]:
from transformers import LxmertTokenizer, LxmertForQuestionAnswering
import json

# Load pre-trained LXMERT model and tokenizer; both come from the same checkpoint,
# so the identifier is named once and reused
LXMERT_CHECKPOINT = "unc-nlp/lxmert-vqa-uncased"
lxmert_model = LxmertForQuestionAnswering.from_pretrained(LXMERT_CHECKPOINT)
lxmert_tokenizer = LxmertTokenizer.from_pretrained(LXMERT_CHECKPOINT)

# Random position embeddings (in practice, you'd use bounding boxes for objects)
def generate_random_positions(batch_size, num_objects=36):
    """Create placeholder box coordinates for every object slot in a batch.

    Args:
        batch_size (int): Number of images in the batch.
        num_objects (int): Number of object slots per image (default 36).

    Returns:
        torch.Tensor: Tensor of shape (batch_size, num_objects, 4) filled with
        independent uniform samples from [0, 1). The four values stand in for
        [x_min, y_min, x_max, y_max]; because they are sampled independently,
        a "min" coordinate may be larger than the matching "max".
    """
    coords_per_box = 4  # x_min, y_min, x_max, y_max
    box_shape = (batch_size, num_objects, coords_per_box)
    return torch.rand(box_shape)

# Location of the index -> answer-string vocabulary used to decode the VQA head.
# NOTE(review): confirm this URL actually serves the label list for this checkpoint.
ANSWER_VOCAB_URL = "https://huggingface.co/unc-nlp/lxmert-vqa-uncased/raw/main/data/answers.json"

# Parsed vocabularies keyed by their source, so each one is read at most once
_ANSWER_VOCAB_CACHE = {}


def _load_answer_list(source=ANSWER_VOCAB_URL):
    """Return the JSON answer vocabulary stored at ``source`` (URL or local path), cached."""
    if source not in _ANSWER_VOCAB_CACHE:
        if source.startswith(("http://", "https://")):
            # The built-in open() only handles local paths; remote files must be
            # fetched over HTTP.
            from urllib.request import urlopen

            with urlopen(source, timeout=30) as response:
                _ANSWER_VOCAB_CACHE[source] = json.load(response)
        else:
            with open(source, encoding="utf-8") as f:
                _ANSWER_VOCAB_CACHE[source] = json.load(f)
    return _ANSWER_VOCAB_CACHE[source]


# Function to run VQA for a batch of images and questions
def vqa_batch(images, questions, answer_list=None):
    """Predict one answer per (image, question) pair with LXMERT.

    The i-th question is paired with the i-th image. Each image's ResNet
    feature vector is duplicated across 36 object slots and combined with
    random box positions, so the visual input is a simplification rather than
    real per-object detections.

    Args:
        images (torch.Tensor): Batch of preprocessed images accepted by
            ``extract_visual_features``.
        questions (iterable of str): One question per image, in the same order.
        answer_list (sequence, optional): Maps a predicted answer index to its
            answer string. When omitted, the vocabulary is loaded once from
            ``ANSWER_VOCAB_URL`` and cached for later calls.

    Returns:
        list: The predicted answer for each question, in input order.
    """
    num_objects = 36

    # Resolve the vocabulary once, up front: a missing/unreachable file fails
    # before any model work is done, and it is not re-read for every question.
    if answer_list is None:
        answer_list = _load_answer_list()

    # Extract features from ResNet
    visual_feats = extract_visual_features(images)  # (batch_size, feature_dim)
    batch_size = visual_feats.shape[0]

    # Repeat features to simulate 36 objects (for simplicity, we duplicate them)
    visual_feats = visual_feats.unsqueeze(1).repeat(1, num_objects, 1)

    # Generate random position embeddings
    visual_pos = generate_random_positions(batch_size, num_objects=num_objects)

    answers = []
    # Inference only: skip autograd bookkeeping for the LXMERT forward passes
    with torch.no_grad():
        for i, question in enumerate(questions):
            # Tokenize the question
            inputs = lxmert_tokenizer(question, return_tensors="pt")

            # Get the prediction from the LXMERT model for this single pair
            outputs = lxmert_model(
                input_ids=inputs['input_ids'],
                visual_feats=visual_feats[i].unsqueeze(0),
                visual_pos=visual_pos[i].unsqueeze(0),
            )

            # Highest-scoring answer index -> answer string
            predicted_answer_index = outputs["question_answering_score"].argmax(dim=-1).item()
            answers.append(answer_list[predicted_answer_index])

    return answers


config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/856M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [6]:
# Driver: run VQA over every batch and print each question with its predicted answer.
# The guard is true when the code runs as a script or in a notebook kernel
# (where __name__ is "__main__") and false when the file is imported as a module.
if __name__ == "__main__":
    # Load a batch of images and questions from the DataLoader
    # (data_loader is built with shuffle=True, so batch order differs between runs)
    for batch_images, batch_questions in data_loader:
        # Get the VQA predictions for the batch
        predicted_answers = vqa_batch(batch_images, batch_questions)

        # Answers come back in the same order as the questions, so zip pairs them up
        for question, answer in zip(batch_questions, predicted_answers):
            print(f"Q: {question} | Predicted Answer: {answer}")


Q: What color is the hearing aid? | Predicted Answer: grey
