a. Use a pre-trained CNN (e.g., ResNet50) to extract image features

In [None]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

In [None]:
# Pre-trained ResNet50, repackaged as a feature extractor
full_resnet = models.resnet50(pretrained=True)
# Keep every child module except the last one, so the forward pass stops
# before the network's final layer (presumably the classification head).
feature_layers = list(full_resnet.children())[:-1]
resnet = torch.nn.Sequential(*feature_layers)
# Switch to evaluation mode; as the last expression of the cell this also
# displays the module structure.
resnet.eval()



Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [None]:
# Image preprocessing: resize to 224x224, convert to a tensor, then
# normalise each of the three channels with a fixed mean / std.
# NOTE(review): these values look like the ImageNet statistics that
# torchvision documents for its pre-trained models — confirm if the backbone changes.
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=norm_mean, std=norm_std),
]
transform = transforms.Compose(preprocessing_steps)

In [None]:
# Extract features
def extract_image_features(img_path):
    """Run one image through the module-level `resnet` and return its features.

    The image at `img_path` is opened, converted to RGB, preprocessed with the
    module-level `transform`, and passed through `resnet` with gradient
    tracking disabled.

    Args:
        img_path: Path of the image file to open.

    Returns:
        The network output with all size-1 dimensions removed, as a NumPy array.
    """
    rgb_image = Image.open(img_path).convert("RGB")
    # unsqueeze(0) adds a leading batch dimension of size 1.
    batch = transform(rgb_image).unsqueeze(0)
    with torch.no_grad():
        return resnet(batch).squeeze().numpy()

image_folder = "/home/BTECH_7TH_SEM/Desktop/VII Sem/MML/MS-COCO/val2017"
image_features = {}

# os.listdir returns *every* directory entry in arbitrary order. Keep only
# files named "<numeric id>.<image extension>" so that stray entries (for
# example hidden files or checkpoint folders) cannot crash Image.open or the
# int() conversion partway through a long run. Sorting makes the processing
# order, and therefore the dict order, reproducible.
image_extensions = (".jpg", ".jpeg", ".png")
image_files = sorted(
    name
    for name in os.listdir(image_folder)
    if os.path.splitext(name)[1].lower() in image_extensions
    and os.path.splitext(name)[0].isdecimal()
)

for img_file in tqdm(image_files):
    img_path = os.path.join(image_folder, img_file)
    feat = extract_image_features(img_path)
    # The file stem is the numeric image id (leading zeros are dropped by int()).
    image_id = int(os.path.splitext(img_file)[0])
    image_features[image_id] = feat

# The dict is stored as a pickled object; reading it back requires
# np.load(..., allow_pickle=True).item().
np.save("val2017_resnet50_features.npy", image_features)

100%|███████████████████████████████████████| 5000/5000 [03:21<00:00, 24.76it/s]


b. Use a pre-trained BERT model to extract text features

In [None]:
# Load the tokenizer and the encoder from the same pre-trained checkpoint.
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert = BertModel.from_pretrained(bert_checkpoint)
# Put the encoder in evaluation mode before extracting features.
bert.eval()

def extract_text_features(text):
    """Encode `text` with the module-level `tokenizer` / `bert` pair.

    The text is tokenized (with truncation and padding enabled) into PyTorch
    tensors and passed through `bert` with gradient tracking disabled.

    Args:
        text: The text to encode.

    Returns:
        The last hidden state at sequence position 0, with size-1 dimensions
        removed, as a NumPy array.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        hidden_states = bert(**encoded).last_hidden_state
    # Position 0 along the sequence axis is the first token
    # (presumably the [CLS] token added by the BERT tokenizer).
    first_token_state = hidden_states[:, 0, :]
    return first_token_state.squeeze().numpy()

# Read the caption annotations file.
with open("/home/BTECH_7TH_SEM/Desktop/VII Sem/MML/MS-COCO/annotations_trainval2017/annotations/captions_val2017.json", "r") as f:
    captions_data = json.load(f)

# Map each image id to the list of feature vectors of its captions,
# in the order the annotations appear in the file.
caption_features = {}
for ann in tqdm(captions_data["annotations"]):
    owner_id = ann["image_id"]
    caption_vec = extract_text_features(ann["caption"])
    # setdefault creates the per-image list the first time an id is seen.
    caption_features.setdefault(owner_id, []).append(caption_vec)

np.save("val2017_bert_features.npy", caption_features)

100%|█████████████████████████████████████| 25014/25014 [05:48<00:00, 71.74it/s]


In [None]:
# Reload the saved feature dictionaries (presumably the files written by the
# extraction cells). Each file holds a pickled dict, hence allow_pickle=True
# and .item() to unwrap it; only load pickled files from a trusted source.
img_features = np.load("/home/BTECH_7TH_SEM/Desktop/VII Sem/MML/val2017_resnet50_features.npy", allow_pickle=True).item()
cap_features = np.load("/home/BTECH_7TH_SEM/Desktop/VII Sem/MML/val2017_bert_features.npy", allow_pickle=True).item()

print("Number of images:", len(img_features))
print("Number of caption entries:", len(cap_features))

# Pick one image: the first key of the image-feature dict
img_id = list(img_features)[0]
sample_img_vec = img_features[img_id]

print("Image ID:", img_id)
print("Image feature shape:", sample_img_vec.shape)
print("First 10 values:", sample_img_vec[:10])

# First stored caption vector for the same image id
sample_cap_vec = cap_features[img_id][0]
print("Caption feature shape:", sample_cap_vec.shape)
print("First 10 values:", sample_cap_vec[:10])

In [None]:
img_path = "/home/BTECH_7TH_SEM/Desktop/VII Sem/MML/MS-COCO/val2017/000000003845.jpg"

# Derive the numeric image id from the file name (stem without extension).
file_name = os.path.basename(img_path)
file_stem, _ext = os.path.splitext(file_name)
img_id = int(file_stem)

# Show the image without axes, titled with its id.
img = Image.open(img_path).convert("RGB")
plt.imshow(img)
plt.axis("off")
plt.title(f"Image ID: {img_id}")
plt.show()

captions_file = "/home/BTECH_7TH_SEM/Desktop/VII Sem/MML/MS-COCO/annotations_trainval2017/annotations/captions_val2017.json"
with open(captions_file, "r") as f:
    data = json.load(f)

# Print every caption whose annotation refers to this image id.
print("\nCaptions for this image:")
matching_captions = (
    entry["caption"] for entry in data["annotations"] if entry["image_id"] == img_id
)
for caption in matching_captions:
    print("-", caption)