# Read the Flickr8K captions dataset

In [1]:
import pandas as pd

# Location of the Flickr8K captions table
csv_path = "G:/multimodal_ai/datasets/captions.csv"

# Read the table into a DataFrame, then show its first rows as the cell output
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head(n=5)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


# Test image feature extraction with the open_clip model

In [2]:
import torch
import open_clip
from PIL import Image
import os

# Load the CLIP model together with its image transforms.
# create_model_and_transforms returns (model, preprocess_train, preprocess_val).
# Feature extraction must use the validation transform: the training transform
# applies random augmentation, which would make the extracted features
# non-deterministic for the same image.
model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
# The model is created in train mode; switch to eval mode for inference.
model.eval()

def extract_image_features(image_path):
    """Encode a single image file with the CLIP image encoder.

    The file is opened with PIL, converted to RGB, passed through the
    module-level ``preprocess`` transform and encoded by the module-level
    ``model`` with gradient tracking disabled.

    Args:
        image_path: Path of the image file to encode.

    Returns:
        The encoder output for a batch containing this one image, as a
        NumPy array.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    # unsqueeze(0) adds the leading batch axis before encoding
    batch = preprocess(rgb_image).unsqueeze(0)
    with torch.no_grad():
        embedding = model.encode_image(batch)
    return embedding.cpu().numpy()

# Example usage
image_path = "G:/multimodal_ai/datasets/Images/1000268201_693b08cb0e.jpg"
features = extract_image_features(image_path)
print("Feature Shape:", features.shape)

  from .autonotebook import tqdm as notebook_tqdm


Feature Shape: (1, 512)


# Test text feature extraction with the open_clip model

In [3]:
# Tokenizer matching the ViT-B-32 model loaded earlier
tokenizer = open_clip.get_tokenizer("ViT-B-32")

def extract_text_features(text):
    """Encode a single string with the CLIP text encoder.

    The text is wrapped in a one-element list, tokenized with the
    module-level ``tokenizer`` and encoded by the module-level ``model``
    with gradient tracking disabled.

    Args:
        text: The caption (or other text) to encode.

    Returns:
        The encoder output for a batch containing this one text, as a
        NumPy array.
    """
    # NOTE(review): the text encoder has a fixed context length, so very long
    # inputs are presumably truncated by the tokenizer - confirm.
    token_batch = tokenizer([text])
    with torch.no_grad():
        embedding = model.encode_text(token_batch)
    return embedding.cpu().numpy()

# Example usage
caption = "A child in a pink dress is climbing up a set of stairs"
text_features = extract_text_features(caption)
print("Text Feature Shape:", text_features.shape)

Text Feature Shape: (1, 512)


# Test FAISS for storing and loading the image features

In [4]:
import faiss
import numpy as np
# Create FAISS index
EMBEDDING_DIM = 512  # width of the CLIP embeddings printed in the cells above
index = faiss.IndexFlatL2(EMBEDDING_DIM)

def add_to_index(features):
    """Append one or more embeddings to the module-level FAISS ``index``.

    FAISS works on C-contiguous float32 matrices of shape (n, d), so the
    input is coerced to that layout before being added. A single 1-D vector
    is treated as a batch of one.

    Args:
        features: Array-like holding one embedding of length d, or a
            (n, d) batch of embeddings.
    """
    vectors = np.ascontiguousarray(features, dtype=np.float32)
    if vectors.ndim == 1:
        # Promote a lone vector to a (1, d) batch
        vectors = vectors.reshape(1, -1)
    index.add(vectors)

# Example usage
add_to_index(features)  # Add image features
print("FAISS Index Size:", index.ntotal)

FAISS Index Size: 1


In [5]:
import os
# Define path
faiss_index_path = "G:/multimodal_ai/models/faiss_index.idx"
# Ensure the directory exists
os.makedirs(os.path.dirname(faiss_index_path), exist_ok=True)
# Write and read through the same variable so the two calls cannot drift apart
faiss.write_index(index, faiss_index_path)

# To load later
index = faiss.read_index(faiss_index_path)
print("Loaded FAISS Index Size:", index.ntotal)

Loaded FAISS Index Size: 1


# Merge the Flickr8k and Flickr30k datasets and combine the 5 captions of each image

In [20]:
import pandas as pd

# Define file paths
captions8k_path = "G:/multimodal_ai/datasets/captions8k.csv"
captions30k_path = "G:/multimodal_ai/datasets/captions30k.csv"
final_csv_path = "G:/multimodal_ai/datasets/final_captions.csv"

# Load both datasets
df_8k = pd.read_csv(captions8k_path)
df_30k = pd.read_csv(captions30k_path)

# Ensure both datasets have the same column names
df_8k = df_8k.rename(columns={"filename": "image"})
df_30k = df_30k.rename(columns={"filename": "image"})

# Combine both datasets
df_combined = pd.concat([df_8k, df_30k], ignore_index=True)

# Replace missing captions BEFORE casting to str: astype(str) turns NaN into
# the literal text "nan", after which fillna has nothing left to fill.
# Surrounding whitespace is stripped so captions join cleanly below.
df_combined["caption"] = df_combined["caption"].fillna("").astype(str).str.strip()

# Rows without any caption text carry no information for the merge; drop them
# (and say so) instead of letting blank or "nan" text leak into the output.
blank_caption = df_combined["caption"].eq("")
if blank_caption.any():
    print(f"Dropped {int(blank_caption.sum())} rows with a missing or empty caption")
df_combined = df_combined.loc[~blank_caption]

# Merge all captions for each unique image, separated by a single space so the
# end of one caption does not run into the start of the next
df_final = df_combined.groupby("image")["caption"].agg(" ".join).reset_index()

# Save the final dataset
df_final.to_csv(final_csv_path, index=False)

print(f"✅ Final merged dataset saved at: {final_csv_path}")
print(f"Total Unique Images: {df_final['image'].nunique()}")


✅ Final merged dataset saved at: G:/multimodal_ai/datasets/final_captions.csv
Total Unique Images: 39874


In [21]:
# Preview the merged table produced by the previous cell
df_final.head(n=5)

Unnamed: 0,image,caption
0,1000092795.jpg,Two young guys with shaggy hair look at their...
1,10002456.jpg,Several men in hard hats are operating a gian...
2,1000268201.jpg,A child in a pink dress is climbing up a set ...
3,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
4,1000344755.jpg,Someone in a blue shirt and hat is standing o...


In [22]:
# Inspect the merged caption of the first image. Captions are merged per image
# to avoid storing duplicate image features in the FAISS index.
df_final["caption"].iloc[0]

' Two young guys with shaggy hair look at their hands while hanging out in the yard . Two young , White males are outside near many bushes . Two men in green shirts are standing in a yard . A man in a blue shirt standing in a garden . Two friends enjoy time spent together .'

# Complete Feature Extraction

In [23]:
from tqdm import tqdm

# Load dataset
csv_path = "G:/multimodal_ai/datasets/final_captions.csv"
image_folder = "G:/multimodal_ai/datasets/Images/"
df = pd.read_csv(csv_path)

print(f"Dataset Loaded: {len(df)} image-caption pairs")

# Load CLIP model
print("Loading CLIP model...")
# create_model_and_transforms returns (model, preprocess_train, preprocess_val).
# Use the validation transform for feature extraction: the training transform
# applies random augmentation, so the same image would not always produce the
# same embedding.
model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
# The model is created in train mode; switch to eval mode for inference.
model.eval()
tokenizer = open_clip.get_tokenizer("ViT-B-32")
print("CLIP model loaded successfully!")

# Create FAISS index
index = faiss.IndexFlatL2(512)  # 512-D embedding space

# Function to extract image features
def extract_image_features(image_path):
    """Return the CLIP image-encoder output for the image at ``image_path``.

    Opens the file with PIL, converts it to RGB, applies the module-level
    ``preprocess`` transform, adds a batch axis and encodes it with the
    module-level ``model`` under ``torch.no_grad()``.

    Args:
        image_path: Path of the image file to encode.

    Returns:
        NumPy array holding the encoder output for this single-image batch.
    """
    pil_rgb = Image.open(image_path).convert("RGB")
    pixel_batch = preprocess(pil_rgb).unsqueeze(0)  # add batch axis
    with torch.no_grad():
        return model.encode_image(pixel_batch).cpu().numpy()

# Function to extract text features
def extract_text_features(text):
    """Return the CLIP text-encoder output for ``text``.

    Tokenizes the text as a one-element batch with the module-level
    ``tokenizer`` and encodes it with the module-level ``model`` under
    ``torch.no_grad()``.

    Args:
        text: The string to encode.

    Returns:
        NumPy array holding the encoder output for this single-text batch.
    """
    # NOTE(review): merged multi-caption strings may exceed the text encoder's
    # context length and be truncated by the tokenizer - confirm.
    token_ids = tokenizer([text])
    with torch.no_grad():
        return model.encode_text(token_ids).cpu().numpy()

# Batch process images & captions
print("Extracting features and indexing data...")
indexed_images = []  # image name of every FAISS row, in insertion order
skipped_images = []  # images listed in the CSV but not found on disk
for image_name, caption in tqdm(zip(df["image"], df["caption"]), total=len(df), desc="Processing Data"):
    image_path = os.path.join(image_folder, image_name)
    if not os.path.exists(image_path):
        # Record the gap: a skipped row shifts every later FAISS id relative
        # to the CSV row order, so it must not pass unnoticed.
        skipped_images.append(image_name)
        continue

    # A blank caption is read back from CSV as NaN; encode it as empty text
    # instead of handing a float to the tokenizer.
    caption_text = "" if pd.isna(caption) else str(caption)

    image_features = extract_image_features(image_path)  # (1, 512)
    text_features = extract_text_features(caption_text)  # (1, 512)

    # Combine features by element-wise mean of the two embeddings.
    # NOTE(review): the embeddings are averaged without normalization, so the
    # modality with the larger norm dominates - confirm this is intended.
    combined_features = (image_features + text_features) / 2  # (1, 512)
    combined_features = np.array(combined_features, dtype=np.float32).reshape(1, -1)

    # Add to FAISS and remember which image this row belongs to
    index.add(combined_features)
    indexed_images.append(image_name)

print(f"Feature extraction complete! Total embeddings stored: {index.ntotal}")
if skipped_images:
    print(
        f"Warning: {len(skipped_images)} images were not found and were skipped; "
        "FAISS row i corresponds to indexed_images[i], not to row i of the CSV"
    )

# Save FAISS index
faiss_index_path = "G:/multimodal_ai/models/faiss_index.idx"
os.makedirs(os.path.dirname(faiss_index_path), exist_ok=True)
faiss.write_index(index, faiss_index_path)
print("FAISS index saved successfully!")

Dataset Loaded: 39874 image-caption pairs
Loading CLIP model...




CLIP model loaded successfully!
Extracting features and indexing data...


Processing Data: 100%|█████████████████████████████████████████████████████████| 39874/39874 [1:52:58<00:00,  5.88it/s]


Feature extraction complete! Total embeddings stored: 39874
FAISS index saved successfully!


In [24]:
# Final marker so the end of the run is visible in the notebook output
print("FEATURE EXTRACTION COMPLETED!")

FEATURE EXTRACTION COMPLETED!
