Now, for each split and each image–metadata pair, we send an API request to the Gemini-1.5-flash model. The question–answer pairs it returns are written to `Si/Si_qa_data.csv` for split `Si`.

In [7]:
import json
import os
import re
import time

import google.generativeai as genai
import pandas as pd
from tqdm import tqdm

# --- CONFIG ---
# Root of the curated dataset; all paths below are derived from BASE_DIR and SPLIT.
BASE_DIR = '/home/jinesh14/CourseWork/VR_P2/dataset_curated'
SPLIT = 'S1'  # name of the split handled by this run
META_DIR = os.path.join(BASE_DIR, SPLIT, f'{SPLIT}_metadata')  # *.json metadata files
IMG_DIR = os.path.join(BASE_DIR, SPLIT, f'{SPLIT}_images')
CSV_PATH = os.path.join(BASE_DIR, SPLIT, f'{SPLIT}_qa_data.csv')  # header-less output CSV

# --- Gemini setup ---
# The API key is a secret and must never be stored in the notebook: it is read
# from the GOOGLE_API_KEY environment variable instead.
GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export your Gemini API key in the environment "
        "before running this cell."
    )
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel('models/gemini-1.5-flash')

# --- Load processed image paths ---
# Resume support: every image path already present in the output CSV is
# collected so that it is not sent to the model again.
processed_images = set()
if os.path.exists(CSV_PATH):
    try:
        df_existing = pd.read_csv(
            CSV_PATH,
            header=None,
            names=['image_path', 'question', 'answer'],
        )
    except pd.errors.EmptyDataError:
        # A zero-byte CSV is treated the same as a missing one.
        print("CSV exists but is empty. Starting fresh.")
    else:
        processed_images = set(df_existing['image_path'].unique())
        print(f"{len(processed_images)} images already processed.")


# --- Gather unprocessed metadata entries ---
def _flatten_context(record):
    """Render a metadata record as a single 'key: value, key: value' string.

    The "image_path" key is left out. List values are joined with ", ",
    string values are used as they are, and values of any other type are
    ignored. Fields that are blank after stripping are dropped.
    """
    parts = []
    for key, value in record.items():
        if key == "image_path":
            continue
        if isinstance(value, list):
            text = ', '.join(str(item) for item in value)
        elif isinstance(value, str):
            text = value
        else:
            continue
        text = text.strip()
        if text:
            parts.append(f"{key}: {text}")
    return ', '.join(parts)


entries = []
json_names = [name for name in os.listdir(META_DIR) if name.endswith('.json')]
for json_name in json_names:
    with open(os.path.join(META_DIR, json_name), 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    for record in records:
        img = record.get('image_path')
        # Keep only records whose image exists on disk and has no QA rows yet.
        if img and os.path.exists(img) and img not in processed_images:
            entries.append((img, _flatten_context(record)))

print(f"{len(entries)} new images to process.")

# --- Prompt Template ---
PROMPT_TEMPLATE = """Given the product image and the following context: {context},
generate 5 short question and one-word answer pairs that test visual understanding.
Each answer must be a single word (e.g., 'Red', 'Bag', 'Plastic').
Format as: Q1: <question> A1: <answer> ..."""

# Pause applied after every request: 4.3 s keeps the loop at ~14 requests per minute.
REQUEST_INTERVAL_SECONDS = 4.3

# "Q<n>:" / "A<n>:" markers, as requested by PROMPT_TEMPLATE.
_QA_MARKER = re.compile(r'\b([QA])(\d+)\s*:')


def _first_line(fragment):
    """Return the first line of `fragment` without surrounding blanks or '*' marks."""
    stripped = fragment.strip()
    if not stripped:
        return ''
    return stripped.splitlines()[0].strip(' \t*')


def _parse_qa_pairs(text):
    """Extract (question, answer) tuples from a model reply.

    The reply is split on its "Q<n>:" / "A<n>:" markers rather than on the bare
    letter "A", so questions and answers that contain a capital "A" or a colon
    (e.g. the answer "Aluminum") are kept intact. A question is paired with the
    next answer carrying the same number, whether both sit on one line or on
    separate lines. Pairs with an empty question or answer are discarded.
    """
    # re.split with capture groups yields:
    # [preamble, kind, number, body, kind, number, body, ...]
    parts = _QA_MARKER.split(text)
    pairs = []
    pending = None  # (number, question) waiting for its answer
    for idx in range(1, len(parts), 3):
        kind, number, body = parts[idx], parts[idx + 1], _first_line(parts[idx + 2])
        if kind == 'Q':
            pending = (number, body)
        elif pending is not None and pending[0] == number:
            if pending[1] and body:
                pairs.append((pending[1], body))
            pending = None
    return pairs


# --- Process loop ---
output_rows = []

for img_path, context in tqdm(entries):
    try:
        with open(img_path, 'rb') as f:
            image_data = f.read()

        prompt = PROMPT_TEMPLATE.format(context=context)

        try:
            response = model.generate_content([
                prompt,
                {"mime_type": "image/jpeg", "data": image_data}
            ])
        finally:
            # Throttle after every attempt, including failed ones, so that a
            # burst of errors cannot exceed the request rate.
            time.sleep(REQUEST_INTERVAL_SECONDS)

        if not response.text:
            print(f"[EMPTY RESPONSE] {img_path}")
            continue

        output_rows = [
            (img_path, question, answer)
            for question, answer in _parse_qa_pairs(response.text)
        ]
        if not output_rows:
            print(f"[NO QA PAIRS PARSED] {img_path}")
            continue

        # Save incrementally: append this image's rows so an interrupted run
        # can resume from the CSV.
        df = pd.DataFrame(output_rows, columns=['image_path', 'question', 'answer'])
        df.to_csv(CSV_PATH, mode='a', header=False, index=False)
        output_rows = []

    except Exception as e:
        # The quota check is a substring match on the error message; any other
        # failure is reported and the loop moves on to the next image.
        if 'quota' in str(e).lower():
            print(f"[QUOTA REACHED] Stopped at {img_path}")
            break
        print(f"[ERROR] {img_path}: {str(e)}")
        continue

print("Done.")


2497 images already processed.
3 new images to process.


100%|██████████| 3/3 [00:17<00:00,  5.75s/it]

Done.





In [2]:
import os
import csv
import shutil

# Base directory containing the per-split folders S1, S2, ...
base_dir = "/home/jinesh14/CourseWork/VR_P2/dataset_curated"

# Output Sf folder paths
sf_dir = os.path.join(base_dir, "Sf")
sf_images_dir = os.path.join(sf_dir, "Sf_images")
os.makedirs(sf_images_dir, exist_ok=True)

# Output QA CSV file
sf_csv_path = os.path.join(sf_dir, "Sf_qa_data.csv")

# Basenames already present in Sf_images, to avoid copying an image twice.
# NOTE(review): images are keyed by basename only, so two different source
# images sharing a basename would be conflated - confirm names are unique
# across splits.
copied_images = set()

# Source paths that are missing or could not be copied; their rows are skipped
# and the problem is reported only once per path.
failed_sources = set()

with open(sf_csv_path, 'w', newline='', encoding='utf-8') as out_csv:
    writer = csv.writer(out_csv)

    # Loop over S1 to S9; splits without a QA file are skipped.
    for i in range(1, 10):
        si_dir = os.path.join(base_dir, f"S{i}")
        si_csv = os.path.join(si_dir, f"S{i}_qa_data.csv")

        if not os.path.exists(si_csv):
            continue  # skip missing files

        with open(si_csv, 'r', newline='', encoding='utf-8') as in_csv:
            for row in csv.reader(in_csv):
                if len(row) != 3:
                    continue  # skip malformed rows
                image_path, question, answer = row
                if not answer.strip():
                    continue  # skip rows without an answer
                if image_path in failed_sources:
                    continue  # image unavailable, already reported

                filename = os.path.basename(image_path)

                if filename not in copied_images:
                    if not os.path.isfile(image_path):
                        print(f"Missing source image, skipping its rows: {image_path}")
                        failed_sources.add(image_path)
                        continue
                    try:
                        shutil.copy(image_path, os.path.join(sf_images_dir, filename))
                    except OSError as e:
                        print(f"Failed to copy {image_path}: {e}")
                        failed_sources.add(image_path)
                        continue
                    copied_images.add(filename)

                # Written only once the image is known to be in Sf_images, so
                # every row of the merged CSV points at an existing file.
                new_image_path = os.path.join("Sf", "Sf_images", filename)
                writer.writerow([new_image_path, question, answer])


Some answers may come back as mixed strings rather than single words. For example, instead of "Red", the answer generated by the Gemini API might be "Red/Maroon", "Red(Crimson red)", etc., which does not match our objective of single-word answers. We therefore first count such mixed strings and then fix them.

In [7]:
import csv

# Path to Sf_qa_data.csv
csv_path = '/home/jinesh14/CourseWork/VR_P2/dataset_curated/Sf/Sf_qa_data.csv'


def _is_mixed(answer):
    """Tell whether `answer` has more than one word or contains one of ( ) / , ;"""
    return len(answer.split()) > 1 or any(ch in answer for ch in '()/,;')


# Collect every well-formed (3-column) row whose answer is not a single word.
with open(csv_path, 'r') as handle:
    mixed_entries = [
        record
        for record in csv.reader(handle)
        if len(record) == 3 and _is_mixed(record[2].strip())
    ]
mixed_count = len(mixed_entries)

print(f"Total mixed answers: {mixed_count}\n")
print("Examples of mixed entries:")
for sample in mixed_entries[:10]:  # preview of the first 10
    print(sample)


Total mixed answers: 309

Examples of mixed entries:
['Sf/Sf_images/fc5dc3b7.jpg', "What is the case's texture?", 'Wooden/Printed']
['Sf/Sf_images/0c18d3f9.jpg', "What is visible in the safe's interior?", 'Grey/Black (or similar)']
['Sf/Sf_images/fd4a76c9.jpg', 'What is the case material?', 'Plastic (']
['Sf/Sf_images/bd704d9b.jpg', "What is the case's material?", 'Plastic  *(Note']
['Sf/Sf_images/d330c87a.jpg', 'What brand is featured on the case?', 'Numark (']
['Sf/Sf_images/f795a677.jpg', 'What is the pattern design?', 'Chevron/Zigzag']
['Sf/Sf_images/83b553e8.jpg', 'What is the predominant color scheme?', 'Multicolored (or Colors)']
['Sf/Sf_images/a6038145.jpg', 'What material is the rug?', 'Jute (or similar natural fiber)']
['Sf/Sf_images/444b4e8d.jpg', 'What is the number at the top of the clock?', 'Twelve (or 12)']
['Sf/Sf_images/b90981e2.jpg', 'What material are the hands?', 'Metal (Note']


Fixing the mixed strings and writing the final cleaned version to Sf/Sf_qa_data_cleaned.csv

In [8]:
import csv
import os

# Paths
# CSV to read: the merged Sf QA file.
csv_input_path = '/home/jinesh14/CourseWork/VR_P2/dataset_curated/Sf/Sf_qa_data.csv'
# CSV to write: the cleaned copy of that file.
csv_output_path = '/home/jinesh14/CourseWork/VR_P2/dataset_curated/Sf/Sf_qa_data_cleaned.csv'

def extract_first_atomic(answer):
    """Reduce a possibly compound answer to a single word.

    The answer is cut at the first occurrence of any of the separators
    '(', '/', ',' or ';', and the first whitespace-delimited word of what
    remains is returned, e.g. 'Grey/Black (or similar)' -> 'Grey'.

    If nothing usable precedes the first separator (e.g. '(Red)' or '/Blue'),
    the first word found anywhere in the answer is returned instead of raising
    IndexError. An answer with no word at all yields ''.

    Args:
        answer: Raw answer string.

    Returns:
        The first atomic word of the answer, or '' if there is none.
    """
    separators = '(/,;'

    head = answer
    for sep in separators:
        head = head.split(sep, 1)[0]
    words = head.split()
    if words:
        return words[0]

    # Fallback: treat every separator (and the closing parenthesis) as
    # whitespace and take the first remaining word.
    for ch in separators + ')':
        answer = answer.replace(ch, ' ')
    words = answer.split()
    return words[0] if words else ''

# Copy the QA file row by row, replacing each answer with its single-word form.
# Both files are opened with newline='' (as the csv module requires, so that
# quoted fields containing line breaks survive) and an explicit UTF-8 encoding.
with open(csv_input_path, 'r', newline='', encoding='utf-8') as infile, \
        open(csv_output_path, 'w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    for row in reader:
        if len(row) != 3:
            continue  # skip malformed
        image_path, question, answer = row
        cleaned_answer = extract_first_atomic(answer.strip())
        writer.writerow([image_path, question, cleaned_answer])

print(f"Cleaned CSV written to: {csv_output_path}")


Cleaned CSV written to: /home/jinesh14/CourseWork/VR_P2/dataset_curated/Sf/Sf_qa_data_cleaned.csv


Making a smaller version of Sf_qa_data_cleaned.csv by keeping only the first 3 question-answer pairs per image

In [9]:
import pandas as pd
import os

# Input and output file paths
input_csv = "/home/jinesh14/CourseWork/VR_P2/dataset_curated/Sf/Sf_qa_data_cleaned.csv"
output_csv = "/home/jinesh14/CourseWork/VR_P2/dataset_curated/Sf/Sf_qa_data_trimmed.csv"

# Load the cleaned CSV (no header in file).
# Every column is read as plain text with NA detection switched off: with the
# pandas defaults, answers such as "None", "NA" or "null" would be parsed as
# missing values and written back as empty strings.
df = pd.read_csv(
    input_csv,
    header=None,
    names=["img_path", "question", "answer"],
    dtype=str,
    keep_default_na=False,
)

# Keep only the first 3 questions per image (file order is preserved)
df_trimmed = df.groupby("img_path").head(3)

# Write the trimmed data to output CSV (no header or index)
df_trimmed.to_csv(output_csv, index=False, header=False)

# Print summary
print(f"Trimmed CSV created at: {output_csv}")
print(f"Total images retained: {df_trimmed['img_path'].nunique()}")
print(f"Total QA pairs written: {len(df_trimmed)}")


Trimmed CSV created at: /home/jinesh14/CourseWork/VR_P2/dataset_curated/Sf/Sf_qa_data_trimmed.csv
Total images retained: 19952
Total QA pairs written: 59856


Making an 80-20 train-test split on the trimmed dataset

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the trimmed dataset (no header in file).
# Every column is read as plain text with NA detection switched off: with the
# pandas defaults, answers such as "None", "NA" or "null" would be parsed as
# missing values and written back as empty strings.
input_file = "/home/jinesh14/CourseWork/VR_P2/dataset_curated/Sf/Sf_qa_data_trimmed.csv"
df = pd.read_csv(
    input_file,
    header=None,
    names=["image_path", "question", "answer"],
    dtype=str,
    keep_default_na=False,
)

# Perform 80:20 train-test split (fixed seed, so the split is reproducible).
# NOTE(review): the split is made per QA row, so rows of the same image can
# land in both train and test - confirm this is intended.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

# Save to separate files (header-less, relative to the current working directory)
train_df.to_csv("Sf_qa_data_trimmed_train_r.csv", index=False, header=False)
test_df.to_csv("Sf_qa_data_trimmed_test_r.csv", index=False, header=False)

print("Split completed and files saved.")


Split completed and files saved.
