In [5]:
import os
import gzip
import json
import pandas as pd
from glob import glob

# 1. Load images.csv into dictionary for quick lookup
images_df = pd.read_csv("/home/jinesh14/CourseWork/VR_P2/dataset/images/metadata/images.csv")  # adjust path if needed
image_id_to_path = dict(zip(images_df["image_id"], images_df["path"]))

# 2. Target language codes
target_langs = {"en_IN", "en_US", "en_CA", "en_GB", "en_SG", "en_AU"}

# 3. Desired metadata keys
desired_keys = [
    "bullet_point", "color", "color_code", "fabric_type", "item_name", "item_shape", "material",
    "pattern", "product_description", "product_type", "style"
]

# 4. Process all listings JSON files
listing_files = sorted(glob("/home/jinesh14/CourseWork/VR_P2/dataset/abo-listings/listings/metadata/listings_*.json*"))  # adjust path as needed

# 5. Open output file
with open("/home/jinesh14/CourseWork/VR_P2/dataset_curated/shortlisted_listings.jsonl", "w", encoding="utf-8") as outfile:
    for file_path in listing_files:
        open_fn = gzip.open if file_path.endswith(".gz") else open
        with open_fn(file_path, "rt", encoding="utf-8") as f:
            for line in f:
                try:
                    data = json.loads(line)
                    
                    # Check main language from one field
                    language = None
                    for field in ["item_name", "item_keywords", "bullet_point"]:
                        if field in data:
                            for entry in data[field]:
                                if isinstance(entry, dict) and entry.get("language_tag") in target_langs:
                                    language = entry["language_tag"]
                                    break
                        if language:
                            break
                    
                    if not language:
                        continue
                    
                    # Get image_id and map to image_path
                    image_id = data.get("main_image_id")
                    if not image_id or image_id not in image_id_to_path:
                        continue
                    
                    entry = {"image_path": image_id_to_path[image_id]}

                    # Extract selected fields
                    for key in desired_keys:
                        value = data.get(key)
                        if isinstance(value, list):
                            # Only keep entries with matching language
                            simplified = [
                                d["value"] for d in value
                                if isinstance(d, dict) and d.get("language_tag") == language and "value" in d
                            ]
                            if simplified:
                                entry[key] = simplified
                        elif isinstance(value, str) or isinstance(value, list):
                            entry[key] = value


                    # Write to output
                    json.dump(entry, outfile)
                    outfile.write("\n")

                except Exception as e:
                    # Optional: print(f"Error processing line: {e}")
                    continue


Checking unique images in filtered_listings as theres is a possibility that some may be repeated

In [6]:
import json

seen_paths = set()
duplicate_paths = set()

with open('/home/jinesh14/CourseWork/VR_P2/dataset_curated/shortlisted_listings.jsonl', 'r') as f:
    for line in f:
        data = json.loads(line)
        image_path = data.get("image_path")
        if image_path:
            if image_path in seen_paths:
                duplicate_paths.add(image_path)
            else:
                seen_paths.add(image_path)

print(f"Number of duplicated image paths: {len(duplicate_paths)}")
print("Duplicated image paths:")
for path in duplicate_paths:
    print(path)


Number of duplicated image paths: 7316
Duplicated image paths:
d6/d6bf840d.jpg
83/833bef8a.jpg
c5/c500b43b.jpg
a7/a780d49f.jpg
f3/f3849473.jpg
26/26aa104c.jpg
d5/d51d8d1c.jpg
fd/fdfc1d9b.jpg
d4/d4583289.jpg
65/65b9c698.jpg
99/99c247c5.jpg
ca/ca2d1758.jpg
13/1355e883.jpg
91/9148a0d2.jpg
27/271c4a7e.jpg
0e/0e6237e8.jpg
de/dece957a.jpg
5e/5efdd727.jpg
74/74a66f81.jpg
5b/5bb2fbce.jpg
81/814220a2.jpg
5a/5acd599a.jpg
47/47172761.jpg
ff/ff3af5f6.jpg
82/829ec2ec.jpg
60/606ca45b.jpg
37/37bb6b5a.jpg
ca/ca79fa9b.jpg
ec/ec02adca.jpg
65/6544d1ab.jpg
4b/4bec7b82.jpg
21/21235b33.jpg
a2/a2c223e6.jpg
6a/6a4c1730.jpg
c9/c9c957ce.jpg
00/002181be.jpg
92/92360678.jpg
1a/1a84ad00.jpg
34/3440a7e9.jpg
41/410123eb.jpg
32/32eb57a1.jpg
fb/fb424128.jpg
ca/cabadaec.jpg
7d/7d880f39.jpg
d7/d7117b98.jpg
dc/dc23a0a6.jpg
f7/f755af71.jpg
9a/9a1149b9.jpg
26/265b2bd0.jpg
0a/0a58857f.jpg
52/52ba0f52.jpg
c3/c38d3d03.jpg
58/588a0648.jpg
ab/ab8b7247.jpg
74/7439ff6e.jpg
e2/e2275b15.jpg
57/57afb5f3.jpg
fa/fa21b326.jpg
5d/5d2b88

We can see there are repetitions. So we will filter out them to ensure we have only unique entries in filtered_listings. Incase of multiple entries with same image_path we keep that entry which has more number of attribute (hence more context)

In [7]:
import json

input_path = '/home/jinesh14/CourseWork/VR_P2/dataset_curated/shortlisted_listings.jsonl'
output_path = '/home/jinesh14/CourseWork/VR_P2/dataset_curated/filtered_listings.jsonl'

unique_by_image = {}

with open(input_path, 'r', encoding='utf-8') as infile:
    for line in infile:
        line = line.strip()
        if not line:
            continue
        try:
            entry = json.loads(line)
            image_path = entry.get("image_path")
            if not image_path:
                continue

            # If image_path not seen yet OR current entry has more fields
            if image_path not in unique_by_image or len(entry) > len(unique_by_image[image_path]):
                unique_by_image[image_path] = entry
        except json.JSONDecodeError as e:
            print(f"Skipping invalid JSON line: {e}")

with open(output_path, 'w', encoding='utf-8') as outfile:
    for entry in unique_by_image.values():
        json.dump(entry, outfile, ensure_ascii=False)
        outfile.write('\n')

print(f"Filtered to {len(unique_by_image)} unique entries by image_path.")


Filtered to 107554 unique entries by image_path.


Now splits, such that each split has atleast one item from each of the unique 576 product_types.
Number of entries in each split = split_size = 2500
Splits are disjoint

In [8]:
import os
import json
import random
import shutil
from collections import defaultdict
from pathlib import Path

# ---- CONFIG ----
split_names = ['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9']
split_size = 2500
target_product_types = 576
parent_image_dir = "/home/jinesh14/CourseWork/VR_P2/dataset/images/small/"  # <-- UPDATE this to your root image folder
output_base_dir = "/home/jinesh14/CourseWork/VR_P2/dataset_curated"  # where A/, B/, C/ folders go

# Assume filtered_listings is already loaded
# e.g., filtered_listings = json.load(open("filtered_listings.json"))

filtered_listings = []
with open("/home/jinesh14/CourseWork/VR_P2/dataset_curated/filtered_listings.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        filtered_listings.append(json.loads(line))



# Step 1: Organize entries by product_type
by_product_type = defaultdict(list)
for entry in filtered_listings:
    pt = entry.get("product_type")
    if pt:
        by_product_type[pt].append(entry)

# Step 2: Create splits
splits = {name: [] for name in split_names}
used_indices = set()

for name in split_names:
    current_split = []

    # Step 2a: Ensure each product_type appears once
    for pt, entries in by_product_type.items():
        entry = random.choice(entries)
        current_split.append(entry)
        used_indices.add(filtered_listings.index(entry))

    # Step 2b: Fill the rest randomly from remaining pool
    remaining = [e for i, e in enumerate(filtered_listings) if i not in used_indices]
    needed = split_size - len(current_split)
    sampled = random.sample(remaining, needed)
    current_split.extend(sampled)
    used_indices.update(filtered_listings.index(e) for e in sampled)
    splits[name] = current_split

# Step 3: Write to folders
for split_name, entries in splits.items():
    split_dir = os.path.join(output_base_dir, split_name)
    image_dir = os.path.join(split_dir, f"{split_name}_images")
    metadata_dir = os.path.join(split_dir, f"{split_name}_metadata")
    
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(metadata_dir, exist_ok=True)

    # Write metadata and copy images
    metadata_output = []
    for entry in entries:
        original_path = entry["image_path"]
        #print(f'img: {original_path}')
        filename = os.path.basename(original_path)
        new_image_path = os.path.join(image_dir, filename)
        full_source_path = os.path.join(parent_image_dir, original_path)

        # Update image path in metadata entry
        entry["image_path"] = str(new_image_path)
        metadata_output.append(entry)

        try:
            shutil.copy(full_source_path, new_image_path)
        except Exception as e:
            print(f"Failed to copy {full_source_path}: {e}")

    # Save metadata
    with open(os.path.join(metadata_dir, f"{split_name}_metadata.json"), "w", encoding='utf-8') as f:
        json.dump(metadata_output, f, indent=2, ensure_ascii=False)
