In [5]:
import os
import json

# ─── CONFIG ────────────────────────────────────────────────────────────────────
# Directory containing the listings_*.json files (one JSON object per line).
# Set the ABO_LISTINGS_DIR environment variable to point at another copy of the
# dataset without editing this cell; the original path remains the default.
LISTINGS_DIR = os.environ.get(
    'ABO_LISTINGS_DIR',
    '/home/jinesh14/CourseWork/VR_P2/dataset/abo-listings/listings/metadata',
)
# ────────────────────────────────────────────────────────────────────────────────


def _field_values(entries):
    """Yield the non-empty plain values stored in a metadata field.

    Accepts the shapes this cell has to cope with:
      * a missing/None/empty field       -> yields nothing
      * a single scalar or a single dict -> treated as a one-element list
      * a list of scalars and/or dicts   -> each dict contributes its 'value'

    Falsy values (None, '') are skipped so they never end up in the result sets,
    where a stray None would make the final sorted() call fail.
    """
    if not entries:
        return
    if not isinstance(entries, list):
        entries = [entries]
    for entry in entries:
        if isinstance(entry, dict):
            entry = entry.get('value')
        if entry:
            yield entry


# Sets to accumulate unique values
unique_product_types = set()
unique_color_codes   = set()
unique_styles        = set()

# Iterate over all files listings_0.json … listings_f.json
for fn in sorted(os.listdir(LISTINGS_DIR)):
    if not (fn.startswith("listings_") and fn.endswith(".json")):
        continue

    path = os.path.join(LISTINGS_DIR, fn)
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank (or whitespace-only) line is not valid JSON; skip it
            # instead of letting json.loads abort the whole scan.
            if not line.strip():
                continue
            data = json.loads(line)

            # 1. product_type (string, dict, or list of strings/dicts)
            unique_product_types.update(_field_values(data.get('product_type')))

            # 2. color_code (list of strings); `or []` also covers an explicit null,
            #    which data.get(key, []) would return as None.
            unique_color_codes.update(data.get('color_code') or [])

            # 3. style (list of dicts: extract the 'value' field)
            unique_styles.update(_field_values(data.get('style')))

# Print results
print(f"Unique product_type ({len(unique_product_types)}):")
print(sorted(unique_product_types), end='\n\n')

print(f"Unique color_code ({len(unique_color_codes)}):")
print(sorted(unique_color_codes), end='\n\n')

print(f"Unique style ({len(unique_styles)}):")
print(sorted(unique_styles))


Unique product_type (576):
['ABIS_BEAUTY', 'ABIS_BOOK', 'ABIS_DRUGSTORE', 'ABIS_ELECTRONICS', 'ABIS_HOME_IMPROVEMENT', 'ABIS_KITCHEN', 'ABIS_LAWN_AND_GARDEN', 'ABIS_PET_PRODUCTS', 'ABIS_VIDEO_GAMES', 'ACCESSORY', 'ACCESSORY_OR_PART_OR_SUPPLY', 'AGRICULTURAL_SUPPLIES', 'AIR_COMPRESSOR', 'AIR_CONDITIONER', 'AIR_FRYER', 'AIR_MATTRESS', 'AIR_PUMP', 'AIR_PURIFIER', 'AMAZON_BOOK_READER_ACCESSORY', 'AMAZON_TABLET_ACCESSORY', 'ANIMAL_COLLAR', 'ANIMAL_LITTER', 'ANTENNA', 'AREA_DEODORIZER', 'ARTIFICIAL_PLANT', 'ARTIFICIAL_TREE', 'ART_AND_CRAFT_SUPPLY', 'ASTRINGENT_SUBSTANCE', 'AUDIO_OR_VIDEO', 'AUTO_ACCESSORY', 'AUTO_CHEMICAL', 'AUTO_OIL', 'AUTO_PART', 'AV_FURNITURE', 'AV_RECEIVER', 'BABY_BOTTLE', 'BABY_PRODUCT', 'BACKPACK', 'BADGE_HOLDER', 'BAG', 'BAKEWARE', 'BAKING_CUP', 'BAKING_MIX', 'BAKING_PAN', 'BAKING_PAPER', 'BARBECUE_GRILL', 'BARBELL', 'BASKET', 'BATHWATER_ADDITIVE', 'BATTERY', 'BEAN_BAG_CHAIR', 'BEAUTY', 'BED', 'BED_FRAME', 'BENCH', 'BINOCULAR', 'BISS', 'BLANKET', 'BLANK_MEDIA', 'BLOOD

Exploring which languages are used in the metadata.

In [1]:
import os
import json
from collections import Counter

# ─── CONFIG ────────────────────────────────────────────────────────────────────
# Directory containing the listings_*.json files (one JSON object per line).
# Set the ABO_LISTINGS_DIR environment variable to point at another copy of the
# dataset without editing this cell; the original path remains the default.
LISTINGS_DIR = os.environ.get(
    'ABO_LISTINGS_DIR',
    '/home/jinesh14/CourseWork/VR_P2/dataset/abo-listings/listings/metadata',
)
# ────────────────────────────────────────────────────────────────────────────────

# Fields that can have multilingual entries
multilingual_fields = [
    'brand', 'bullet_point', 'fabric_type', 'finish_type', 'item_keywords',
    'item_name', 'item_shape', 'material', 'model_name', 'model_number',
    'model_year', 'pattern', 'product_description', 'style'
]


def _languages_in_record(record, fields):
    """Return the set of language tags found in `fields` of one listing record.

    Only fields whose value is a list are inspected, and within them only dict
    entries carrying a non-empty 'language_tag'. Everything else is ignored.
    Returning a set means each language is counted at most once per record,
    no matter how many fields use it.
    """
    tags = set()
    for field in fields:
        entries = record.get(field, [])
        if not isinstance(entries, list):
            continue
        for entry in entries:
            if isinstance(entry, dict):
                lang = entry.get('language_tag')
                if lang:
                    tags.add(lang)
    return tags


# Maps language tag -> number of products with at least one field in that language
language_product_counter = Counter()

# Iterate over all listings files
for fn in sorted(os.listdir(LISTINGS_DIR)):
    if not (fn.startswith("listings_") and fn.endswith(".json")):
        continue

    path = os.path.join(LISTINGS_DIR, fn)
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank (or whitespace-only) line is not valid JSON; skip it
            # instead of letting json.loads abort the whole scan.
            if not line.strip():
                continue
            data = json.loads(line)
            languages_used = _languages_in_record(data, multilingual_fields)
            # Counter.update adds 1 for every element of the iterable.
            language_product_counter.update(languages_used)

# Print results
for lang, count in language_product_counter.most_common():
    print(f"{lang}: {count} products")


en_IN: 76443 products
en_US: 26426 products
de_DE: 15100 products
es_US: 12016 products
zh_CN: 11708 products
pt_BR: 9953 products
ko_KR: 8778 products
zh_TW: 8766 products
en_GB: 8147 products
he_IL: 8016 products
hi_IN: 7463 products
en_CA: 6852 products
ar_AE: 6631 products
es_MX: 4915 products
es_ES: 4549 products
fr_FR: 4189 products
it_IT: 3965 products
ja_JP: 3738 products
nl_NL: 3559 products
fr_CA: 2403 products
ml_IN: 2396 products
tr_TR: 2172 products
en_AU: 2149 products
cs_CZ: 1892 products
pl_PL: 1876 products
en_AE: 1561 products
te_IN: 1354 products
en_SG: 1340 products
pt_PT: 1115 products
sv_SE: 1033 products
ta_IN: 593 products
kn_IN: 582 products
mr_IN: 2 products


The metadata analysis shows that English (India), `en_IN`, is by far the most common language tag, appearing in over 76,000 products, followed by English (US), `en_US`, in about 26,000. Our goal is to sample images whose metadata is in one of these two languages, so that the text embeddings learned by our vision-language model are meaningful and are not influenced by unrelated or less common languages.