In [None]:
# Project root for every input and output of this notebook.
# NOTE(review): the /content/drive prefix looks like a Colab Google Drive
# mount — confirm Drive is mounted before running this cell.
BASE = "/content/drive/MyDrive/abid-mvp"
# Raw inputs: images are globbed as *.jpg and metadata as *.json further down.
RAW_IMG  = f"{BASE}/data/raw/bin-images"
RAW_META = f"{BASE}/data/raw/metadata"
# Derived artifacts written by later cells (catalog, subset, train/val CSVs).
PROC     = f"{BASE}/data/processed"
CATALOG  = f"{PROC}/catalog.csv"
SUBSET   = f"{PROC}/subset.csv"

import os, json, pandas as pd, numpy as np
from glob import glob

# Fail fast, naming the first directory that is absent, before any cell
# below tries to read from or write to it.
for required_dir in (RAW_IMG, RAW_META, PROC):
    assert os.path.isdir(required_dir), f"Missing: {required_dir}"
print("✅ Paths OK")


✅ Paths OK


In [None]:
# Index every image by lower-cased basename so metadata files can be matched
# to images case-insensitively. Because the glob is recursive, two images with
# the same basename in different folders collapse to one entry (last one wins).
img_files  = {os.path.basename(p).lower(): p for p in glob(f"{RAW_IMG}/**/*.jpg", recursive=True)}
meta_files = glob(f"{RAW_META}/**/*.json", recursive=True)

# `skipped` counts every metadata file that did not make it into the catalog;
# `unreadable` additionally records the ones that failed to load, so genuine
# data problems are not hidden behind a single counter.
rows, skipped = [], 0
unreadable = []
for fp in meta_files:
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(fp, "r", encoding="utf-8") as f:
            d = json.load(f)
    except (OSError, ValueError) as err:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        unreadable.append((fp, err))
        skipped += 1
        continue

    if not isinstance(d, dict):
        # A "filename" key can only be attached to a JSON object.
        unreadable.append((fp, "top-level JSON value is not an object"))
        skipped += 1
        continue

    # Metadata "<stem>.json" pairs with image "<stem>.jpg" (case-insensitive).
    stem = os.path.splitext(os.path.basename(fp))[0]
    img_path = img_files.get(f"{stem}.jpg".lower())
    if img_path is None:
        skipped += 1
        continue

    # Store the image's real basename (original casing) alongside the metadata.
    d["filename"] = os.path.basename(img_path)
    rows.append(d)

df = pd.DataFrame(rows)
os.makedirs(PROC, exist_ok=True)
df.to_csv(CATALOG, index=False)
print("✅ catalog:", df.shape, "| skipped:", skipped)
if unreadable:
    first_path, first_err = unreadable[0]
    print(f"⚠️ {len(unreadable)} metadata file(s) could not be read, e.g. {first_path}: {first_err}")
display(df.head())


✅ catalog: (999, 3) | skipped: 0


Unnamed: 0,BIN_FCSKU_DATA,EXPECTED_QUANTITY,filename
0,"{'B000C33MI2': {'asin': 'B000C33MI2', 'height'...",12,00001.jpg
1,"{'0307981584': {'asin': '0307981584', 'height'...",17,00002.jpg
2,"{'0307981584': {'asin': '0307981584', 'height'...",16,00003.jpg
3,"{'B003E72M1G': {'asin': 'B003E72M1G', 'height'...",5,00004.jpg
4,"{'B003E72M1G': {'asin': 'B003E72M1G', 'height'...",4,00005.jpg


In [None]:
# Extract 'asin' from the nested dictionary in 'BIN_FCSKU_DATA'
def extract_asin(data):
    """Return the 'asin' of the first nested dict in ``data`` that has one.

    ``data`` is expected to be a dict whose values are themselves dicts.
    Values are scanned in the dict's iteration order and the first value
    that is a dict containing an 'asin' key supplies the result.

    Returns None when ``data`` is not a dict or no value qualifies.
    """
    if not isinstance(data, dict):
        return None
    return next(
        (
            entry['asin']
            for entry in data.values()
            if isinstance(entry, dict) and 'asin' in entry
        ),
        None,
    )

# Column names used throughout the rest of the notebook.
# 'EXPECTED_QUANTITY' is the quantity column coming from the metadata.
ASIN_COL = "asin"
QTY_COL  = "EXPECTED_QUANTITY"
# Number of most frequent ASINs kept in the working subset.
TOP_N_ASINS = 40

# One ASIN per row: the first one found in the nested 'BIN_FCSKU_DATA' dict.
df[ASIN_COL] = df['BIN_FCSKU_DATA'].apply(extract_asin)

# Keep only rows whose image file exists.
df = df[df["filename"].notna()].copy()

if ASIN_COL in df.columns and QTY_COL in df.columns:
    # Also drop rows for which no ASIN could be extracted. This does not
    # change which ASINs are selected below (value_counts ignores missing
    # values), but it keeps `df` free of unlabeled rows.
    df = df[df[ASIN_COL].notna()].copy()

    top_asins = df[ASIN_COL].value_counts().head(TOP_N_ASINS).index
    df_sub = df[df[ASIN_COL].isin(top_asins)].copy()
    df_sub.to_csv(SUBSET, index=False)
    print("✅ subset:", df_sub.shape)
    display(df_sub[ASIN_COL].value_counts().head())
else:
    print("❌ Required columns (asin or EXPECTED_QUANTITY) not found in DataFrame.")
    df_sub = pd.DataFrame() # Create an empty DataFrame if required columns are missing

✅ subset: (99, 4)


Unnamed: 0_level_0,count
asin,Unnamed: 1_level_1
B0033UNIQC,3
B004Q0PN0W,3
B001MKR4Y2,3
B0073E3Q7A,3
B005IZ53CW,3


In [None]:
from ultralytics import YOLO
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity

# Detector used by count_asin_in_image to propose object boxes.
yolo = YOLO("yolov8n.pt")  # small, fast
# Minimum cosine similarity between a crop embedding and an ASIN's reference
# embedding for the crop to be counted as that ASIN.
SCORE_THR = 0.28  # tune later

def count_asin_in_image(img_path, asin):
    """Count detected objects in an image that resemble the given ASIN.

    The image is run through the YOLO detector, every sufficiently large box
    is cropped and embedded, and each crop is compared by cosine similarity
    with the reference embedding(s) stored for ``asin``.

    Args:
        img_path: Path of the image to analyse.
        asin: Key to look up in ``asin_emb``.

    Returns:
        A tuple ``(count, matches)``. ``count`` is the number of crops whose
        similarity is at least ``SCORE_THR``. ``matches`` is a list of
        ``(crop_index, similarity)`` pairs sorted by descending similarity.
        ``crop_index`` refers to the list of kept crops, not to the raw
        detector boxes, because boxes smaller than 10 px are dropped first.
        Returns ``(0, [])`` when ``asin`` has no reference embedding or no
        usable box is detected.

    NOTE(review): relies on ``asin_emb`` and ``embed_image`` being defined
    elsewhere in the notebook; presumably ``asin_emb[asin]`` is a 2-D array
    of reference embeddings and ``embed_image`` returns one embedding per
    crop — confirm against their definitions.
    """
    if asin not in asin_emb:
        return 0, []

    # Close the file handle promptly; convert() returns an independent image.
    with Image.open(img_path) as raw:
        im = raw.convert("RGB")

    # Hand the PIL image to the detector directly. Ultralytics interprets
    # numpy arrays as BGR, so np.array(im) of an RGB image would be read with
    # red and blue swapped.
    res = yolo.predict(source=im, verbose=False, conf=0.1)[0]

    crops = []
    for (x1, y1, x2, y2) in res.boxes.xyxy.cpu().numpy().astype(int):
        # Clamp the box to the image bounds before cropping.
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(im.width, x2), min(im.height, y2)
        # Ignore boxes narrower or shorter than 10 px.
        if x2 - x1 < 10 or y2 - y1 < 10:
            continue
        crops.append(im.crop((x1, y1, x2, y2)))

    if not crops:
        return 0, []

    embs = np.vstack([embed_image(c) for c in crops])
    # One score per crop: its best similarity to any reference row. With a
    # single reference row this equals the flattened similarity matrix; with
    # several rows it keeps the scores aligned with `crops`.
    sims = cosine_similarity(embs, asin_emb[asin]).max(axis=1)
    idx = [i for i, s in enumerate(sims) if s >= SCORE_THR]
    matches = sorted(
        ((int(i), float(sims[i])) for i in idx),
        key=lambda pair: -pair[1],
    )
    return len(idx), matches

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt': 100% ━━━━━━━━━━━━ 6.2MB 76.0MB/s 0.1s


In [None]:
from sklearn.model_selection import train_test_split
# 60/40 train/validation split of the subset, stratified on the ASIN column,
# with a fixed seed so the split is reproducible.
train_df, val_df = train_test_split(
    df_sub,
    test_size=0.4,
    random_state=42,
    stratify=df_sub[ASIN_COL],
)

# Persist both splits next to the other processed artifacts.
val_df.to_csv(f"{PROC}/val.csv", index=False)
train_df.to_csv(f"{PROC}/train.csv", index=False)

print(train_df.shape, val_df.shape)

(59, 4) (40, 4)
