In [1]:
!pip install -qqq sentence-transformers
!pip install -qqq transformers
!pip install -qqq open_clip_pytorch
!pip install -qqq faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m0:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m86.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: Could 

In [2]:
# ─── 1) IMPORTS & CONFIG ───────────────────────────────────────────────────────
import os, json, torch, faiss
import numpy as np, pandas as pd
from PIL import Image
from tqdm.auto import tqdm
from torchvision import transforms
from transformers import (
    CLIPProcessor, CLIPModel,
    BlipProcessor,Blip2Processor, BlipForConditionalGeneration, Blip2ForConditionalGeneration,
    AutoTokenizer, AutoModelForCausalLM ,AutoProcessor , pipeline,AutoModelForSeq2SeqLM
)
from transformers import BlipProcessor, BlipForConditionalGeneration
from sentence_transformers import SentenceTransformer
import regex as re
from PIL import Image

DEVICE     = torch.device("cuda" if torch.cuda.is_available() else "cpu")
IMG_DIR    = "/kaggle/input/fashion-product-images-dataset/fashion-dataset/images/"
styles_csv = "/kaggle/input/fashion-product-images-dataset/fashion-dataset/styles.csv"
K          = 10  # top-K for both similarity & complementary

# The Gemini API key is read from the GOOGLE_API_KEY environment variable and
# must never be written into the notebook itself.
# SECURITY: a literal key used to be hardcoded here; it has been shared with
# everyone who can read this notebook, so it must be revoked and replaced.
import google.generativeai as genai

_gemini_api_key = os.environ.get("GOOGLE_API_KEY", "").strip()
if not _gemini_api_key:
    # Fail early with an actionable message instead of a late API error.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in the environment (or load it "
        "from your platform's secret store into os.environ) before running "
        "this notebook."
    )
genai.configure(api_key=_gemini_api_key)

# ─── 2) LOAD & PREPARE DF ───────────────────────────────────────────────────────
# Every column is read as a string; rows the parser cannot handle are skipped.
df = pd.read_csv(styles_csv, dtype=str, engine="python", on_bad_lines="skip")

# Each product image is stored as "<id>.jpg" inside IMG_DIR.
df["image_path"] = IMG_DIR + df["id"].astype(str) + ".jpg"

# Text that represents a product: "<usage> | <productDisplayName>".
order = ["usage", "productDisplayName"]
df["text"] = df[order].astype(str).agg(" | ".join, axis=1)

# ─── 3) SETUP MODELS ────────────────────────────────────────────────────────────
# Models are cast to half precision (fp16) only when running on CUDA. DEVICE
# can also be the CPU, where fp16 inference is poorly supported, so the CPU
# fallback keeps the default float32 weights.
_USE_FP16 = DEVICE.type == "cuda"

# 3a) CLIP for visual search (also used for the text embeddings in this notebook)
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(DEVICE)
if _USE_FP16:
    clip_model = clip_model.half()
clip_proc  = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

# 3b) SBERT sentence encoder
# NOTE(review): txt_model is not referenced by the code visible in this
# notebook (text is embedded with CLIP) - confirm before removing it.
txt_model = SentenceTransformer("paraphrase-MiniLM-L3-v2", device=DEVICE)

# 3c) BLIP for image captioning
blip_proc = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(DEVICE)
if _USE_FP16:
    blip_model = blip_model.half()


def embed_image(path):
    """Embed one image with CLIP and return a unit-length float32 row vector.

    Args:
        path: Location of the image file to embed.

    Returns:
        A float32 NumPy array with one row (the L2-normalised CLIP image
        features), or ``None`` when the file is missing or cannot be read as
        an image. A warning is printed in that case so bulk indexing can skip
        the item and continue.
    """
    try:
        with Image.open(path) as handle:
            img = handle.convert("RGB")
    except FileNotFoundError:
        print(f"Warning: Image not found: {path} - skipping")
        return None
    except OSError as err:
        # Corrupt/unidentifiable files must not abort a long indexing run.
        print(f"Warning: Could not read image: {path} ({err}) - skipping")
        return None

    inp = clip_proc(images=img, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        # Upcast before normalising: the model may run in fp16, but FAISS only
        # accepts float32 and the norm is more accurate in full precision.
        emb = clip_model.get_image_features(**inp).float()
        emb = emb / emb.norm(p=2, dim=-1, keepdim=True)
    return emb.cpu().numpy()


def embed_text(text: str) -> np.ndarray:
    """Embed a piece of text with CLIP's text encoder.

    Args:
        text: The string to embed. Input longer than the tokenizer's maximum
            length is truncated instead of raising inside the model.

    Returns:
        A float32 array with a single row, L2-normalised in place by FAISS so
        it can be used directly with inner-product indexes.
    """
    inputs = clip_proc(
        text=[text], return_tensors="pt", padding=True, truncation=True
    ).to(DEVICE)
    with torch.no_grad():
        txt_feats = clip_model.get_text_features(**inputs)
    # FAISS requires float32 regardless of the precision the model ran in.
    arr = txt_feats.cpu().numpy().astype("float32")
    faiss.normalize_L2(arr)
    return arr

def generate_caption(image_path: str):
    """Describe the image stored at ``image_path`` using the BLIP captioner.

    The image is loaded as RGB, preprocessed with ``blip_proc``, and decoded
    from at most 64 newly generated tokens with special tokens removed.
    """
    rgb = Image.open(image_path).convert("RGB")
    batch = blip_proc(images=rgb, return_tensors="pt").to(DEVICE)
    token_ids = blip_model.generate(**batch, max_new_tokens=64)
    return blip_proc.decode(token_ids[0], skip_special_tokens=True)

def generate_with_gemini(prompt: str) -> str:
    """Send ``prompt`` to the "gemini-1.5-pro" model and return its reply.

    The reply's ``text`` attribute is returned when the response object has
    one; otherwise the string form of the whole response is returned.
    """
    reply = genai.GenerativeModel("gemini-1.5-pro").generate_content(prompt)
    if not hasattr(reply, 'text'):
        return str(reply)
    return reply.text

def ask_complements_local(caption, user_prompt, k=K):
    """Ask Gemini for complementary items and return them as numbered strings.

    Args:
        caption: Description of the product (may be an empty string).
        user_prompt: What the customer asked for (may be an empty string).
        k: Maximum number of items kept from the model's reply.

    Returns:
        A list of strings of the form "<n>. <item>", built from the non-empty
        '//'-separated pieces of the reply, numbered from 1.
    """
    prompt = "".join((
        f"You are a professional fashion stylist.",
        f"\nYou are given a product: \"{caption}\".",
        f"\nCustomer said: \"{user_prompt}\".",
        f"\nList exactly 5 complementary and matching  items for this.",
        f"\nEach item MUST follow this format strictly:",
        f"\nCategory: <category>; Article Type: <article_type>; Color/Style: <color_or_style>; Usage: <usage>",
        f"\nSeparate each item with '//' on a single line.",
        f"\nDO NOT include any explanations or extra text. Only output the 5 formatted items.",
        f"\nDo not give same item as shown in the image in the recommendations.",
    ))
    reply = generate_with_gemini(prompt)

    # Keep the non-blank '//'-delimited pieces, capped at k.
    pieces = []
    for raw in reply.split('//'):
        cleaned = raw.strip()
        if cleaned:
            pieces.append(cleaned)
    pieces = pieces[:k]

    return [f"{pos}. {text}" for pos, text in enumerate(pieces, start=1)]

# ─── 4) BUILD INDEXES (first N catalogue rows, cached on disk) ─────────────────
# Embeds the first N rows of the catalogue, drops rows whose image could not be
# embedded, and caches the embeddings plus the filtered dataframe in the
# working directory so later runs skip the expensive embedding pass.
# NOTE: the cache is reused whenever all four files exist, regardless of the
# current value of N - delete the files to rebuild with a different N.

N = 40000  # number of catalogue rows to embed when no cache is present

_CACHE_FILES = ("img_embs.npy", "txt_embs.npy", "filtered_df.csv", "valid_indices.npy")

if all(os.path.exists(cache_file) for cache_file in _CACHE_FILES):
    print("Loading cached embeddings and metadata...")
    img_embs = np.load("img_embs.npy")
    txt_embs = np.load("txt_embs.npy")
    df = pd.read_csv("filtered_df.csv", dtype=str)
    valid_indices = np.load("valid_indices.npy")
    N = len(df)
    print(f"Loaded {N} valid embeddings")

else:
    print("Building embeddings from scratch...")
    img_embs = []
    valid_indices = []

    # Positional index is recorded so the dataframe can be filtered with iloc.
    for idx, path in enumerate(tqdm(df["image_path"][:N], desc="ImgEmb")):
        emb = embed_image(path)
        if emb is not None:
            img_embs.append(emb)
            valid_indices.append(idx)

    if not img_embs:
        raise ValueError("No valid images found to process")

    # Store float32 so the cache is directly usable by FAISS, whatever
    # precision the model ran in.
    img_embs = np.vstack(img_embs).astype("float32")
    valid_indices = np.array(valid_indices)

    # Filter dataframe to match valid images
    df_filtered = df.iloc[valid_indices].reset_index(drop=True)

    # Generate text embeddings for filtered dataframe
    txt_embs = np.vstack(
        [embed_text(meta) for meta in tqdm(df_filtered["text"], desc="TxtEmb")]
    )

    # Save everything for future use
    np.save("img_embs.npy", img_embs)
    np.save("txt_embs.npy", txt_embs)
    np.save("valid_indices.npy", valid_indices)
    df_filtered.to_csv("filtered_df.csv", index=False)

    # Update working dataframe
    df = df_filtered
    N = len(df)
    print(f"Saved {N} valid embeddings and metadata")

# FAISS needs C-contiguous float32 input (an older cache may hold float16).
img_embs = np.ascontiguousarray(img_embs, dtype="float32")
txt_embs = np.ascontiguousarray(txt_embs, dtype="float32")

# Row i of both embedding matrices must describe row i of df; a mismatched
# cache would silently return the wrong products.
if not (len(img_embs) == len(txt_embs) == len(df)):
    raise ValueError(
        f"Inconsistent embedding cache: {len(img_embs)} image embeddings, "
        f"{len(txt_embs)} text embeddings, {len(df)} dataframe rows. "
        f"Delete {', '.join(_CACHE_FILES)} and re-run."
    )

# Fused index: [image | text] vectors, re-normalised so inner product is cosine.
fused_embs = np.concatenate([img_embs, txt_embs], axis=1).astype("float32")
faiss.normalize_L2(fused_embs)
sim_index = faiss.IndexFlatIP(fused_embs.shape[1])
sim_index.add(fused_embs)

# Text-only index over the catalogue text embeddings.
txt_index = faiss.IndexFlatIP(txt_embs.shape[1])
txt_index.add(txt_embs)

print("Indexes built successfully!")
print(f"Image embeddings shape: {img_embs.shape}")
print(f"Text embeddings shape: {txt_embs.shape}")
print(f"Dataframe shape: {df.shape}")

def _top_k_frame(index, query, score_col, k):
    """Search ``index`` with ``query`` and return the matching rows of ``df`` plus scores."""
    scores, rows = index.search(query, k)
    # FAISS pads with -1 when the index holds fewer than k vectors; iloc would
    # silently map -1 to the last row, so drop those slots.
    found = rows[0] >= 0
    hits = df.iloc[rows[0][found]][["id", "text"]].copy()
    hits[score_col] = scores[0][found]
    return hits


def recommend(img=None, prompt=None):
    """Print and save similar products and complementary recommendations.

    Args:
        img: Path of a query image, or ``None`` for a text-only query.
        prompt: Free-text request, or ``None``/blank for an image-only query.

    Side effects:
        Prints the result tables and writes the matching product ids to
        ``/kaggle/working/SimilarProdId.csv`` and
        ``/kaggle/working/RecommendationProdId.csv``.

    Returns:
        ``None``. When neither input is given, the query image cannot be
        embedded, or the stylist model returns no items, a message is printed
        and the function returns early.
    """
    has_img = img is not None
    # A blank prompt carries no information, so treat it like a missing one.
    has_txt = prompt is not None and str(prompt).strip() != ""

    if not has_img and not has_txt:
        print("Error: Both image and prompt are missing.")
        return

    # ── A) Visual or Text Similarity ─────────────────────────
    if has_img:
        img_emb = embed_image(img)
        if img_emb is None:
            print(f"Error: could not load query image: {img}")
            return
        # FAISS only accepts contiguous float32 arrays.
        img_emb = np.ascontiguousarray(img_emb, dtype="float32")
        faiss.normalize_L2(img_emb)

        if has_txt:
            txt_part = embed_text(prompt)  # already float32 and L2-normalised
        else:
            # sim_index stores [image | text] vectors. For an image-only query
            # the text half is zero-filled so the query has the index's
            # dimension and the ranking depends on the image half alone.
            txt_part = np.zeros(
                (img_emb.shape[0], sim_index.d - img_emb.shape[1]), dtype="float32"
            )
        qv = np.concatenate([img_emb, txt_part], axis=1).astype("float32")

        sim_df = _top_k_frame(sim_index, qv, "score_img", K)
        print("Top visually similar:")
        print(sim_df)
        sim_df[["id"]].to_csv("/kaggle/working/SimilarProdId.csv", index=False)

    else:
        sim_df = _top_k_frame(txt_index, embed_text(prompt), "score_txt", K)
        print("Top textually similar:")
        print(sim_df)
        sim_df[["id"]].to_csv("/kaggle/working/SimilarProdId.csv", index=False)

    # ── B) Caption + Complementary Retrieval ─────────────
    caption = generate_caption(img) if has_img else ""
    if caption:
        print("\nBLIP caption:", caption)

    cats = ask_complements_local(caption, prompt if has_txt else "")
    print("\nStylist categories:", cats)

    if not cats:
        # pd.concat raises on an empty list; nothing to recommend anyway.
        print("No stylist categories returned; skipping complementary recommendations.")
        return

    # One text search per suggested item, then merge and de-duplicate by id,
    # keeping each product's best score.
    cand = [_top_k_frame(txt_index, embed_text(cat), "score_txt", K) for cat in cats]
    all_rec = pd.concat(cand, ignore_index=True)
    rec_df = (
        all_rec
        .sort_values("score_txt", ascending=False)
        .drop_duplicates(subset="id", keep="first")
        .head(K)
    )

    print("\nTop complementary recommendations (unique):")
    print(rec_df)
    rec_df[["id"]].to_csv("/kaggle/working/RecommendationProdId.csv", index=False)

# ─── 6) RUN EXAMPLE ────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Text-only demo query: no image is supplied.
    prompt = "blue tshirt"
    recommend(img=None, prompt=prompt)

2025-06-04 12:18:13.931458: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749039494.174150      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749039494.255201      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Building embeddings from scratch...


ImgEmb:   0%|          | 0/40000 [00:00<?, ?it/s]



TxtEmb:   0%|          | 0/39996 [00:00<?, ?it/s]

Saved 39996 valid embeddings and metadata
Indexes built successfully!
Image embeddings shape: (39996, 512)
Text embeddings shape: (39996, 512)
Dataframe shape: (39996, 12)
Top textually similar:
          id                              text  score_txt
23435  41697       Casual | AND Women Blue Top   0.849656
36025  43650      Casual | ONLY Women Blue Top   0.826557
10532  43647      Casual | ONLY Women Blue Top   0.826557
39507   8554  Casual | Basics Men Blue T-shirt   0.825004
26472  29785  Casual | Basics Men Blue T-shirt   0.825004
13701   9931  Casual | Basics Men Blue T-shirt   0.825004
9097    5718  Casual | Basics Men Blue T-shirt   0.825004
35194  41599    Casual | Basics Men Blue Shirt   0.808958
23656  41608    Casual | Basics Men Blue Shirt   0.808958
10272  41589    Casual | Basics Men Blue Shirt   0.808958

Stylist categories: ['1. Category: Bottoms; Article Type: Jeans; Color/Style: Dark Wash; Usage: Casual', '2. Category: Accessories; Article Type: Necklace; Color/Styl