Person 2) Create a model to run similarity searches on the image directly and find similarities across the web, possibly directly training on some common items or shopping database if possible 


In [7]:
!pip install clip-interrogator==0.6.0          \
            openai-clip                       \
            pillow requests tqdm              \
            aiohttp aiofiles                  \
            faiss-cpu                         \
            python-dotenv




[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
"""
quickshop.py – find visually-similar products on the open web
--------------------------------------------------------------

query.jpg           --->  + CLIP Interrogator  --->  prompt text
                            |
                            +--> Bing Image Search (q=prompt)
                                   |
                                   v
                            thumbnails + product pages
                                   |
                            +--> CLIP embed each thumb
query.jpg  --CLIP embed-->  |       |
                            +-- cosine-sim --> ranked list
"""
import os, io, json, asyncio, textwrap, hashlib
from pathlib import Path
from typing  import List, Tuple

import requests, aiohttp, aiofiles, tqdm, numpy as np, faiss, torch
from PIL import Image
from clip_interrogator import Config, Interrogator         # :contentReference[oaicite:0]{index=0}
import clip                                                # OpenAI CLIP

# ------------------------------------------------------------
# 1)  GLOBALS & MODEL LOAD
# ------------------------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 1-a  CLIP Interrogator (for caption) – uses BLIP-L + ViT-L/14
ci_cfg = Config(clip_model_name="ViT-L/14")
ci_cfg.apply_low_vram_defaults()
ci = Interrogator(ci_cfg)

# 1-b  Plain CLIP encoder (same backbone) for fast batching
clip_model, clip_preproc = clip.load("ViT-L/14", device=DEVICE)

# ------------------------------------------------------------
# 2)  HELPERS
# ------------------------------------------------------------
def caption_image(img_path: Path) -> str:
    """Return the best text prompt for the given image."""
    img = Image.open(img_path).convert("RGB")
    return ci.interrogate(img)

def embed_pil(im: Image.Image) -> np.ndarray:
    """Return L2-normalised CLIP embedding from a PIL image."""
    with torch.no_grad():
        tensor = clip_preproc(im).unsqueeze(0).to(DEVICE)
        vec    = clip_model.encode_image(tensor)
        vec    = vec / vec.norm(dim=-1, keepdim=True)
    return vec.cpu().numpy().astype("float32")

async def fetch_json(session: aiohttp.ClientSession, url: str) -> dict:
    headers = {"Ocp-Apim-Subscription-Key": os.environ["BING_SUBSCRIPTION_KEY"]}
    async with session.get(url, headers=headers) as resp:
        resp.raise_for_status()
        return await resp.json()

async def save_thumb(session: aiohttp.ClientSession,
                     url: str, dst: Path) -> Path:
    try:
        async with session.get(url) as resp:
            resp.raise_for_status()
            data = await resp.read()
        async with aiofiles.open(dst, "wb") as f:
            await f.write(data)
        return dst
    except Exception:
        return None

# ------------------------------------------------------------
# 3)  CORE – SEARCH & RE-RANK
# ------------------------------------------------------------
async def bing_search(prompt: str, top_n: int = 80) -> List[dict]:
    """Return Bing Image Search results for `prompt`."""
    endpoint = os.environ["BING_ENDPOINT"].rstrip("/") + "/v7.0/images/search"
    params   = {"q": prompt, "count": str(top_n), "safeSearch": "Moderate"}
    async with aiohttp.ClientSession() as sess:
        data = await fetch_json(sess, endpoint, params=params)
    return data.get("value", [])

async def download_thumbs(results: List[dict],
                          cache_dir: Path) -> List[Tuple[dict, Path]]:
    out = []
    async with aiohttp.ClientSession() as sess:
        tasks = []
        for r in results:
            url  = r.get("thumbnailUrl") or r.get("contentUrl")
            name = hashlib.md5(url.encode()).hexdigest() + ".jpg"
            dst  = cache_dir / name
            tasks.append(save_thumb(sess, url, dst))
        for r, t in tqdm.tqdm(zip(results, asyncio.as_completed(tasks)),
                              total=len(tasks), desc="dl thumbs"):
            img_path = await t
            if img_path:
                out.append((r, img_path))
    return out

def rank_by_clip(query_vec: np.ndarray, imgs: List[Tuple[dict, Path]],
                 top_k: int = 10) -> List[Tuple[float, dict]]:
    dim     = query_vec.shape[1]
    index   = faiss.IndexFlatIP(dim)
    vectors = []
    meta    = []
    for r, p in imgs:
        try:
            vec = embed_pil(Image.open(p))
            vectors.append(vec)
            meta.append(r)
        except Exception:
            continue
    if not vectors:
        return []
    x = np.vstack(vectors)
    index.add(x)
    D, I = index.search(query_vec, top_k)
    return [(float(D[0][i]), meta[I[0][i]]) for i in range(len(I[0]))]

# ------------------------------------------------------------
# 4)  DRIVER
# ------------------------------------------------------------
async def find_similar_products(query_img: str,
                                k: int = 10,
                                thumb_cache: str = ".thumbs") -> None:
    q_path = Path(query_img)
    print("🔍 Interrogating image…")
    prompt = caption_image(q_path)
    print("📝 Prompt:", prompt)

    print("🔎 Querying Bing Image Search…")
    results = await bing_search(prompt, top_n=100)

    cache_dir = Path(thumb_cache); cache_dir.mkdir(exist_ok=True)
    print(f"📥 Downloading thumbnails ({len(results)} candidates)…")
    img_info = await download_thumbs(results, cache_dir)

    print("⚖️ Ranking by visual similarity…")
    q_vec = embed_pil(Image.open(q_path))
    ranked = rank_by_clip(q_vec, img_info, top_k=k)

    print("\n🏆 TOP MATCHES")
    for score, r in ranked:
        print(f"{score:5.3f}  {r['hostPageUrl'][:90]}")

  from .autonotebook import tqdm as notebook_tqdm


Loading caption model blip-base...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
ERROR:root:Model config for ViT-L not found; available models ['coca_base', 'coca_roberta-ViT-B-32', 'coca_ViT-B-32', 'coca_ViT-L-14', 'convnext_base', 'convnext_base_w', 'convnext_base_w_320', 'convnext_large', 'convnext_large_d', 'convnext_large_d_320', 'convnext_small', 'convnext_tiny', 'convnext_xlarge', 'convnext_xxlarge', 'convnext_xxlarge_320', 'EVA01-g-14', 'EVA01-g-14-plus', 'EVA02-B-16', 'EVA02-E-14', 

Loading CLIP model ViT-L/14...


RuntimeError: Model config for ViT-L not found.

In [None]:
import argparse, asyncio
a = argparse.ArgumentParser(description="open-web product finder")
a.add_argument("image", help="path to query image")
a.add_argument("-k", "--top_k", type=int, default=10)
args = a.parse_args()
asyncio.run(find_similar_products(args.image, args.top_k))