In [None]:
# Install dependencies with %pip so they land in the environment of the running
# kernel (a shelled-out `!pip` can resolve to a different Python installation).
%pip install -q git+https://github.com/openai/CLIP.git
%pip install -q segment-anything-hq
# scikit-learn and pillow are imported by the code cells below but were missing
# from the original install list.
%pip install -q faiss-cpu opencv-python torch torchvision einops ftfy regex tqdm coloredlogs transformers scikit-learn pillow

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-iqgkuq7a
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-iqgkuq7a
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369490 sha256=2c4830e7f914dd7fd75e756115829c3c2c8426d1cdf0ba7621e4d7a046726d6a
  Stored in directory: /tmp/pip-ephem-wheel-cache-tk2446iv/wheels/35/3e/df/3d24cbfb3b6a06f17

In [None]:
# Create the directory that initialize_models() reads the SAM-HQ checkpoint from.
!mkdir -p sam-hq/pretrained_checkpoint
# Download the SAM-HQ ViT-L weights from Hugging Face to the exact path used by
# initialize_models(); -O writes to that file name, replacing any existing file.
!wget -O sam-hq/pretrained_checkpoint/sam_hq_vit_l.pth https://huggingface.co/lkeab/hq-sam/resolve/main/sam_hq_vit_l.pth

--2026-01-22 02:02:58--  https://huggingface.co/lkeab/hq-sam/resolve/main/sam_hq_vit_l.pth
Resolving huggingface.co (huggingface.co)... 18.239.50.49, 18.239.50.16, 18.239.50.103, ...
Connecting to huggingface.co (huggingface.co)|18.239.50.49|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://us.gcp.cdn.hf.co/xet-bridge-us/6486dc523457cf1120c70b8b/4e77070e24978d365f08f5fa01fb6db43f50eda87cb688f3d5dea189044d6f15?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27sam_hq_vit_l.pth%3B+filename%3D%22sam_hq_vit_l.pth%22%3B&Expires=1769050978&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiRXBvY2hUaW1lIjoxNzY5MDUwOTc4fX0sIlJlc291cmNlIjoiaHR0cHM6Ly91cy5nY3AuY2RuLmhmLmNvL3hldC1icmlkZ2UtdXMvNjQ4NmRjNTIzNDU3Y2YxMTIwYzcwYjhiLzRlNzcwNzBlMjQ5NzhkMzY1ZjA4ZjVmYTAxZmI2ZGI0M2Y1MGVkYTg3Y2I2ODhmM2Q1ZGVhMTg5MDQ0ZDZmMTVcXD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=n0aMPQAJM54dgRx%7E%7E8GcEhUJBv0V7MN0nYIL3iKmuJ9G3Br6keDgVri8ovQucY

In [None]:
import os
import torch
import numpy as np
import cv2
import faiss
from sklearn.metrics.pairwise import cosine_similarity
from segment_anything_hq import sam_model_registry, SamAutomaticMaskGenerator
from PIL import Image
import clip

def load_images_from_folder(folder, max_images=8):
    """Load up to `max_images` images from `folder` as RGB PIL images.

    Only files whose names end in .jpg, .jpeg or .png (case-insensitive) are
    considered. File names are sorted so the selected subset is the same on
    every run (`os.listdir` returns entries in arbitrary order).

    Args:
        folder: Path of the directory to scan.
        max_images: Maximum number of images to load.

    Returns:
        A list of RGB PIL images. The list is empty (and a message is printed)
        when `folder` is not a directory or contains no matching files.
    """
    # isdir rather than exists: a path pointing at a regular file would pass an
    # exists() check and then crash inside os.listdir.
    if not os.path.isdir(folder):
        print(f"Folder not found: {folder}")
        return []
    files = sorted(f for f in os.listdir(folder) if f.lower().endswith(('.jpg', '.jpeg', '.png')))
    if not files:
        print(f"No images in folder: {folder}")
        return []
    print(f"Found {len(files)} images in {folder}")
    images = []
    for name in files[:max_images]:
        # convert() returns a fully loaded, independent image, so the file
        # handle can be released as soon as the with-block exits.
        with Image.open(os.path.join(folder, name)) as img:
            images.append(img.convert("RGB"))
    return images

def initialize_models():
    """Load the CLIP ViT-L/14 model and the SAM-HQ ViT-L model.

    The device is "cuda" when torch reports CUDA as available, otherwise "cpu".
    SAM-HQ weights are read from sam-hq/pretrained_checkpoint/sam_hq_vit_l.pth.

    Returns:
        A tuple (clip_model, clip_preprocess, sam, device).
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using device: {device}")

    clip_model, preprocess = clip.load("ViT-L/14", device=device)

    checkpoint_path = "sam-hq/pretrained_checkpoint/sam_hq_vit_l.pth"
    build_sam = sam_model_registry["vit_l"]
    sam = build_sam(checkpoint=checkpoint_path)
    sam.to(device=device)

    return clip_model, preprocess, sam, device

def get_vector(image, model, transform, device):
    """Embed a single image with `model.encode_image` and return it as NumPy.

    Args:
        image: Input accepted by `transform` (the callers in this notebook
            pass PIL images).
        model: Object exposing `encode_image(tensor)`.
        transform: Callable turning `image` into a tensor without a batch axis.
        device: Device the batched tensor is moved to before encoding.

    Returns:
        The embedding with all singleton dimensions squeezed out, as a NumPy
        array on the CPU.
    """
    batch = transform(image)
    batch = batch.unsqueeze(0)  # add the batch axis
    batch = batch.to(device)
    # Inference only: no autograd graph, so .numpy() below is allowed.
    with torch.no_grad():
        features = model.encode_image(batch)
    features = features.squeeze()
    return features.cpu().numpy()

def extract_features_from_masks(image_np, masks, model, transform, device):
    """Embed every masked region of an image.

    For each entry in `masks`, the pixels selected by its 'segmentation'
    array are copied onto an all-zero canvas of the same shape as `image_np`,
    and that canvas is embedded with `get_vector`.

    Args:
        image_np: Image as a NumPy array.
        masks: Iterable of dicts with a 'segmentation' entry usable as an
            index into `image_np` (presumably a boolean H x W array; confirm
            against the mask generator).
        model, transform, device: Forwarded to `get_vector`.

    Returns:
        A NumPy array stacking one embedding per mask, in input order.
    """
    def _embed_region(region_mask):
        # Keep only the masked pixels; everything else stays zero.
        isolated = np.zeros_like(image_np)
        isolated[region_mask] = image_np[region_mask]
        return get_vector(Image.fromarray(isolated), model, transform, device)

    return np.array([_embed_region(entry['segmentation']) for entry in masks])

def calculate_attention_weights_softmax(query, examples):
    """Softmax weights over the cosine similarity of `query` to each example.

    Args:
        query: Query vector; it is reshaped to a single row before comparison.
        examples: Sequence/array of example vectors, one per row.

    Returns:
        A 1-D array of weights summing to 1, one per example. When `examples`
        is empty the single-element array [1.0] is returned instead.
    """
    if len(examples) > 0:
        scores = cosine_similarity(query.reshape(1, -1), examples).flatten()
        # Shift by the maximum before exponentiating for numerical stability.
        unnormalised = np.exp(scores - np.max(scores))
        return unnormalised / unnormalised.sum()
    return np.array([1.0])

def adjust_embedding(query, pos_emb, neg_emb):
    """Return the attention-weighted positive mean minus the negative mean.

    Each bank of embeddings is averaged with the softmax weights produced by
    `calculate_attention_weights_softmax(query, bank)`; an empty bank
    contributes a zero vector shaped like `query`.

    Args:
        query: Query embedding.
        pos_emb: Array of positive embeddings, one per row.
        neg_emb: Array of negative embeddings, one per row (may be empty).

    Returns:
        Weighted positive centroid minus weighted negative centroid.
    """
    def _weighted_centroid(bank):
        weights = calculate_attention_weights_softmax(query, bank)
        if len(bank) > 0:
            return np.sum(weights[:, None] * bank, axis=0)
        return np.zeros_like(query)

    positive_pull = _weighted_centroid(pos_emb)
    negative_pull = _weighted_centroid(neg_emb)
    return positive_pull - negative_pull

def annotate_image(example_img, query_vectors, model, transform, sam, device,
                   output_path='annotated_output.jpg', thresh=0.485):
    """Box every SAM region of `example_img` that matches the query embeddings.

    The image is segmented with `SamAutomaticMaskGenerator`, every mask is
    embedded via `extract_features_from_masks`, and each mask embedding is
    compared with its nearest query vector by inner product of L2-normalised
    vectors. That similarity is rescaled from [-1, 1] to [0, 1] and masks
    scoring above `thresh` are outlined in green and labelled "match".

    Args:
        example_img: Image to annotate (converted with `np.array`; the caller
            in this notebook passes an RGB PIL image).
        query_vectors: 2-D array of query embeddings, one per row.
        model, transform, device: Forwarded to `extract_features_from_masks`.
        sam: Model handed to `SamAutomaticMaskGenerator`.
        output_path: Where the annotated image is saved.
        thresh: Minimum rescaled similarity for a region to count as a match.

    Returns:
        None. The annotated image is written to `output_path`.
    """
    print("Generating SAM masks...")
    mask_gen = SamAutomaticMaskGenerator(
        model=sam,
        points_per_side=48,
        pred_iou_thresh=0.88,
        stability_score_thresh=0.92,
        min_mask_region_area=150,
    )
    # Convert once and reuse for segmentation, feature extraction and drawing.
    img_np = np.array(example_img)
    masks = mask_gen.generate(img_np)
    print(f"→ Generated {len(masks)} masks")

    if len(masks) > 0:
        mask_vecs = extract_features_from_masks(img_np, masks, model, transform, device)
        # FAISS works on float32; cast both sides explicitly rather than relying
        # on the wrapper to coerce float16/float64 inputs.
        mask_vecs = np.asarray(mask_vecs, dtype=np.float32)
        query_vecs = np.asarray(query_vectors, dtype=np.float32)

        query_norm = query_vecs / np.linalg.norm(query_vecs, axis=1, keepdims=True)
        mask_norm = mask_vecs / np.linalg.norm(mask_vecs, axis=1, keepdims=True)

        # Size the index from the data instead of hard-coding the CLIP width.
        index = faiss.IndexFlatIP(query_norm.shape[1])
        index.add(np.ascontiguousarray(query_norm))

        D, _ = index.search(np.ascontiguousarray(mask_norm), 1)
        sim_norm = (D + 1) / 2  # cosine in [-1, 1] -> [0, 1]
        good_idx = np.where(sim_norm.flatten() > thresh)[0]
    else:
        # No masks: nothing to embed or search, but still save the image below.
        good_idx = np.array([], dtype=int)

    print(f"→ {len(good_idx)} regions above threshold {thresh}")

    img_cv = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
    for idx in good_idx:
        seg = masks[idx]['segmentation']
        ys, xs = np.nonzero(seg)
        if len(xs) == 0:
            continue
        # Plain Python ints: some OpenCV builds reject NumPy scalar coordinates.
        x1, y1, x2, y2 = (int(v) for v in (xs.min(), ys.min(), xs.max(), ys.max()))
        cv2.rectangle(img_cv, (x1, y1), (x2, y2), (0, 255, 0), 3)
        # Put the label above the box, or just inside it when the box touches
        # the top edge and the text would otherwise fall outside the image.
        label_y = y1 - 10 if y1 >= 25 else y1 + 25
        cv2.putText(img_cv, "match", (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

    img_cv = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
    Image.fromarray(img_cv).save(output_path)
    print(f"→ Annotated image saved: {output_path}")

def detect_and_annotate_objects(example_image_path, keyword,
                                pos_folder="query_surfboard",
                                neg_folder="query_background_clutter",
                                neg_prompt="a photo of background clutter"):
    """Annotate regions of an image that resemble the positive examples.

    Builds a bank of positive embeddings (images from `pos_folder` plus the
    text prompt "a clear photo of a <keyword>") and, when negative images are
    available, a bank of negative embeddings (images from `neg_folder` plus
    `neg_prompt`). Every positive embedding is adjusted with
    `adjust_embedding` and the results are passed to `annotate_image`.

    Args:
        example_image_path: Path of the image to annotate.
        keyword: Object name inserted into the positive text prompt.
        pos_folder: Directory holding positive example images.
        neg_folder: Directory holding negative example images.
        neg_prompt: Text prompt added to the negative bank.

    Returns:
        None. Prints an error and returns early when the example image or the
        positive examples are missing.
    """
    if not os.path.exists(example_image_path):
        print(f"Error: Example image not found → {example_image_path}")
        return

    model, transform, sam, device = initialize_models()

    pos_imgs = load_images_from_folder(pos_folder)
    neg_imgs = load_images_from_folder(neg_folder)

    if not pos_imgs:
        print("Error: No positive images found. Upload to folder and retry.")
        return

    def _encode_text(prompt):
        # Encode one prompt into a CLIP text embedding (NumPy vector).
        tokens = clip.tokenize([prompt]).to(device)
        # Without no_grad the output requires grad and .numpy() raises
        # "Can't call numpy() on Tensor that requires grad".
        with torch.no_grad():
            return model.encode_text(tokens).squeeze(0).cpu().numpy()

    print("Extracting CLIP features from positive examples...")
    pos_emb = np.array([get_vector(img, model, transform, device) for img in pos_imgs])

    print("Extracting CLIP features from negative examples...")
    neg_img_emb = [get_vector(img, model, transform, device) for img in neg_imgs]

    # Multimodal text embeddings
    pos_emb = np.vstack([pos_emb, _encode_text(f"a clear photo of a {keyword}")[None]])

    if neg_img_emb:
        neg_emb = np.vstack([np.array(neg_img_emb), _encode_text(neg_prompt)[None]])
    else:
        # Zero rows, not np.empty((1, dim)): a one-row np.empty array holds
        # uninitialised memory that would be subtracted as a bogus negative.
        # adjust_embedding treats an empty bank as a zero contribution.
        neg_emb = np.empty((0, pos_emb.shape[1]), dtype=pos_emb.dtype)

    print("Computing adjusted embeddings...")
    adjusted = np.array([adjust_embedding(e, pos_emb, neg_emb) for e in pos_emb])

    example_img = Image.open(example_image_path).convert("RGB")
    annotate_image(example_img, adjusted, model, transform, sam, device)

# Entry point: annotate the test image using "surfboard" as the text keyword.
detect_and_annotate_objects(
    example_image_path="surfboards-boardshop.jpg",
    keyword="surfboard",
)

  return register_model(fn_wrapper)
  return register_model(fn_wrapper)
  return register_model(fn_wrapper)
  return register_model(fn_wrapper)
  return register_model(fn_wrapper)


Using device: cuda


100%|████████████████████████████████████████| 890M/890M [00:06<00:00, 152MiB/s]


<All keys matched successfully>
Found 8 images in query_surfboard
Found 8 images in query_background_clutter
Extracting CLIP features from positive examples...
Extracting CLIP features from negative examples...


RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

In [None]:
import os
import torch
import numpy as np
import cv2
import faiss
from sklearn.metrics.pairwise import cosine_similarity
from segment_anything_hq import sam_model_registry, SamAutomaticMaskGenerator
from PIL import Image
import clip

def load_images_from_folder(folder, max_images=8):
    """Load up to `max_images` images from `folder` as RGB PIL images.

    Only files whose names end in .jpg, .jpeg or .png (case-insensitive) are
    considered. File names are sorted so the selected subset is the same on
    every run (`os.listdir` returns entries in arbitrary order).

    Args:
        folder: Path of the directory to scan.
        max_images: Maximum number of images to load.

    Returns:
        A list of RGB PIL images. The list is empty (and a message is printed)
        when `folder` is not a directory or contains no matching files.
    """
    # isdir rather than exists: a path pointing at a regular file would pass an
    # exists() check and then crash inside os.listdir.
    if not os.path.isdir(folder):
        print(f"Folder not found: {folder}")
        return []
    files = sorted(f for f in os.listdir(folder) if f.lower().endswith(('.jpg', '.jpeg', '.png')))
    if not files:
        print(f"No images in folder: {folder}")
        return []
    print(f"Found {len(files)} images in {folder}")
    images = []
    for name in files[:max_images]:
        # convert() returns a fully loaded, independent image, so the file
        # handle can be released as soon as the with-block exits.
        with Image.open(os.path.join(folder, name)) as img:
            images.append(img.convert("RGB"))
    return images

def initialize_models():
    """Load the CLIP ViT-L/14 model and the SAM-HQ ViT-L model.

    The device is "cuda" when torch reports CUDA as available, otherwise "cpu".
    SAM-HQ weights are read from sam-hq/pretrained_checkpoint/sam_hq_vit_l.pth.

    Returns:
        A tuple (clip_model, clip_preprocess, sam, device).
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using device: {device}")

    clip_model, preprocess = clip.load("ViT-L/14", device=device)

    checkpoint_path = "sam-hq/pretrained_checkpoint/sam_hq_vit_l.pth"
    build_sam = sam_model_registry["vit_l"]
    sam = build_sam(checkpoint=checkpoint_path)
    sam.to(device=device)

    return clip_model, preprocess, sam, device

def get_vector(image, model, transform, device):
    """Embed a single image with `model.encode_image` and return it as NumPy.

    Args:
        image: Input accepted by `transform` (the callers in this notebook
            pass PIL images).
        model: Object exposing `encode_image(tensor)`.
        transform: Callable turning `image` into a tensor without a batch axis.
        device: Device the batched tensor is moved to before encoding.

    Returns:
        The embedding with all singleton dimensions squeezed out, as a NumPy
        array on the CPU.
    """
    batch = transform(image)
    batch = batch.unsqueeze(0)  # add the batch axis
    batch = batch.to(device)
    # Inference only: no autograd graph, so .numpy() below is allowed.
    with torch.no_grad():
        features = model.encode_image(batch)
    features = features.squeeze()
    return features.cpu().numpy()

def extract_features_from_masks(image_np, masks, model, transform, device):
    """Embed every masked region of an image.

    For each entry in `masks`, the pixels selected by its 'segmentation'
    array are copied onto an all-zero canvas of the same shape as `image_np`,
    and that canvas is embedded with `get_vector`.

    Args:
        image_np: Image as a NumPy array.
        masks: Iterable of dicts with a 'segmentation' entry usable as an
            index into `image_np` (presumably a boolean H x W array; confirm
            against the mask generator).
        model, transform, device: Forwarded to `get_vector`.

    Returns:
        A NumPy array stacking one embedding per mask, in input order.
    """
    def _embed_region(region_mask):
        # Keep only the masked pixels; everything else stays zero.
        isolated = np.zeros_like(image_np)
        isolated[region_mask] = image_np[region_mask]
        return get_vector(Image.fromarray(isolated), model, transform, device)

    return np.array([_embed_region(entry['segmentation']) for entry in masks])

def calculate_attention_weights_softmax(query, examples):
    """Softmax weights over the cosine similarity of `query` to each example.

    Args:
        query: Query vector; it is reshaped to a single row before comparison.
        examples: Sequence/array of example vectors, one per row.

    Returns:
        A 1-D array of weights summing to 1, one per example. When `examples`
        is empty the single-element array [1.0] is returned instead.
    """
    if len(examples) > 0:
        scores = cosine_similarity(query.reshape(1, -1), examples).flatten()
        # Shift by the maximum before exponentiating for numerical stability.
        unnormalised = np.exp(scores - np.max(scores))
        return unnormalised / unnormalised.sum()
    return np.array([1.0])

def adjust_embedding(query, pos_emb, neg_emb):
    """Return the attention-weighted positive mean minus the negative mean.

    Each bank of embeddings is averaged with the softmax weights produced by
    `calculate_attention_weights_softmax(query, bank)`; an empty bank
    contributes a zero vector shaped like `query`.

    Args:
        query: Query embedding.
        pos_emb: Array of positive embeddings, one per row.
        neg_emb: Array of negative embeddings, one per row (may be empty).

    Returns:
        Weighted positive centroid minus weighted negative centroid.
    """
    def _weighted_centroid(bank):
        weights = calculate_attention_weights_softmax(query, bank)
        if len(bank) > 0:
            return np.sum(weights[:, None] * bank, axis=0)
        return np.zeros_like(query)

    positive_pull = _weighted_centroid(pos_emb)
    negative_pull = _weighted_centroid(neg_emb)
    return positive_pull - negative_pull

def annotate_image(example_img, query_vectors, model, transform, sam, device,
                   output_path='annotated_output.jpg', thresh=0.40,
                   min_box_side=50, max_aspect_ratio=5):
    """Box every SAM region of `example_img` that matches the query embeddings.

    The image is segmented with `SamAutomaticMaskGenerator`, every mask is
    embedded via `extract_features_from_masks`, and each mask embedding is
    compared with its nearest query vector by inner product of L2-normalised
    vectors. That similarity is rescaled from [-1, 1] to [0, 1]; masks scoring
    above `thresh` whose bounding box passes the size/shape filter are
    outlined in green and labelled "match".

    Args:
        example_img: Image to annotate (converted with `np.array`; the caller
            in this notebook passes an RGB PIL image).
        query_vectors: 2-D array of query embeddings, one per row.
        model, transform, device: Forwarded to `extract_features_from_masks`.
        sam: Model handed to `SamAutomaticMaskGenerator`.
        output_path: Where the annotated image is saved.
        thresh: Minimum rescaled similarity for a region to count as a match.
        min_box_side: Boxes narrower or shorter than this (pixels) are skipped.
        max_aspect_ratio: Boxes whose long side exceeds this multiple of the
            short side are skipped.

    Returns:
        None. The annotated image is written to `output_path`.
    """
    print("Generating SAM masks...")
    mask_gen = SamAutomaticMaskGenerator(
        model=sam,
        points_per_side=48,
        pred_iou_thresh=0.88,
        stability_score_thresh=0.92,
        min_mask_region_area=150,
    )
    # Convert once and reuse for segmentation, feature extraction and drawing.
    img_np = np.array(example_img)
    masks = mask_gen.generate(img_np)
    print(f"→ Generated {len(masks)} masks")

    if len(masks) > 0:
        mask_vecs = extract_features_from_masks(img_np, masks, model, transform, device)
        # FAISS works on float32; cast both sides explicitly rather than relying
        # on the wrapper to coerce float16/float64 inputs.
        mask_vecs = np.asarray(mask_vecs, dtype=np.float32)
        query_vecs = np.asarray(query_vectors, dtype=np.float32)

        query_norm = query_vecs / np.linalg.norm(query_vecs, axis=1, keepdims=True)
        mask_norm = mask_vecs / np.linalg.norm(mask_vecs, axis=1, keepdims=True)

        # Size the index from the data instead of hard-coding the CLIP width.
        index = faiss.IndexFlatIP(query_norm.shape[1])
        index.add(np.ascontiguousarray(query_norm))

        D, _ = index.search(np.ascontiguousarray(mask_norm), 1)
        sim_norm = (D + 1) / 2  # cosine in [-1, 1] -> [0, 1]
        good_idx = np.where(sim_norm.flatten() > thresh)[0]
    else:
        # No masks: nothing to embed or search, but still save the image below.
        good_idx = np.array([], dtype=int)

    print(f"→ {len(good_idx)} regions above threshold {thresh}")

    img_cv = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
    for idx in good_idx:
        seg = masks[idx]['segmentation']
        ys, xs = np.nonzero(seg)
        if len(xs) == 0:
            continue

        # Plain Python ints: some OpenCV builds reject NumPy scalar coordinates.
        x1, y1, x2, y2 = (int(v) for v in (xs.min(), ys.min(), xs.max(), ys.max()))

        width = x2 - x1
        height = y2 - y1

        # Skip very small boxes (often people/parts).
        if width < min_box_side or height < min_box_side:
            continue
        # Skip very elongated boxes. Written as a product so a zero-length
        # side cannot cause a division by zero when min_box_side is 0.
        if max(width, height) > max_aspect_ratio * min(width, height):
            continue

        cv2.rectangle(img_cv, (x1, y1), (x2, y2), (0, 255, 0), 3)
        # Put the label above the box, or just inside it when the box touches
        # the top edge and the text would otherwise fall outside the image.
        label_y = y1 - 10 if y1 >= 25 else y1 + 25
        cv2.putText(img_cv, "match", (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

    img_cv = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
    Image.fromarray(img_cv).save(output_path)
    print(f"→ Annotated image saved: {output_path}")

def detect_and_annotate_objects(example_image_path, keyword,
                                pos_folder="query_surfboard",
                                neg_folder="query_background_clutter",
                                neg_prompt="a person riding a surfboard"):
    """Annotate regions of an image that resemble the positive examples.

    Builds a bank of positive embeddings (images from `pos_folder` plus the
    text prompt "a clear photo of a <keyword>") and, when negative images are
    available, a bank of negative embeddings (images from `neg_folder` plus
    `neg_prompt`). Every positive embedding is adjusted with
    `adjust_embedding` and the results are passed to `annotate_image`.

    Args:
        example_image_path: Path of the image to annotate.
        keyword: Object name inserted into the positive text prompt.
        pos_folder: Directory holding positive example images.
        neg_folder: Directory holding negative example images.
        neg_prompt: Text prompt added to the negative bank.

    Returns:
        None. Prints an error and returns early when the example image or the
        positive examples are missing.
    """
    if not os.path.exists(example_image_path):
        print(f"Error: Example image not found → {example_image_path}")
        return

    model, transform, sam, device = initialize_models()

    pos_imgs = load_images_from_folder(pos_folder)
    neg_imgs = load_images_from_folder(neg_folder)

    if not pos_imgs:
        print("Error: No positive images found. Upload to folder and retry.")
        return

    def _encode_text(prompt):
        # Encode one prompt into a CLIP text embedding (NumPy vector).
        tokens = clip.tokenize([prompt]).to(device)
        # no_grad (as in get_vector) avoids building an autograd graph at all,
        # so no .detach() is needed before .numpy().
        with torch.no_grad():
            return model.encode_text(tokens).squeeze(0).cpu().numpy()

    print("Extracting CLIP features from positive examples...")
    pos_emb = np.array([get_vector(img, model, transform, device) for img in pos_imgs])

    print("Extracting CLIP features from negative examples...")
    neg_img_emb = [get_vector(img, model, transform, device) for img in neg_imgs]

    # Multimodal text embeddings
    pos_emb = np.vstack([pos_emb, _encode_text(f"a clear photo of a {keyword}")[None]])

    if neg_img_emb:
        neg_emb = np.vstack([np.array(neg_img_emb), _encode_text(neg_prompt)[None]])
    else:
        # Zero rows, not np.empty((1, dim)): a one-row np.empty array holds
        # uninitialised memory that would be subtracted as a bogus negative.
        # adjust_embedding treats an empty bank as a zero contribution.
        neg_emb = np.empty((0, pos_emb.shape[1]), dtype=pos_emb.dtype)

    print("Computing adjusted embeddings...")
    adjusted = np.array([adjust_embedding(e, pos_emb, neg_emb) for e in pos_emb])

    example_img = Image.open(example_image_path).convert("RGB")
    annotate_image(example_img, adjusted, model, transform, sam, device)

# Entry point: replace the file name with your own test image; "surfboard" is
# the keyword used for the positive text prompt.
detect_and_annotate_objects(
    example_image_path="SIC_Maui_surfboard_by_model_2021_640_800_mobile.jpg",
    keyword="surfboard",
)

Using device: cuda
<All keys matched successfully>
Found 10 images in query_surfboard
Found 8 images in query_background_clutter
Extracting CLIP features from positive examples...
Extracting CLIP features from negative examples...
Computing adjusted embeddings...
Generating SAM masks...
→ Generated 11 masks
→ 2 regions above threshold 0.4
→ Annotated image saved: annotated_output.jpg
