## Cell 1: Install Dependencies
This cell installs all the Python packages the script needs to run.

In [None]:

# Install the third-party packages used by the later cells.
# NOTE: the leading "!" is notebook shell-escape syntax, not plain Python,
# so this cell cannot be run as a regular .py script.
print("[1/6] Installing required libraries...")
!pip install -q torch torchvision transformers Pillow gdown
# Printed unconditionally: pip's exit status is not checked here.
print(" Libraries installed successfully.")

## Cell 2: User Configuration
Set the path of the image you want to test and the text query you want to run in this cell.

In [None]:

# Path of the image to test; replace the placeholder with a real file path.
IMAGE_PATH = "Update your Image Path here "

# Text queries to look for in the image; replace the placeholder entry.
# The pipeline cell iterates over every entry in this list.
SAMPLE_QUERIES = [
    "Enter your Query here "  # replace with your query
]

# Google Drive file IDs of the zipped model folders (Grounding DINO and CLIP)
# that the download cell fetches.
GDRIVE_DINO_ZIP_ID = "1GiwAIi0g--4fkXP-kP7h6umnfpvy5YHZ"
GDRIVE_CLIP_ZIP_ID = "1obrnuPoL3IkXD9tkjW6mDBwYWZTsrjnl"

## Cell 3: Download and Extract Custom Models
This cell downloads the .zip files from the Google Drive links provided in Cell 2. It then extracts them into the Colab environment so they can be used by the script.

In [None]:

# Download the two zipped model folders from Google Drive and extract them
# into the current working directory. Any failure aborts the cell.
# The banners use a real newline escape ("\n"); the doubled backslash used
# previously printed the two characters "\" and "n" literally.
print("\n [2/6] Downloading and extracting custom models from Google Drive...")
import os
import gdown
import zipfile
import sys

try:
    dino_zip_path = "grounding-dino-base.zip"
    clip_zip_path = "clip-vit-base-patch32.zip"

    print("   - Attempting to download Grounding DINO folder...")
    gdown.download(f"https://drive.google.com/uc?id={GDRIVE_DINO_ZIP_ID}", dino_zip_path, quiet=False)

    print("   - Attempting to download CLIP folder...")
    gdown.download(f"https://drive.google.com/uc?id={GDRIVE_CLIP_ZIP_ID}", clip_zip_path, quiet=False)

    # Guard against a download call that returned without producing a file.
    if not os.path.exists(dino_zip_path) or not os.path.exists(clip_zip_path):
        raise FileNotFoundError("A required model zip file failed to download.")

except Exception as e:
    print(f"   -  An error occurred during download: {e}")
    print("   - Please check your File IDs and sharing permissions in Cell 2.")
    # Exit with a non-zero status so the failure is not reported as success.
    sys.exit(1)


try:
    print(f"\n   - Unzipping {dino_zip_path}...")
    with zipfile.ZipFile(dino_zip_path, 'r') as zip_ref:
        zip_ref.extractall("./")
    print("   -  Grounding DINO extracted successfully.")

    print(f"   - Unzipping {clip_zip_path}...")
    with zipfile.ZipFile(clip_zip_path, 'r') as zip_ref:
        zip_ref.extractall("./")
    print("   - CLIP extracted successfully.")
except Exception as e:
    print(f"   -  An error occurred during extraction: {e}")
    # Exit with a non-zero status so the failure is not reported as success.
    sys.exit(1)

print("\n Custom models are ready.")

## Cell 4: Imports and Environment Setup
This cell imports the required libraries, suppresses unnecessary warnings, sets the computation device (GPU if available), and creates the output directory.

In [None]:

# Import the libraries used by the processing cells, silence warnings, pick
# the compute device and create the output directory.
# The banner uses a real newline escape ("\n"); the doubled backslash used
# previously printed the two characters "\" and "n" literally.
print("\n [3/6] Importing libraries and setting up the environment...")
import torch
import os
from PIL import Image, ImageDraw, ImageFont
from torchvision.ops import nms
import warnings
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection, CLIPProcessor, CLIPModel


# Suppresses every warning category for the rest of the session.
warnings.filterwarnings('ignore')


# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"   - Using device: {DEVICE}")

# exist_ok=True makes re-running this cell safe when the directory exists.
os.makedirs("final_output", exist_ok=True)
print("   - Created output directory: 'final_output/'")
print(" Setup complete.")

## Cell 5: Core Processing Functions
This cell defines all the core logic: loading the models from the extracted folders, getting candidate boxes with Grounding DINO, refining them with CLIP, and drawing the final output.

In [None]:

# Progress banner; "\n" is a real newline (the doubled backslash printed it literally).
print("\n [4/6] Defining processing functions...")

def load_models():
    """Load Grounding DINO and CLIP from the locally extracted model folders.

    Each model is moved to ``DEVICE`` and switched to eval mode.

    Returns:
        A dict with keys ``"g_dino"`` and ``"clip"``, each holding a
        ``(model, processor)`` tuple, or ``None`` when either model fails to
        load. Load errors are printed, not raised.
    """
    # "\n" is a real newline here; the doubled backslash used previously
    # printed the two characters "\" and "n" literally.
    print("\n   - Loading all required models from local folders...")

    local_dino_path = "./grounding-dino-base"
    local_clip_path = "./clip-vit-base-patch32"

    try:
        print(f"   - Loading Grounding DINO model from: {local_dino_path}")
        g_dino_processor = AutoProcessor.from_pretrained(local_dino_path)
        g_dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(local_dino_path)
        g_dino_model = g_dino_model.to(DEVICE).eval()
        print("   -  Custom Grounding DINO model loaded successfully.")
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with None.
        print(f"   - Error loading custom Grounding DINO model: {e}")
        print("   - Please ensure the zip file contained the correct model folder.")
        return None

    try:
        print(f"   - Loading CLIP model from: {local_clip_path}")
        clip_processor = CLIPProcessor.from_pretrained(local_clip_path)
        clip_model = CLIPModel.from_pretrained(local_clip_path)
        clip_model = clip_model.to(DEVICE).eval()
        print("   - Custom CLIP model loaded successfully.")
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with None.
        print(f"   -  Error loading custom CLIP model: {e}")
        print("   - Please ensure the zip file contained the correct model folder.")
        return None

    return {
        "g_dino": (g_dino_model, g_dino_processor),
        "clip": (clip_model, clip_processor),
    }

def get_candidate_boxes(g_dino_model, g_dino_processor, image, text_prompt, box_threshold=0.25):
    """Run Grounding DINO on ``image`` for ``text_prompt`` and return candidates.

    Args:
        g_dino_model: Detection model, called with the processor's encoded inputs.
        g_dino_processor: Processor used for both encoding and post-processing.
        image: Input image; its ``size`` attribute is reversed to build the
            target size passed to post-processing.
        text_prompt: Text describing what to detect.
        box_threshold: Forwarded to post-processing as ``threshold``.

    Returns:
        A ``(boxes, scores)`` pair taken from the first (only) post-processed result.
    """
    encoded = g_dino_processor(images=image, text=text_prompt, return_tensors="pt")
    encoded = encoded.to(DEVICE)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        raw_outputs = g_dino_model(**encoded)

    # One target size per image in the batch; the batch here has one image.
    reversed_size = image.size[::-1]
    size_tensor = torch.tensor([reversed_size]).to(DEVICE)

    detections = g_dino_processor.post_process_grounded_object_detection(
        raw_outputs,
        target_sizes=size_tensor,
        threshold=box_threshold,
    )
    first_result = detections[0]
    return first_result["boxes"], first_result["scores"]

def apply_nms(all_boxes, all_scores, iou_threshold=0.5):
    """Apply non-maximum suppression and return the surviving boxes and scores.

    Args:
        all_boxes: Tensor of candidate boxes.
        all_scores: Tensor of scores, one per box.
        iou_threshold: IoU threshold forwarded to ``nms``.

    Returns:
        A ``(boxes, scores)`` pair on ``DEVICE``. Empty input yields empty
        tensors of shape ``(0, 4)`` and ``(0,)``.
    """
    if all_boxes.numel() == 0:
        return torch.empty((0, 4), device=DEVICE), torch.empty((0,), device=DEVICE)

    # Move once and index the moved tensors. The kept indices live on DEVICE,
    # so indexing the original inputs would mix devices whenever the inputs
    # are not already on DEVICE.
    boxes = all_boxes.to(DEVICE)
    scores = all_scores.to(DEVICE)

    keep = nms(boxes, scores, iou_threshold)
    print(f"   - NMS: Reduced {len(boxes)} boxes to {len(keep)}.")
    return boxes[keep], scores[keep]

def rerank_with_clip(clip_model, clip_processor, image, boxes, dino_scores, text_prompt, clip_threshold=0.20):
    """Pick the single best candidate box by combining detector and CLIP scores.

    Each box is cropped from ``image`` and embedded with CLIP. The crop's
    similarity to ``text_prompt`` is averaged with its detector score, and
    the box with the highest combined score is selected.

    Args:
        clip_model: CLIP model providing text and image features.
        clip_processor: Processor used to encode the prompt and the crops.
        image: Source image the boxes refer to.
        boxes: Tensor of candidate boxes, one row per box.
        dino_scores: Detector scores, one per box.
        text_prompt: Query text to compare the crops against.
        clip_threshold: Minimum CLIP similarity required for the selected crop.

    Returns:
        The selected box (one row of ``boxes``), or ``None`` when there are
        no candidates or the selected crop falls below ``clip_threshold``.
    """
    if boxes.numel() == 0:
        print("   - No candidate boxes to re-rank.")
        # Return a single None like every other "no result" path. A
        # (None, None) tuple would pass the caller's `is not None` check.
        return None

    crops = [image.crop(box.tolist()) for box in boxes]
    text_inputs = clip_processor(text=[text_prompt], return_tensors="pt", padding=True).to(DEVICE)
    image_inputs = clip_processor(images=crops, return_tensors="pt", padding=True).to(DEVICE)

    with torch.no_grad():
        text_features = clip_model.get_text_features(**text_inputs).float()
        image_features = clip_model.get_image_features(**image_inputs).float()
        # Normalise to unit length so the dot product is a cosine similarity.
        text_features /= text_features.norm(dim=-1, keepdim=True)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        clip_scores = (text_features @ image_features.T).squeeze(0)

    # Equal-weight blend of detector confidence and CLIP similarity.
    combined_scores = (0.5 * dino_scores.to(DEVICE)) + (0.5 * clip_scores)
    best_idx = combined_scores.argmax().item()
    best_clip_score = clip_scores[best_idx].item()

    # The threshold is checked only for the crop with the best combined score.
    if best_clip_score < clip_threshold:
        print(f"   - No crop met the CLIP similarity threshold of {clip_threshold}.")
        return None

    print(f"   - Selected best crop with a combined score of: {combined_scores[best_idx].item():.4f}")
    return boxes[best_idx]

def draw_final_box(image, box, text):
    """Return a copy of ``image`` with ``box`` outlined and ``text`` drawn inside it.

    Args:
        image: Source image; it is copied, not modified.
        box: Four-element box exposing ``tolist()``, used as the rectangle
            coordinates.
        text: Label drawn near the first corner of the box.

    Returns:
        The annotated copy of the image.
    """
    # Convert once to plain Python numbers so the drawing calls never receive
    # tensor elements as coordinates.
    x0, y0, x1, y1 = box.tolist()

    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    draw.rectangle([x0, y0, x1, y1], outline="lime", width=5)

    try:
        font = ImageFont.truetype("arial.ttf", 20)
    except IOError:
        # The font file is not available; fall back to the built-in font.
        font = ImageFont.load_default()
    # Offset the label slightly so it sits inside the outline.
    draw.text((x0 + 5, y0 + 5), text, fill="lime", font=font)
    return img_draw

# Confirmation message printed once this cell has finished executing.
print(" Core functions defined.")

## Cell 6: Main Execution Pipeline
This is the final cell that runs the entire process. It loads the models and then iterates through each of your queries from Cell 2, saving a resulting image for each successful detection.

In [None]:

def run_pipeline():
    """Run detection and re-ranking for every query and save annotated images.

    Loads the models, opens ``IMAGE_PATH``, then for each entry of
    ``SAMPLE_QUERIES`` gets candidate boxes, applies NMS, re-ranks with CLIP
    and, if a box is selected, saves the annotated image under
    ``final_output/``. Errors are printed rather than raised.
    """
    try:
        # "\n" is a real newline in these banners; the doubled backslash used
        # previously printed the two characters "\" and "n" literally.
        print("\n [5/6] Loading models into memory...")
        models = load_models()
        if models is None:
            return

        print(f"\n [6/6] Processing image: '{IMAGE_PATH}'...")
        # Only a failure to open the input image is reported as a missing
        # image. Other missing-path errors (for example while saving) fall
        # through to the generic handler with their real message.
        try:
            input_image = Image.open(IMAGE_PATH).convert("RGB")
        except FileNotFoundError:
            print(f"\n\n❌ ERROR: Cannot find the image at '{IMAGE_PATH}'.")
            print("   - Please check the file name in 'CELL 2: User Configuration'.")
            print("   - Make sure you have uploaded the image to your Colab session.")
            return

        g_dino_model, g_dino_processor = models["g_dino"]
        clip_model, clip_processor = models["clip"]

        for query in SAMPLE_QUERIES:
            print(f"\n{'='*20} PROCESSING QUERY: '{query}' {'='*20}")

            all_boxes, all_scores = get_candidate_boxes(
                g_dino_model, g_dino_processor, input_image, query
            )

            if all_boxes.nelement() == 0:
                print("❌ No initial candidate boxes found by Grounding DINO for this query.")
                continue

            merged_boxes, merged_scores = apply_nms(all_boxes, all_scores)

            final_box = rerank_with_clip(
                clip_model, clip_processor, input_image, merged_boxes, merged_scores, query
            )

            if final_box is None:
                print(f"❌ Could not determine a final crop for the query: '{query}'")
                continue

            query_slug = query.replace(" ", "_").replace("'", "")
            # A path separator in the query would point the save into a
            # subdirectory that does not exist.
            query_slug = query_slug.replace("/", "_").replace("\\", "_")
            final_image_with_box = draw_final_box(input_image, final_box, query)
            output_path = f"final_output/{query_slug}_result.jpg"
            final_image_with_box.save(output_path)
            print(f"✅ Success! Saved result to '{output_path}'")

        print(f"\n{'='*20} PIPELINE FINISHED {'='*20}")

    except Exception as e:
        # Top-level boundary: report the error instead of raising it.
        print(f"\n\n❌ An unexpected error occurred: {e}")

# Run the pipeline only when this code executes as the main program,
# not when it is imported as a module.
if __name__ == '__main__':
    run_pipeline()