In [1]:
# Install required packages
!pip install -q supervision
!pip install -q git+https://github.com/facebookresearch/sam2.git
!pip install -q git+https://github.com/IDEA-Research/GroundingDINO.git

# Download model weights
!mkdir -p checkpoints
!wget -q -P checkpoints https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt
!wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [2]:
import torch
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
from groundingdino.util.inference import load_model, predict
import supervision as sv



In [3]:
!pip install git+https://github.com/IDEA-Research/GroundingDINO.git@main


Collecting git+https://github.com/IDEA-Research/GroundingDINO.git@main
  Cloning https://github.com/IDEA-Research/GroundingDINO.git (to revision main) to /tmp/pip-req-build-ml4p21en
  Running command git clone --filter=blob:none --quiet https://github.com/IDEA-Research/GroundingDINO.git /tmp/pip-req-build-ml4p21en
  Resolved https://github.com/IDEA-Research/GroundingDINO.git to commit 856dde20aee659246248e20734ef9ba5214f5e44
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [7]:
import torch

# Load SAM 2
sam2_checkpoint = "checkpoints/sam2_hiera_large.pt"
model_cfg = "sam2_hiera_l.yaml"
# Explicitly set device to 'cpu'
sam2_model = build_sam2(model_cfg, sam2_checkpoint, device='cpu')
sam2_predictor = SAM2ImagePredictor(sam2_model)

# Load GroundingDINO
grounding_model = load_model(
    "groundingdino_swint_ogc.pth",
    "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
)
# Explicitly set device to 'cpu' for GroundingDINO model as well
grounding_model.to('cpu')

OSError: Only py/yml/yaml/json type are supported now!

In [8]:
def segment_image_with_text(image_path, text_prompt, box_threshold=0.35, text_threshold=0.25):
    """
    Segments an object in an image based on text prompt

    Args:
        image_path: Path to image file
        text_prompt: Text description of object (e.g., "cat", "person")
        box_threshold: Confidence threshold for detection
        text_threshold: Text matching threshold
    """

    # 1. Load image
    image = Image.open(image_path).convert("RGB")
    image_np = np.array(image)

    # 2. Use GroundingDINO to detect object from text
    boxes, logits, phrases = predict(
        model=grounding_model,
        image=image,
        caption=text_prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold
    )

    # 3. Convert boxes to SAM 2 format
    h, w = image_np.shape[:2]
    boxes_xyxy = boxes * torch.Tensor([w, h, w, h])
    input_boxes = boxes_xyxy.cpu().numpy()

    # 4. Use SAM 2 to generate precise masks
    sam2_predictor.set_image(image_np)
    masks, scores, _ = sam2_predictor.predict(
        point_coords=None,
        point_labels=None,
        box=input_boxes,
        multimask_output=False
    )

    # 5. Visualize results
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # Original image
    axes[0].imshow(image_np)
    axes[0].set_title("Original Image")
    axes[0].axis('off')

    # Detection boxes
    axes[1].imshow(image_np)
    for box in input_boxes:
        x1, y1, x2, y2 = box
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                            fill=False, color='red', linewidth=2)
        axes[1].add_patch(rect)
    axes[1].set_title(f"Detected: '{text_prompt}'")
    axes[1].axis('off')

    # Segmentation mask overlay
    axes[2].imshow(image_np)
    if len(masks) > 0:
        mask = masks[0]
        colored_mask = np.zeros_like(image_np)
        colored_mask[mask] = [0, 255, 0]  # Green mask
        axes[2].imshow(colored_mask, alpha=0.5)
    axes[2].set_title("Segmentation Mask")
    axes[2].axis('off')

    plt.tight_layout()
    plt.show()

    return masks, boxes, phrases

In [10]:
# Upload or use a sample image
from google.colab import files

# # Option 1: Upload your own image
# uploaded = files.upload()
# image_path = list(uploaded.keys())[0]

# Option 2: Or download a sample
!wget -O sample.jpg "https://images.unsplash.com/photo-1514888286974-6c03e2ca1dba"
image_path = "sample.jpg"

# Run segmentation
text_prompt = "cat"  # Change this to segment different objects
masks, boxes, phrases = segment_image_with_text(image_path, text_prompt)

print(f"Found {len(masks)} object(s) matching '{text_prompt}'")

--2025-10-04 09:07:36--  https://images.unsplash.com/photo-1514888286974-6c03e2ca1dba
Resolving images.unsplash.com (images.unsplash.com)... 151.101.2.208, 151.101.66.208, 151.101.130.208, ...
Connecting to images.unsplash.com (images.unsplash.com)|151.101.2.208|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2017802 (1.9M) [image/jpeg]
Saving to: ‘sample.jpg’


2025-10-04 09:07:36 (32.5 MB/s) - ‘sample.jpg’ saved [2017802/2017802]



NameError: name 'grounding_model' is not defined

In [None]:
# Test different objects
test_prompts = ["person", "car", "dog", "tree", "building"]

for prompt in test_prompts:
    print(f"\nSearching for: {prompt}")
    try:
        masks, _, _ = segment_image_with_text(image_path, prompt)
        print(f"Success! Found {len(masks)} instance(s)")
    except:
        print(f"No {prompt} found in image")

In [11]:
# Install video processing
!pip install -q opencv-python-headless

def segment_video_with_text(video_path, text_prompt, output_path="output.mp4"):
    """
    Segments object across video frames using SAM 2 tracking
    """
    import cv2
    from sam2.build_sam import build_sam2_video_predictor

    # Initialize video predictor
    video_predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint)

    # Process video frame by frame
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # Get first frame detection
    ret, first_frame = cap.read()
    # ... (detect object with GroundingDINO in first frame)
    # ... (propagate mask through video with SAM 2)

    print(f"Video segmentation complete: {output_path}")