In [1]:
from groundingdino.util.inference import load_model, load_image, predict, annotate
import cv2




In [2]:
import os 

In [3]:
# List the current working directory to check which files and folders
# (e.g. the weights folder and the test image) are available to later cells.
os.listdir()

['.asset',
 '.git',
 '.gitignore',
 'demo',
 'Dockerfile',
 'docker_test.py',
 'environment.yaml',
 'groundingdino',
 'groundingdino.egg-info',
 'LICENSE',
 'README.md',
 'requirements.txt',
 'Screenshot 2025-11-15 212217.png',
 'setup.py',
 'test.ipynb',
 'weights']

In [4]:

# Build the checkpoint path with os.path.join instead of a literal backslash:
# "\\" is only a path separator on Windows, so the original string would point
# at a non-existent file name on Linux/macOS.
CONFIG_PATH = "groundingdino/config/GroundingDINO_SwinT_OGC.py"
WEIGHTS_PATH = os.path.join("weights", "groundingdino_swint_ogc.pth")

# `model` is read as a notebook-level global by the cells below.
model = load_model(CONFIG_PATH, WEIGHTS_PATH)




final text_encoder_type: bert-base-uncased


In [6]:
# Detection inputs: the image to analyse, the phrase to look for, and the two
# score cut-offs that are forwarded to predict().
IMAGE_PATH, TEXT_PROMPT = "Screenshot 2025-11-15 212217.png", "eyes"
BOX_TRESHOLD, TEXT_TRESHOLD = 0.35, 0.25

# load_image() hands back two objects: the first is passed to annotate(),
# the second to predict().
image_source, image = load_image(IMAGE_PATH)

boxes, logits, phrases = predict(model=model, image=image, caption=TEXT_PROMPT, box_threshold=BOX_TRESHOLD, text_threshold=TEXT_TRESHOLD)

# Draw the detections and write the result next to the notebook.
annotated_frame = annotate(
    image_source=image_source,
    boxes=boxes,
    logits=logits,
    phrases=phrases,
)
# Kept as the cell's final expression so that imwrite's return value is shown
# as the cell output.
cv2.imwrite("annotated_image.jpg", annotated_frame)

True

In [9]:
import torch

In [10]:
# Only the first two dimensions (height, width) are needed. Slicing the shape
# avoids binding the throwaway name `_` at notebook level: once user code
# assigns `_`, IPython stops updating it with the previous cell's output.
h, w = image_source.shape[:2]

# Multiply each (cx, cy, w, h) row by (W, H, W, H) to go from the normalized
# values returned by predict() to pixel units. torch.tensor with an explicit
# dtype replaces the legacy torch.Tensor constructor and keeps boxes' dtype.
# NOTE: this rebinds `boxes`, so running this cell twice scales the values
# twice -- re-run the detection cell first if you need to execute it again.
boxes = boxes * torch.tensor([w, h, w, h], dtype=boxes.dtype)

In [11]:
# Display the boxes after the scaling applied in the previous cell
# (one row per detection).
boxes

tensor([[165.5977, 182.4598,  39.6376,  17.1255],
        [244.6708, 184.5461,  43.7627,  17.2682]])

In [8]:
# Display the phrases returned by predict() for the detections.
phrases

['eyes', 'eyes']

In [None]:
import os
import shutil
from tqdm import tqdm

def _match_class_id(phrase, class_labels):
    """Map a detected phrase to the index of the best-matching class label.

    Matching is case-insensitive and ignores surrounding whitespace. An exact
    match wins outright; otherwise the label sharing the longest substring
    overlap with the phrase is chosen, with ties going to the label listed
    first.

    Args:
        phrase (str): Phrase reported for one detection.
        class_labels (list): Class names, in class-id order.

    Returns:
        int: Index into ``class_labels``, or -1 when nothing matches.
    """
    needle = phrase.strip().lower()
    # An empty string is a substring of every label, so it has to be rejected
    # explicitly or it would silently be filed under the first class.
    if not needle:
        return -1

    best_id, best_overlap = -1, 0
    for idx, label in enumerate(class_labels):
        candidate = label.strip().lower()
        if not candidate:
            continue
        if candidate == needle:
            return idx
        if candidate in needle or needle in candidate:
            # Prefer the longest overlap so that e.g. the phrase "eyebrow"
            # is not captured by a shorter label such as "eye".
            overlap = min(len(candidate), len(needle))
            if overlap > best_overlap:
                best_id, best_overlap = idx, overlap
    return best_id


def create_yolo_dataset(image_dir, output_dir, class_labels, box_threshold=0.35, text_threshold=0.25):
    """
    Generates a YOLO format dataset from images using GroundingDINO.

    For every .png/.jpg/.jpeg file in ``image_dir`` the function runs
    ``predict`` with a prompt built from ``class_labels``, writes one label
    line per matched detection to ``<output_dir>/labels/<stem>.txt`` and
    copies the image to ``<output_dir>/images``. Images without any matched
    detection are skipped. A ``data.yaml`` is written to ``output_dir``.

    Relies on the notebook-level names ``model``, ``load_image``, ``predict``
    and ``tqdm`` being defined before the call.

    Args:
        image_dir (str): Directory containing source images.
        output_dir (str): Directory to save the YOLO dataset (images/ and labels/).
        class_labels (list): List of class names to detect. A single string is
            treated as a one-element list; other iterables are converted to a list.
        box_threshold (float): Confidence threshold for bounding boxes.
        text_threshold (float): Confidence threshold for text matching.

    Returns:
        None. Per-image failures are printed and do not stop the run.
    """
    # A bare string would otherwise be iterated character by character, and a
    # tuple would be written to data.yaml with parentheses instead of brackets.
    if isinstance(class_labels, str):
        class_labels = [class_labels]
    class_labels = list(class_labels)

    # Create directories
    images_out = os.path.join(output_dir, 'images')
    labels_out = os.path.join(output_dir, 'labels')
    os.makedirs(images_out, exist_ok=True)
    os.makedirs(labels_out, exist_ok=True)

    # Create data.yaml for YOLO training
    yaml_content = "\n".join([
        "train: ../images",
        "val: ../images",
        f"nc: {len(class_labels)}",
        f"names: {class_labels}",
    ])

    # Explicit UTF-8 so non-ASCII class names do not depend on the platform default.
    with open(os.path.join(output_dir, 'data.yaml'), 'w', encoding='utf-8') as f:
        f.write(yaml_content)

    # Construct prompt for GroundingDINO
    text_prompt = " . ".join(class_labels) + " ."
    print(f"Using prompt: {text_prompt}")

    # Process images in a deterministic order (os.listdir order is arbitrary).
    image_files = sorted(
        f for f in os.listdir(image_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    for img_file in tqdm(image_files, desc="Processing Images"):
        img_path = os.path.join(image_dir, img_file)

        try:
            # Load and predict; the first value from load_image and the
            # logits are not needed for writing labels.
            _source, image = load_image(img_path)
            boxes, _logits, phrases = predict(
                model=model,
                image=image,
                caption=text_prompt,
                box_threshold=box_threshold,
                text_threshold=text_threshold
            )

            # Prepare label content
            label_lines = []
            for box, phrase in zip(boxes, phrases):
                # GroundingDINO returns phrases that might be slightly
                # different from the prompt, so match them loosely.
                class_id = _match_class_id(phrase, class_labels)
                if class_id == -1:
                    continue

                # box is already in normalized (cx, cy, w, h) format which YOLO uses
                cx, cy, bw, bh = (float(v) for v in box)
                label_lines.append(f"{class_id} {cx:.6f} {cy:.6f} {bw:.6f} {bh:.6f}")

            # Save label file and copy image only if detections are found
            # (optional: remove 'if label_lines' to keep all)
            if label_lines:
                label_file = os.path.splitext(img_file)[0] + ".txt"
                with open(os.path.join(labels_out, label_file), 'w', encoding='utf-8') as f:
                    f.write("\n".join(label_lines))

                # Copy image to output directory
                shutil.copy(img_path, os.path.join(images_out, img_file))

        except Exception as e:
            # Deliberate best-effort: report the failing image and keep going.
            print(f"Error processing {img_file}: {e}")

    print(f"Dataset created at {output_dir}")

# Example Usage:
# Define your input directory and classes
# input_images_dir = "path/to/your/images"
# output_dataset_dir = "yolo_dataset"
# classes = ["eyes", "nose", "mouth"]

# create_yolo_dataset(input_images_dir, output_dataset_dir, classes)