# Annotator

### important:
Requires a Python environment with Python >= 3.10!

The purpose of this notebook is to annotate images of drones for fine-tuning YOLO.

This involves the following steps:
- read raw images
- detect drones
- extract the most probable bounding box (if any)
- write the input image to `dataset/output/images` and the label in YOLO format to `dataset/output/labels`

In [1]:
# dependencies

!pip install autodistill autodistill-yolov5 autodistill-grounding-dino supervision opencv-python



In [2]:
# imports

import os
import numpy as np
import cv2
import supervision as sv

from pathlib import Path
from autodistill_grounding_dino import GroundingDINO
from autodistill.detection import CaptionOntology

In [3]:
# blur corners to improve annotation 
# takes 1 minute approx

def blur_corners(image_path, output_dir):
    """Blur the lower-left and lower-right corners of one image and save it.

    A triangular region in each bottom corner (20% of the width along the
    bottom edge, 40% of the height up the side) is replaced by a
    Gaussian-blurred copy of the image; every other pixel is left untouched.
    The result is written to ``output_dir`` under the input's file name.

    Args:
        image_path: Path of the image to read.
        output_dir: Directory to write into; created if it does not exist.

    Raises:
        ValueError: If the image cannot be read or decoded.
        OSError: If the blurred image cannot be written.
    """
    image_path = os.fspath(image_path)

    # cv2.imread signals failure by returning None instead of raising, so
    # check explicitly to get a clear error rather than a failure on .shape.
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")

    height, width = image.shape[:2]

    # Triangles anchored in the two bottom corners. The dtype is explicit
    # because cv2.fillPoly works on 32-bit integer points.
    lower_left = np.array(
        [[0, height], [0, int(height * 0.6)], [int(width * 0.2), height]],
        dtype=np.int32,
    )
    lower_right = np.array(
        [[width, height], [width, int(height * 0.6)], [int(width * 0.8), height]],
        dtype=np.int32,
    )

    # Single-channel mask: 255 inside the corner triangles, 0 elsewhere.
    mask = np.zeros((height, width), dtype=np.uint8)
    cv2.fillPoly(mask, [lower_left], 255)
    cv2.fillPoly(mask, [lower_right], 255)

    # Blur the whole frame once, then take blurred pixels only under the mask.
    blurred_image = cv2.GaussianBlur(image, (51, 51), 0)
    blurred = np.where(mask[..., None] == 255, blurred_image, image)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, os.path.basename(image_path))
    if not cv2.imwrite(output_path, blurred):
        raise OSError(f"Could not write image: {output_path}")

def blur_corners_in_directory(input_dir, output_dir):
    """Run ``blur_corners`` on every image file found in ``input_dir``.

    Only regular files whose name ends in ``.png``, ``.jpg`` or ``.jpeg``
    (compared case-insensitively) are processed. Sub-directories are not
    searched. Files are handled in sorted name order so that repeated runs
    behave the same way.

    Args:
        input_dir: Directory containing the images to blur.
        output_dir: Directory the blurred copies are written to.
    """
    image_suffixes = (".png", ".jpg", ".jpeg")

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(input_dir)):
        # Lower-case the name so upper-case extensions such as ".JPG" match.
        if not filename.lower().endswith(image_suffixes):
            continue

        image_path = os.path.join(input_dir, filename)
        # A directory can carry an image-like name; only real files are images.
        if not os.path.isfile(image_path):
            continue

        blur_corners(image_path, output_dir)

# Source folder with the raw frames and target folder for the blurred copies
input_directory = "unlabeled_images"
output_directory = "blurred_images"

# Blur the bottom corners of every image in the source folder
blur_corners_in_directory(
    input_dir=input_directory,
    output_dir=output_directory,
)

In [14]:
# blurred images paths

IMG_DIR = "blurred_images"

# os.listdir returns entries in arbitrary order and includes every entry in
# the folder. Sort for a reproducible processing order and keep only image
# files so stray entries (hidden files, checkpoints) never reach the detector.
imgs_paths = sorted(
    os.path.join(IMG_DIR, img)
    for img in os.listdir(IMG_DIR)
    if img.lower().endswith((".png", ".jpg", ".jpeg"))
)
imgs_paths[:3]

['blurred_images/frame000000.jpg',
 'blurred_images/frame000001.jpg',
 'blurred_images/frame000002.jpg']

In [15]:
# directories for saving images

import shutil

# Destination folders: images as saved, images with the drawn box, YOLO labels
output_orig_image_dir = Path('dataset/output/images')
output_annotated_image_dir = Path('dataset/output/annotated_images')
output_label_dir = Path('dataset/output/labels')
output_dir = Path('dataset/raw')

# Start from an empty output_dir; a missing folder is not an error.
# NOTE(review): only 'dataset/raw' is wiped here. The three 'dataset/output'
# folders keep files from earlier runs - confirm that this is intended.
shutil.rmtree(output_dir, ignore_errors=True)

# (Re)create every folder, including missing parents
for _directory in (
    output_dir,
    output_orig_image_dir,
    output_annotated_image_dir,
    output_label_dir,
):
    _directory.mkdir(parents=True, exist_ok=True)

In [16]:
# model hyperparameters

# Thresholds handed to the GroundingDINO constructor as box_threshold and
# text_threshold. Presumably minimum scores for keeping a box and for matching
# it to the text prompt - confirm against the autodistill-grounding-dino docs.
BOX_THRESHOLD: float = 0.25
TEXT_THRESHOLD: float = 0.25

# the training images contain some bs, but it's not much

In [17]:
# init model

# Single-entry ontology for the detector.
# Presumably maps the text prompt "drone" to the class name "drone" -
# confirm against the autodistill CaptionOntology docs.
_drone_ontology = CaptionOntology({"drone": "drone"})

# Detector used to pre-label the images, configured with the thresholds above
base_model = GroundingDINO(
    ontology=_drone_ontology,
    box_threshold=BOX_THRESHOLD,
    text_threshold=TEXT_THRESHOLD,
)

trying to load grounding dino directly
final text_encoder_type: bert-base-uncased


In [18]:
# Process each image

def _yolo_label_line(x1, y1, x2, y2, img_width, img_height, class_id=0):
    """Return one YOLO label line for a box given by pixel-space corners.

    The corners are clipped to the image first so that the normalised
    centre, width and height always lie within [0, 1].

    Args:
        x1, y1, x2, y2: Box corners in pixels.
        img_width, img_height: Image size in pixels.
        class_id: Class index written as the first field.

    Returns:
        ``"<class_id> <x_center> <y_center> <width> <height>\\n"`` with the
        four box values normalised by the image size.
    """
    x1, x2 = (min(max(float(v), 0.0), float(img_width)) for v in (x1, x2))
    y1, y2 = (min(max(float(v), 0.0), float(img_height)) for v in (y1, y2))

    x_center = (x1 + x2) / 2 / img_width
    y_center = (y1 + y2) / 2 / img_height
    width = (x2 - x1) / img_width
    height = (y2 - y1) / img_height

    return f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n"


# For every image: detect drones, keep the most confident box, then save the
# image, an annotated preview and a YOLO label file. Images without any
# detection produce no output files.

process_counter = 0

# Built once instead of per image; it is created with the same (default)
# settings on every iteration.
box_annotator = sv.BoxAnnotator()

for process_counter, img in enumerate(imgs_paths, start=1):
    img_path = Path(img)
    predictions = base_model.predict(str(img_path))

    # Proceed only if there are predictions (drones detected)
    if len(predictions.xyxy) == 0:
        continue

    print(f"Processed: {process_counter}/{len(imgs_paths)}")

    # cv2.imread returns None for unreadable files; skip instead of crashing
    # later on image.shape.
    image = cv2.imread(str(img_path))
    if image is None:
        print(f"Skipping unreadable image: {img_path}")
        continue

    # Read the size up front; it is needed to normalise the label.
    img_height, img_width = image.shape[:2]

    # Save the image before any box is drawn on it
    output_orig_image_path = output_orig_image_dir / f"{img_path.stem}.png"
    cv2.imwrite(str(output_orig_image_path), image)

    # Keep only the prediction with the highest confidence
    highest_confidence_index = np.argmax(predictions.confidence)
    x1, y1, x2, y2 = predictions.xyxy[highest_confidence_index]
    confidence = predictions.confidence[highest_confidence_index]

    # Detections object holding just that one box; class 0 is the drone class
    detections = sv.Detections(
        xyxy=np.array([[x1, y1, x2, y2]], dtype=float),
        confidence=np.array([confidence], dtype=float),
        class_id=np.array([0], dtype=int),
    )

    # Draw the box and its confidence label, then save the preview
    labels = [f"drone {confidence:0.2f}"]
    annotated_image = box_annotator.annotate(scene=image, detections=detections, labels=labels)
    output_annotated_image_path = output_annotated_image_dir / f"{img_path.stem}_annotated.png"
    cv2.imwrite(str(output_annotated_image_path), annotated_image)

    # YOLO format: class_id x_center y_center width height (all normalised)
    output_label_path = output_label_dir / f"{img_path.stem}.txt"
    with open(output_label_path, 'w') as f:
        f.write(_yolo_label_line(x1, y1, x2, y2, img_width, img_height, class_id=0))

Processed: 1/1472
Processed: 2/1472
Processed: 3/1472
Processed: 4/1472
Processed: 5/1472
Processed: 6/1472
Processed: 7/1472
Processed: 8/1472
Processed: 9/1472
Processed: 10/1472
Processed: 11/1472
Processed: 12/1472
Processed: 13/1472
Processed: 14/1472
Processed: 15/1472
Processed: 16/1472
Processed: 17/1472
Processed: 18/1472
Processed: 19/1472
Processed: 20/1472
Processed: 21/1472
Processed: 22/1472
Processed: 23/1472
Processed: 24/1472
Processed: 25/1472
Processed: 26/1472
Processed: 27/1472
Processed: 28/1472
Processed: 29/1472
Processed: 30/1472
Processed: 31/1472
Processed: 32/1472
Processed: 33/1472
Processed: 34/1472
Processed: 35/1472
Processed: 36/1472
Processed: 37/1472
Processed: 38/1472
Processed: 39/1472
Processed: 40/1472
Processed: 41/1472
Processed: 42/1472
Processed: 43/1472
Processed: 44/1472
Processed: 45/1472
Processed: 46/1472
Processed: 47/1472
Processed: 48/1472
Processed: 49/1472
Processed: 50/1472
Processed: 51/1472
Processed: 52/1472
Processed: 53/1472
Pr