In [None]:
import os
import zipfile
import json
import re
import yaml
import requests
import torch
import numpy as np
import supervision as sv
import matplotlib.pyplot as plt
import urllib.request

from PIL import Image
from io import BytesIO
from tqdm import tqdm
from sklearn.metrics import average_precision_score
from collections import defaultdict
from transformers import AutoProcessor, AutoModelForCausalLM 

In [None]:
# Load the LVIS minival annotation file and keep its three top-level sections.

with open("../Dataset/lvis_minival_only.json", "r") as f_json:
    lvis_minival = json.load(f_json)

# Pull out the sections in one pass (categories, then annotations, then images).
categories, annotations, images = (
    lvis_minival[section] for section in ("categories", "annotations", "images")
)

# Rebuild an LVIS-style dict holding only the sections used by the evaluation.
lvis_val = dict(images=images, annotations=annotations, categories=categories)

# Quick sanity check on the size of each section.
print(f"Images : {len(images)}")
print(f"Annotations : {len(annotations)}")
print(f"Categories : {len(categories)}")

In [None]:
# Compute the AP50 score for each category across the LVIS minival dataset

def get_category_mapping(data):
    """
        Builds a lookup table from category ID to category name.

        Args:
            data (dict): Dataset dict with a 'categories' list whose entries
                carry 'id' and 'name' keys.

        Returns:
            dict: Mapping {category id: category name}. When an ID appears
            more than once, the last entry wins.
    """
    id_to_name = {}
    for category in data['categories']:
        id_to_name[category['id']] = category['name']
    return id_to_name

def get_annotations_by_image(data, image_id):
    """
        Collects every annotation that belongs to one image.

        Args:
            data (dict): Dataset dict with an 'annotations' list whose entries
                carry an 'image_id' key.
            image_id (int): ID of the image to look up.

        Returns:
            list: Annotation dicts whose 'image_id' equals image_id, in their
            original order (empty when nothing matches).
    """
    matching = []
    # Linear scan over the full annotation list.
    for annotation in data['annotations']:
        if annotation['image_id'] == image_id:
            matching.append(annotation)
    return matching

def group_boxes_by_category(annotations, id_to_name):
    """
        Buckets the bounding boxes of one image by category name.

        Args:
            annotations (list): Annotation dicts, each with 'category_id' and 'bbox'.
            id_to_name (dict): Mapping from category ID to category name.

        Returns:
            defaultdict: {category name: [bbox, ...]}, with boxes kept in the
            order they were encountered.
    """
    boxes_by_name = defaultdict(list)
    for annotation in annotations:
        # Resolve the numeric category to its readable name before bucketing.
        name = id_to_name[annotation['category_id']]
        bucket = boxes_by_name[name]
        bucket.append(annotation['bbox'])
    return boxes_by_name

def compute_iou(boxA, boxB):
    """
        Intersection over Union (IoU) of two axis-aligned boxes.

        Args:
            boxA (list): First box as [x1, y1, x2, y2].
            boxB (list): Second box as [x1, y1, x2, y2].

        Returns:
            float: Intersection area divided by union area, or 0 when the
            union area is not positive.
    """
    # Corners of the overlap rectangle (may be inverted if the boxes are disjoint).
    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    bottom = min(boxA[3], boxB[3])

    # Clamp each side at zero so disjoint boxes yield an empty intersection.
    overlap_w = max(right - left, 0)
    overlap_h = max(bottom - top, 0)
    inter = overlap_w * overlap_h

    areaA = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    areaB = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    union = areaA + areaB - inter

    if union > 0:
        return inter / union
    return 0

def convert_bbox_xywh_to_xyxy(bbox):
    """
        Turns an [x, y, width, height] box into corner format [x1, y1, x2, y2].

        Args:
            bbox (list): Box as [x, y, width, height].

        Returns:
            list: The same box as [x, y, x + width, y + height].
    """
    left, top, width, height = bbox
    right = left + width
    bottom = top + height
    return [left, top, right, bottom]

def compute_ap(gt_boxes, pred_boxes, pred_scores, iou_threshold=0.5):
    """
        Computes Average Precision (AP) at a given IoU threshold for a single category.

        Predictions are visited from highest to lowest score and each one is
        matched to the still-unmatched ground-truth box it overlaps most. A
        ground-truth box can be claimed only once, so duplicate detections of
        the same object count as false positives. AP is the sum of the
        precision values measured at each true positive, divided by the number
        of ground-truth boxes, so objects that are never detected lower the
        score.

        Args:
            gt_boxes (list): Ground-truth bounding boxes in XYWH format.
            pred_boxes (list): Predicted boxes in XYXY format.
            pred_scores (list): Confidence scores for each predicted box.
            iou_threshold (float): IoU threshold to consider a prediction correct.

        Returns:
            float: AP in [0, 1]; 0.0 when there are no predictions, no
            ground-truth boxes, or no prediction reaches the IoU threshold.
    """
    if len(pred_boxes) == 0 or len(gt_boxes) == 0:
        return 0.0

    # Convert the ground truth once instead of once per prediction.
    gt_xyxy = [convert_bbox_xywh_to_xyxy(gt_box) for gt_box in gt_boxes]

    # Plain floats keep the ranking independent of the score container type.
    scores = [float(score) for score in pred_scores]
    # sorted() is stable, so equally scored predictions keep their input order.
    ranking = sorted(range(len(scores)), key=lambda idx: scores[idx], reverse=True)

    gt_claimed = [False] * len(gt_xyxy)
    true_positives = 0
    precision_sum = 0.0

    for rank, pred_idx in enumerate(ranking, start=1):
        # Find the best-overlapping ground-truth box that is still available.
        best_iou = -1.0
        best_gt = -1
        for gt_idx, gt_box in enumerate(gt_xyxy):
            if gt_claimed[gt_idx]:
                continue
            iou = float(compute_iou(pred_boxes[pred_idx], gt_box))
            if iou > best_iou:
                best_iou = iou
                best_gt = gt_idx

        if best_gt >= 0 and best_iou >= iou_threshold:
            # True positive: lock the ground-truth box so duplicates cannot reuse it.
            gt_claimed[best_gt] = True
            true_positives += 1
            # Precision at this rank; recall rises by 1 / len(gt_xyxy) here.
            precision_sum += true_positives / rank

    return precision_sum / len(gt_xyxy)

def process_image(image, data, model, id_to_name, thresholds, ap50_scores):
    """
        Runs Florence-2 object detection on a single image, computes AP scores
        across all thresholds for each ground-truth category, and updates the
        AP score dictionary in place.

        Relies on the module-level `processor` (the Florence-2 processor that
        belongs to `model`), which must be defined before this function is called.

        Args:
            image (dict): Image metadata dict from JSON dataset ('id', 'coco_url').
            data (dict): JSON dataset.
            model: Florence-2 model exposing `generate`.
            id_to_name (dict): Category ID to name mapping.
            thresholds (list): List of confidence thresholds to evaluate.
            ap50_scores (defaultdict): Nested dict {category: {threshold: [AP]}}
                that receives one AP value per ground-truth category and threshold.

        Raises:
            requests.HTTPError: If the image download returns an error status.
    """
    # Fail loudly on HTTP errors instead of handing an error page to PIL,
    # and never hang forever on a stalled connection.
    response = requests.get(image['coco_url'], timeout=30)
    response.raise_for_status()
    img = Image.open(BytesIO(response.content)).convert("RGB")

    annotations = get_annotations_by_image(data, image['id'])
    category_to_boxes = group_boxes_by_category(annotations, id_to_name)

    prompt = "<OD>"

    # Florence-2 is a generative model: detections are decoded from generated
    # text, not read from a detection head, so generate() is used rather than
    # a plain forward pass.
    inputs = processor(text=prompt, images=img, return_tensors="pt").to(model.device, torch.float32)
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            num_beams=3,
            do_sample=False,
        )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    # post_process_generation expects image_size as (width, height) and returns
    # {"<OD>": {"bboxes": [[x1, y1, x2, y2], ...], "labels": [...]}}.
    result = processor.post_process_generation(
        generated_text, task=prompt, image_size=(img.width, img.height)
    )[prompt]

    # label -> (scores, boxes). The <OD> task yields no confidence scores, so
    # every detection gets 1.0 and therefore survives every threshold <= 1.0.
    # NOTE(review): labels are free-form text; confirm they line up with the
    # dataset's category names (e.g. spaces vs. underscores).
    detections = {}
    for label, box in zip(result['labels'], result['bboxes']):
        label = re.sub(r'\s*_\s*', '_', label.strip())
        if not label:
            continue
        scores, boxes = detections.setdefault(label, ([], []))
        scores.append(1.0)
        boxes.append(box)

    for category_name, gt_boxes in category_to_boxes.items():
        # Direct lookup instead of scanning every detected label per threshold.
        scores, boxes = detections.get(category_name, ([], []))
        for threshold in thresholds:
            kept = [(score, box) for score, box in zip(scores, boxes) if score >= threshold]
            pred_scores = [score for score, _ in kept]
            pred_boxes = [box for _, box in kept]

            ap = compute_ap(gt_boxes, pred_boxes, pred_scores)
            ap50_scores[category_name][threshold].append(ap)

def evaluate_dataset(data, model, max_images=5):
    """
        Scores a detection model on the first images of the dataset, collecting
        AP@0.5 per category for confidence thresholds 0.0, 0.1, ..., 1.0.

        Args:
            data (dict): JSON dataset with 'images', 'annotations' and 'categories'.
            model: Detection model forwarded unchanged to process_image.
            max_images (int): Number of leading images from data['images'] to evaluate.

        Returns:
            dict: Nested dictionary {category_name: {threshold: [AP scores]}}.
    """
    # Confidence cut-offs, rounded to one decimal so they make clean dict keys.
    thresholds = [round(x, 1) for x in np.arange(0.0, 1.01, 0.1)]
    id_to_name = get_category_mapping(data)

    # Filled in place by process_image, one AP per image/category/threshold.
    ap50_scores = defaultdict(lambda: defaultdict(list))

    selected_images = data['images'][:max_images]
    for image in tqdm(selected_images):
        process_image(image, data, model, id_to_name, thresholds, ap50_scores)

    return ap50_scores

# Load the Florence-2 checkpoint and its matching processor, then evaluate the
# first 50 images of the LVIS minival subset.
florence_checkpoint = "microsoft/Florence-2-large"

model = AutoModelForCausalLM.from_pretrained(
    florence_checkpoint,
    torch_dtype=torch.float32,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(florence_checkpoint, trust_remote_code=True)

ap50_scores = evaluate_dataset(data=lvis_val, model=model, max_images=50)