# DOTA-VLM Tutorial
## Automated Annotation of Aerial Imagery Using Vision-Language Models

This notebook demonstrates how to use the DOTA-VLM pipeline to:
1. Detect objects in aerial images
2. Crop detected objects
3. Generate rich annotations using Vision-Language Models
4. Merge and export to COCO format

## Setup

In [None]:
import sys
import json
import torch
from pathlib import Path
import matplotlib.pyplot as plt
from PIL import Image

# Make the parent of the current working directory importable so the
# `detection`, `tools` and `vlm` imports in the later cells resolve.
# The membership check keeps a re-run of this cell from appending the same
# entry to sys.path again.
project_root = '..'
if project_root not in sys.path:
    sys.path.append(project_root)

# Report the runtime environment before any model is loaded.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## Step 1: Object Detection

In [None]:
from detection.run_detector import YOLOOBBDetector

# Build the detector from the checkpoint on disk. The 0.3 value is handed to
# the detector as `conf_threshold` (presumably the minimum score it keeps;
# confirm against YOLOOBBDetector).
detector = YOLOOBBDetector(model_path='../checkpoints/yolo_obb.pt', conf_threshold=0.3)

# Detect objects in one sample image; `image_path` and `detections` are
# reused by the cells that follow.
image_path = '../data/DOTA/images/sample.png'
detections = detector.detect(image_path)

# Summarize: total count, then class name and score of the first five entries.
print(f"Detected {len(detections)} objects")
first_five = detections[:5]
for det in first_five:
    print(f"  {det['class_name']}: {det['score']:.3f}")

## Step 2: Visualize Detections

In [None]:
from tools.visualize import visualize_detections

# Draw the detections on the source image, with labels and per-class colors.
viz_image = visualize_detections(
    image_path=image_path, detections=detections, draw_labels=True, color_by_class=True
)

# Reverse the last axis before plotting (BGR to RGB).
rgb_image = viz_image[:, :, ::-1]

# Show the result as a single large figure without axis decorations.
plt.figure(figsize=(15, 10))
plt.imshow(rgb_image)
plt.title(f'Detections ({len(detections)} objects)')
plt.axis('off')
plt.show()

## Step 3: Crop Objects

In [None]:
from tools.crop_objects import crop_detections

# Directory handed to crop_detections as `output_dir`. The preview below
# resolves each metadata `crop_path` entry relative to this same directory.
crops_dir = '../crops'

# Cut one patch per detection out of the source image (10 is passed as
# `padding`) and collect the per-crop metadata records.
crops_metadata = crop_detections(
    image_path=image_path,
    detections=detections,
    output_dir=crops_dir,
    image_id='sample_001',
    padding=10
)

print(f"Created {len(crops_metadata)} crops")

# Preview up to the first ten crops on a 2x5 grid.
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
axes = axes.flatten()

for ax, crop_info in zip(axes, crops_metadata[:10]):
    crop_path = Path(crops_dir) / crop_info['crop_path']
    if not crop_path.exists():
        # Leave the panel blank when the crop file is not on disk.
        continue
    # The context manager closes the image file once it has been drawn.
    with Image.open(crop_path) as img:
        ax.imshow(img)
    ax.set_title(f"{crop_info['class_name']}\n{crop_info['score']:.2f}", fontsize=8)

# Hide the frame and ticks on every panel. zip() stops at the shorter
# sequence, so with fewer than ten crops the unused panels are never visited
# by the loop above and would otherwise show empty axes.
for ax in axes:
    ax.axis('off')

plt.tight_layout()
plt.show()

## Step 4: Generate VLM Annotations

In [None]:
from vlm.generate_annotations import LLaVAAnnotator, PromptTemplates

# Load the vision-language annotator by its model name.
vlm = LLaVAAnnotator(model_name='llava-hf/llava-1.5-7b-hf')

# Take the first crop as the worked example and resolve its file on disk.
sample_crop = crops_metadata[0]
crop_path = Path('../crops') / sample_crop['crop_path']

# Ask the model to describe the crop using the object-attributes prompt.
prompt = PromptTemplates.object_attributes()
attributes = vlm.generate_caption(str(crop_path), prompt)

# Report the detector's view of the object, then the model's description.
print(f"\nObject: {sample_crop['class_name']}")
print(f"Detection Score: {sample_crop['score']:.3f}")
print(f"\nVLM Attributes:\n{attributes}")

# Show the crop that was just annotated.
plt.figure(figsize=(6, 6))
plt.imshow(Image.open(crop_path))
plt.axis('off')
plt.title(f"{sample_crop['class_name']} - VLM Annotated")
plt.show()

## Step 5: Batch Annotation

In [None]:
from vlm.generate_annotations import generate_annotations_for_crop
from tqdm.notebook import tqdm

# Run the annotator over the crops. Only the first 20 are processed to keep
# the demo short.
all_annotations = []
demo_crops = crops_metadata[:20]

for crop_info in tqdm(demo_crops, desc="Annotating"):
    crop_path = Path('../crops') / crop_info['crop_path']

    # Only crops whose file is actually on disk are annotated; the rest are
    # skipped without a message.
    if crop_path.exists():
        annotations = generate_annotations_for_crop(
            vlm=vlm,
            crop_path=str(crop_path),
            crop_metadata=crop_info,
            annotation_types=['attributes', 'verification'],
        )
        all_annotations.append(annotations)

print(f"\nGenerated {len(all_annotations)} annotations")

# Print the first three results, with the long text fields cut to a fixed
# width (150 and 100 characters).
print("\nSample Annotations:")
print("=" * 60)
for ann in all_annotations[:3]:
    print(f"\nClass: {ann['class_name']}")
    print(f"Attributes: {ann['attributes'][:150]}...")
    print(f"Verification: {ann['class_verification'][:100]}...")

## Step 6: Merge and Export

In [None]:
from tools.merge_annotations import COCOAnnotationBuilder

# Builder that collects categories, images and annotations; build() below
# returns the final dictionary that is written out as JSON.
builder = COCOAnnotationBuilder()

# Register the detected class names in sorted order. Iterating a bare set of
# strings has no stable order between interpreter runs, so an unsorted list
# could register the categories in a different order on every run.
categories = sorted({det['class_name'] for det in detections})
builder.add_categories(categories)

# Register the image with its real file name and pixel dimensions, read from
# the file itself instead of assuming a 1024x1024 `sample.png`.
with Image.open(image_path) as source_image:
    image_width, image_height = source_image.size
image_id = builder.add_image(
    image_id=0,
    filename=Path(image_path).name,
    width=image_width,
    height=image_height
)

# Index the VLM results by object id once, rather than rescanning the whole
# list for every detection. setdefault keeps the first entry seen for an id,
# which matches a first-match search.
# NOTE(review): assumes `object_id` values are hashable and correspond to the
# detection's position in `detections` (confirm against the crop metadata).
vlm_by_object_id = {}
for ann in all_annotations:
    vlm_by_object_id.setdefault(ann['object_id'], ann)

# One annotation per detection. Detections without a VLM result (for example
# those outside the annotated demo subset) get vlm_metadata=None.
for idx, det in enumerate(detections):
    builder.add_annotation(
        ann_id=idx,
        image_id=image_id,
        category_name=det['class_name'],
        bbox=det['bbox'],
        detection_score=det['score'],
        vlm_metadata=vlm_by_object_id.get(idx)
    )

# Build final JSON
coco_data = builder.build()

# Save. open() does not create missing parent directories, so make sure the
# output folder exists first.
output_path = '../outputs/dota_vlm_sample.json'
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(coco_data, f, indent=2)

print(f"✓ Saved DOTA-VLM annotations to {output_path}")
print(f"  Images: {len(coco_data['images'])}")
print(f"  Annotations: {len(coco_data['annotations'])}")
print(f"  Categories: {len(coco_data['categories'])}")

## Step 7: Analyze Results

In [None]:
import pandas as pd

# Map category id -> category name once, instead of rescanning the category
# list for every annotation.
category_names = {c['id']: c['name'] for c in coco_data['categories']}

# Flatten each annotation into one row for the analysis table.
df_data = []
for ann in coco_data['annotations']:
    # Treat both a missing key and a stored None as "no VLM metadata". A bare
    # `'vlm_metadata' in ann` test would count a None value as present and
    # then fail on the .get() call below.
    vlm_metadata = ann.get('vlm_metadata')
    has_vlm = vlm_metadata is not None

    row = {
        'category': category_names[ann['category_id']],
        'detection_score': ann['detection_score'],
        'has_vlm': has_vlm,
        'area': ann['area']
    }

    # The attributes column is only filled for rows that have VLM metadata.
    if has_vlm:
        row['attributes'] = vlm_metadata.get('attributes', '')

    df_data.append(row)

df = pd.DataFrame(df_data)

# Display statistics
print("Category Distribution:")
print(df['category'].value_counts())
print("\nVLM Coverage:")
print(df['has_vlm'].value_counts())
print("\nAverage Detection Score by Category:")
print(df.groupby('category')['detection_score'].mean().sort_values(ascending=False))

## Conclusion

You've successfully:
1. ✅ Detected objects with oriented bounding boxes
2. ✅ Cropped object patches
3. ✅ Generated rich VLM annotations
4. ✅ Exported to COCO format

Next steps:
- Scale to full DOTA dataset
- Experiment with different VLM models
- Fine-tune prompts for better annotations
- Train downstream models with enriched annotations