# KITTI Dataset EDA — Real-Time Object Detection for AV

Exploratory analysis of the KITTI Object Detection benchmark:
- Class distribution
- Bounding box size statistics
- Occlusion / truncation analysis
- Depth distribution from 3D labels
- Augmentation previews

In [None]:
import os, sys
sys.path.insert(0, '..')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import cv2
from pathlib import Path
from collections import Counter

from data.kitti_loader import KITTIObject, KITTI_CLASSES

# Notebook-wide plotting defaults.
# The seaborn theme is applied FIRST: set_theme() re-applies seaborn's
# plotting context, which rewrites font-related rcParams (including
# 'font.size'). Setting the overrides afterwards keeps them in effect.
sns.set_theme(style='whitegrid')
plt.rcParams.update({'figure.dpi': 120, 'font.size': 11})
print('Setup complete')

## 1. Load KITTI Labels

In [None]:
# Parse every KITTI label file into one flat DataFrame (one row per object).
KITTI_ROOT = Path('../data/kitti')  # adjust if needed
label_dir  = KITTI_ROOT / 'label_2'

# Explicit column list so the DataFrame keeps its schema even when no label
# could be loaded (otherwise `df.is_valid` below raises AttributeError).
RECORD_COLUMNS = ['class', 'truncated', 'occluded', 'width', 'height',
                  'area', 'depth', 'alpha', 'is_valid']

if not label_dir.is_dir():
    print(f'WARNING: label directory not found: {label_dir}')

records = []
skipped = []  # (file name, line number, error) for each line that failed to parse
for label_file in sorted(label_dir.glob('*.txt')):
    with open(label_file) as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = KITTIObject(line)
                # Objects without a class id are not part of the analysis.
                if obj.class_id is None:
                    continue
                x1, y1, x2, y2 = obj.bbox_2d
                records.append({
                    'class':      obj.type,
                    'truncated':  obj.truncated,
                    'occluded':   obj.occluded,
                    'width':      x2 - x1,
                    'height':     y2 - y1,
                    'area':       (x2 - x1) * (y2 - y1),
                    'depth':      obj.depth,
                    'alpha':      obj.alpha,
                    'is_valid':   obj.is_valid
                })
            except Exception as exc:
                # Loading stays best-effort (one bad line must not abort the
                # EDA), but failures are recorded instead of silently dropped
                # so a broken parser or corrupt file is visible.
                skipped.append((label_file.name, line_no, exc))

df = pd.DataFrame(records, columns=RECORD_COLUMNS)
print(f'Total objects: {len(df)}')
print(f'Valid objects: {df.is_valid.sum()}')
if skipped:
    print(f'Skipped {len(skipped)} unparseable label line(s); first few:')
    for file_name, line_no, exc in skipped[:5]:
        print(f'  {file_name}:{line_no}: {type(exc).__name__}: {exc}')
df.head()

## 2. Class Distribution

In [None]:
# Side-by-side bar charts of object counts per class: all labelled objects
# (left) versus only those flagged `is_valid` (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# All objects
counts = df['class'].value_counts()
colors = sns.color_palette('Set2', len(counts))
# Bind each colour to a class NAME. The valid-only ranking can differ in order
# and length from the all-objects ranking, so reusing `colors` positionally
# would paint the same class differently in the two panels.
class_colors = dict(zip(counts.index, colors))
axes[0].bar(counts.index, counts.values, color=colors)
axes[0].set_title('Class Distribution — All Objects', fontweight='bold')
axes[0].set_xlabel('Class')
axes[0].set_ylabel('Count')
axes[0].tick_params(axis='x', rotation=30)
# Annotate each bar with its count, slightly above the bar top.
for i, v in enumerate(counts.values):
    axes[0].text(i, v + 20, str(v), ha='center', fontsize=9)

# Valid only (a subset of df, so every class here has an entry in class_colors)
valid_counts = df[df.is_valid]['class'].value_counts()
axes[1].bar(valid_counts.index, valid_counts.values,
            color=[class_colors[name] for name in valid_counts.index])
axes[1].set_title('Class Distribution — Valid Objects Only', fontweight='bold')
axes[1].set_xlabel('Class')
axes[1].set_ylabel('Count')
axes[1].tick_params(axis='x', rotation=30)

plt.tight_layout()
plt.savefig('../docs/class_distribution.png', bbox_inches='tight')
plt.show()

# Denominator is floored at 1 so a dataset without pedestrians does not
# divide by zero.
n_car = valid_counts.get('Car', 0)
n_ped = max(valid_counts.get('Pedestrian', 1), 1)
print('Class imbalance ratio (Car/Pedestrian):', round(n_car / n_ped, 1))

## 3. Bounding Box Size Analysis

In [None]:
# Work on a private copy of the valid objects; later cells reuse `valid`.
valid = df[df.is_valid].copy()

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Panels 0 and 1: per-class density histograms of box width and box height.
# Both panels share the same recipe, so they are driven from one small table.
class_names = valid['class'].unique()
hist_panels = [
    (axes[0], 'width',  'BBox Width Distribution',  'Width (px)'),
    (axes[1], 'height', 'BBox Height Distribution', 'Height (px)'),
]
for ax, column, title, xlabel in hist_panels:
    for name in class_names:
        values = valid.loc[valid['class'] == name, column]
        ax.hist(values, bins=40, alpha=0.5, label=name, density=True)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.legend(fontsize=7)

# Panel 2: width-vs-height scatter for the three main classes.
# `palette` is also reused by the depth analysis cell.
scatter_cls = ['Car', 'Pedestrian', 'Cyclist']
palette = {'Car': '#1f77b4', 'Pedestrian': '#d62728', 'Cyclist': '#ff7f0e'}
ax_scatter = axes[2]
for name in scatter_cls:
    rows = valid[valid['class'] == name]
    ax_scatter.scatter(rows['width'], rows['height'], alpha=0.15, s=5,
                       label=name, color=palette[name])
ax_scatter.set_title('Width vs Height (Car/Ped/Cyclist)', fontweight='bold')
ax_scatter.set_xlabel('Width (px)')
ax_scatter.set_ylabel('Height (px)')
ax_scatter.legend()

plt.tight_layout()
plt.savefig('../docs/bbox_analysis.png', bbox_inches='tight')
plt.show()

## 4. Depth Distribution

In [None]:
# Depth analysis: per-class depth histograms with safety-zone markers (left)
# and depth against log box area (right). Relies on `valid` and `palette`
# from the bounding-box cell.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_hist, ax_scatter = axes

# (legend/report label, distance threshold, marker colour) — shared by the
# vertical marker lines and the printed statistics below.
safety_zones = [
    ('CRITICAL (<5m)', 5,  'r'),
    ('WARNING (<15m)', 15, 'orange'),
    ('CAUTION (<30m)', 30, 'gold'),
]

# Depth histogram per class; depth is clipped to [0, 80] for plotting only.
for name in ('Car', 'Pedestrian', 'Cyclist'):
    depths = valid.loc[valid['class'] == name, 'depth'].clip(0, 80)
    ax_hist.hist(depths, bins=50, alpha=0.55, label=name, density=True)
for zone_label, limit, line_color in safety_zones:
    ax_hist.axvline(limit, color=line_color, ls='--', lw=1.2, label=zone_label)
ax_hist.set_title('Depth Distribution by Class', fontweight='bold')
ax_hist.set_xlabel('Distance from Camera (m)')
ax_hist.legend(fontsize=8)

# Depth vs BBox area (log1p-compressed)
for name, point_color in palette.items():
    rows = valid[valid['class'] == name]
    ax_scatter.scatter(rows['depth'].clip(0, 80), np.log1p(rows['area']),
                       alpha=0.1, s=5, color=point_color, label=name)
ax_scatter.set_title('Depth vs log(BBox Area)', fontweight='bold')
ax_scatter.set_xlabel('Depth (m)')
ax_scatter.set_ylabel('log(BBox Area)')
ax_scatter.legend()

plt.tight_layout()
plt.savefig('../docs/depth_analysis.png', bbox_inches='tight')
plt.show()

# Safety zone stats: cumulative share of valid objects closer than each
# threshold, computed on the unclipped depth.
total_valid = len(valid)
for zone_label, limit, line_color in safety_zones:
    within = (valid['depth'] < limit).sum()
    print(f'{zone_label}: {within} objects ({100*within/total_valid:.1f}%)')

## 5. Occlusion & Truncation Analysis

In [None]:
# Bar charts of the occlusion level (left) and the binned truncation
# fraction (right) over all labelled objects.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

occ_labels = {0: 'Fully Visible', 1: 'Partly Occluded', 2: 'Largely Occluded', 3: 'Unknown'}
occ_counts = df['occluded'].value_counts().sort_index()
# Unmapped codes fall back to their string form so the categorical x-axis
# never mixes strings with numbers.
axes[0].bar([occ_labels.get(i, str(i)) for i in occ_counts.index],
            occ_counts.values, color=sns.color_palette('Blues_r', 4))
axes[0].set_title('Occlusion Level Distribution', fontweight='bold')
axes[0].tick_params(axis='x', rotation=20)

# pd.cut builds right-closed intervals and, by default, leaves the lowest edge
# open: (0, 0.15]. Without include_lowest=True every object with
# truncated == 0.0 would become NaN and vanish from the 'None' bar.
trunc_bins = pd.cut(df['truncated'], bins=[0, 0.15, 0.4, 0.8, 1.0],
                    labels=['None', 'Low', 'Medium', 'High'],
                    include_lowest=True)
trunc_counts = trunc_bins.value_counts().sort_index()
axes[1].bar(trunc_counts.index, trunc_counts.values,
            color=sns.color_palette('Oranges_r', 4))
axes[1].set_title('Truncation Level Distribution', fontweight='bold')

plt.tight_layout()
plt.savefig('../docs/occlusion_analysis.png', bbox_inches='tight')
plt.show()

## 6. Summary Statistics Table

In [None]:
# Per-class summary of the valid objects: object count, mean box size and
# depth statistics, rounded to one decimal and ordered by count (largest first).
agg_spec = {
    'count':        ('class', 'count'),
    'avg_width':    ('width', 'mean'),
    'avg_height':   ('height', 'mean'),
    'avg_depth':    ('depth', 'mean'),
    'median_depth': ('depth', 'median'),
    'min_depth':    ('depth', 'min'),
    'max_depth':    ('depth', 'max'),
}
summary = (
    valid.groupby('class')
         .agg(**agg_spec)
         .round(1)
         .sort_values('count', ascending=False)
)

print('\n=== KITTI Dataset Summary (Valid Objects) ===')
print(summary.to_string())
summary  # last expression: rendered as a rich table by the notebook