# 02 — Class Distribution Analysis

**Story 1.2** — Compute per-class point counts and per-scene object statistics to calibrate loss weights and DBSCAN parameters.

**Acceptance criteria:**
- Per-class point count table across all frames
- Per-scene breakdown showing which classes appear in which scenes
- Background vs labeled point ratios
- Loss weights calibrated from actual class frequencies
- Validation scene selected (most diverse: highest score of number of obstacle classes present × Shannon entropy of the class shares)

## 0. Setup

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data and project
# paths configured in the next cell live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

# Locations on the mounted Drive: hackathon root, raw data, project checkout.
DRIVE_BASE = "/content/drive/MyDrive/airbus_hackathon"
DATA_DIR = DRIVE_BASE + "/data"
PROJECT_DIR = DRIVE_BASE + "/project"

# Prepend both code directories to the import path in one step. The toolkit
# ends up first and src second, the same order two successive insert(0, ...)
# calls (src, then toolkit) would produce.
sys.path[:0] = [
    os.path.join(PROJECT_DIR, 'airbus_hackathon_toolkit'),
    os.path.join(PROJECT_DIR, 'src'),
]

from lidar_utils import load_h5_data, get_unique_poses, filter_by_pose, spherical_to_local_cartesian
from config import CLASS_COLORS, CLASS_NAMES, NUM_CLASSES, SCENE_FILES

print("Imports OK")

In [None]:
def map_rgb_to_class(df_frame):
    """Return one integer class ID per row of ``df_frame``.

    Each row's (r, g, b) triple is compared against the keys of
    ``CLASS_COLORS``; a row matching a key receives that key's class ID.
    Rows matching no key keep ID 0 (background).
    """
    red = df_frame["r"].values
    green = df_frame["g"].values
    blue = df_frame["b"].values

    labels = np.zeros(len(df_frame), dtype=np.int64)
    for color, class_id in CLASS_COLORS.items():
        r, g, b = color
        hit = (red == r) & (green == g) & (blue == b)
        labels[hit] = class_id
    return labels

## 1. Scan All 998 Frames — Per-Class Point Counts

This takes a few minutes. We process scene by scene, frame by frame.

In [None]:
from tqdm import tqdm

# Accumulates one dict per frame; turned into a DataFrame once the scan is done.
frame_stats = []

for scene_file in SCENE_FILES:
    scene_name = scene_file.replace(".h5", "")
    path = os.path.join(DATA_DIR, scene_file)

    print(f"\nProcessing {scene_file}...")
    df = load_h5_data(path)
    poses = get_unique_poses(df)

    for idx in tqdm(range(len(poses)), desc=scene_name):
        pose = poses.iloc[idx]
        frame = filter_by_pose(df, pose)

        # Keep only the points with a strictly positive distance
        valid = frame[frame["distance_cm"] > 0]

        # Per-point class IDs for the valid points
        class_ids = map_rgb_to_class(valid)

        # Frame identity first, then ego pose, then one count column per class
        row = {"scene": scene_name, "frame_idx": idx, "total_points": len(valid)}
        row.update({key: pose[key] for key in ("ego_x", "ego_y", "ego_z", "ego_yaw")})
        row.update({
            CLASS_NAMES[cid]: int((class_ids == cid).sum())
            for cid in range(NUM_CLASSES)
        })

        frame_stats.append(row)

    # Drop the scene's DataFrame before the next one is loaded
    del df

stats_df = pd.DataFrame(frame_stats)
print(f"\nDone! {len(stats_df)} frames analyzed.")

## 2. Global Class Distribution

In [None]:
# Sum every class column over all frames to get dataset-wide point counts.
class_cols = [CLASS_NAMES[i] for i in range(NUM_CLASSES)]
totals = stats_df[class_cols].sum()
grand_total = totals.sum()

print("=" * 60)
# Report the number of frames actually scanned instead of a hard-coded figure,
# so the header stays correct if the dataset or SCENE_FILES changes.
print(f"GLOBAL CLASS DISTRIBUTION (all {len(stats_df)} frames)")
print("=" * 60)
for cname in class_cols:
    count = totals[cname]
    pct = count / grand_total * 100
    print(f"  {cname:15s}: {count:>12,} points ({pct:>6.3f}%)")
print(f"  {'TOTAL':15s}: {grand_total:>12,} points")

# Obstacle vs background ratio: everything that is not background counts as obstacle
bg = totals["background"]
obs = grand_total - bg
print(f"\nBackground: {bg/grand_total*100:.2f}%")
print(f"Obstacles:  {obs/grand_total*100:.2f}%")
if obs > 0:
    print(f"Ratio bg/obs: {bg/obs:.0f}:1")
else:
    # Avoid a divide-by-zero warning and an "inf:1" ratio when nothing is labeled
    print("Ratio bg/obs: n/a (no obstacle points found)")

In [None]:
# Obstacle-only view: class index 0 (background) is left out for readability.
obstacle_cols = [CLASS_NAMES[i] for i in range(1, NUM_CLASSES)]
obstacle_totals = totals[obstacle_cols]

# Share of each class among obstacle points only
obs_total = obstacle_totals.sum()
obs_pct = obstacle_totals / obs_total * 100

colors = ['#2617B4', '#B18430', '#815161', '#428409']  # Airbus class colors
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_counts, ax_share = axes

# Left panel: absolute counts, labeled just above each bar
ax_counts.bar(obstacle_cols, obstacle_totals.values, color=colors)
ax_counts.set_title("Total Points per Obstacle Class (all frames)")
ax_counts.set_ylabel("Total Points")
for i, v in enumerate(obstacle_totals.values):
    ax_counts.text(i, v + v*0.02, f"{v:,.0f}", ha='center', fontsize=9)

# Right panel: percentage of obstacle points
ax_share.bar(obstacle_cols, obs_pct.values, color=colors)
ax_share.set_title("Class Share Among Obstacles")
ax_share.set_ylabel("% of Obstacle Points")
for i, v in enumerate(obs_pct.values):
    ax_share.text(i, v + 0.5, f"{v:.1f}%", ha='center', fontsize=9)

plt.tight_layout()
plt.show()

## 3. Per-Scene Breakdown — Which Classes Appear Where?

In [None]:
# Obstacle point counts summed over the frames of each scene
scene_totals = stats_df.groupby("scene")[obstacle_cols].sum()

banner = "=" * 80

print(banner)
print("PER-SCENE OBSTACLE POINT COUNTS")
print(banner)
print(scene_totals.to_string())

# Presence matrix: a class is "present" in a scene when it has at least one point
has_points = scene_totals > 0
presence = has_points.replace({True: "X", False: "-"})

print("\n" + banner)
print("CLASS PRESENCE PER SCENE (X = present)")
print(banner)
print(presence.to_string())

In [None]:
# Convert per-scene counts to row percentages (share of that scene's obstacle
# points). Scenes with no obstacle points yield NaN, which is replaced by 0.
scene_obs_total = scene_totals.sum(axis=1)
scene_pct = (scene_totals.div(scene_obs_total, axis=0) * 100).fillna(0)

# Heatmap: one row per scene, one column per obstacle class
fig, ax = plt.subplots(figsize=(10, 8))
im = ax.imshow(scene_pct.values, cmap='YlOrRd', aspect='auto')
ax.set_title("Class Distribution per Scene (% of obstacle points)")
ax.set_xticks(range(len(obstacle_cols)))
ax.set_xticklabels(obstacle_cols, rotation=45, ha='right')
ax.set_yticks(range(len(scene_pct)))
ax.set_yticklabels(scene_pct.index)

# Annotate every cell with its percentage and raw count; cells above 50% get
# white text, the others black.
pct_grid = scene_pct.values
count_grid = scene_totals.values
for (i, j), val in np.ndenumerate(pct_grid):
    count = count_grid[i, j]
    text_color = 'white' if val > 50 else 'black'
    ax.text(j, i, f"{val:.0f}%\n({count:,.0f})",
            ha='center', va='center', fontsize=8, color=text_color)

plt.colorbar(im, label="% of obstacle points")
plt.tight_layout()
plt.show()

## 4. Per-Frame Analysis — Frames With/Without Obstacles

In [None]:
# Flag frames containing at least one obstacle point of any class
obstacle_pts_per_frame = stats_df[obstacle_cols].sum(axis=1)
stats_df["has_obstacles"] = obstacle_pts_per_frame > 0

print("=" * 60)
print("FRAME-LEVEL CLASS PRESENCE")
print("=" * 60)

# Per class: number of frames it appears in, and the mean point count over those frames
for cname in obstacle_cols:
    present = stats_df[cname] > 0
    n_frames = present.sum()
    pct = n_frames / len(stats_df) * 100
    if n_frames > 0:
        avg_pts = stats_df[present][cname].mean()
    else:
        avg_pts = 0
    print(f"  {cname:15s}: present in {n_frames:>4d}/{len(stats_df)} frames ({pct:.1f}%), "
          f"avg {avg_pts:,.0f} pts when present")

n_with = stats_df["has_obstacles"].sum()
n_without = len(stats_df) - n_with
print(f"\nFrames WITH obstacles:    {n_with} ({n_with/len(stats_df)*100:.1f}%)")
print(f"Frames WITHOUT obstacles: {n_without} ({n_without/len(stats_df)*100:.1f}%)")

In [None]:
# Distribution of obstacle point counts per frame (when obstacles present).
# Take an explicit copy: adding a column to a boolean-filtered slice of
# stats_df triggers pandas' SettingWithCopyWarning and may not behave as intended.
frames_with_obs = stats_df[stats_df["has_obstacles"]].copy()
frames_with_obs["total_obstacle_pts"] = frames_with_obs[obstacle_cols].sum(axis=1)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram of total obstacle points per frame, with the median marked
median_pts = frames_with_obs["total_obstacle_pts"].median()
axes[0].hist(frames_with_obs["total_obstacle_pts"], bins=50, color="steelblue", alpha=0.7)
axes[0].set_xlabel("Obstacle Points per Frame")
axes[0].set_ylabel("Number of Frames")
axes[0].set_title("Distribution of Obstacle Points per Frame")
axes[0].axvline(median_pts, color='red', linestyle='--', label=f"Median: {median_pts:.0f}")
axes[0].legend()

# Per-class box plot: only frames where the class is present, and only
# classes that appear in at least one frame
box_data = []
box_labels = []
for cname in obstacle_cols:
    vals = stats_df.loc[stats_df[cname] > 0, cname].values
    if len(vals) > 0:
        box_data.append(vals)
        box_labels.append(cname)

if box_data:
    axes[1].boxplot(box_data)
    # Label the boxes via set_xticklabels rather than boxplot's `labels`
    # keyword, which is deprecated (renamed `tick_labels`) in Matplotlib 3.9;
    # this form works on both older and newer versions.
    axes[1].set_xticklabels(box_labels)
    axes[1].set_ylabel("Points per Frame")
    axes[1].set_title("Points per Frame by Class (when present)")

plt.tight_layout()
plt.show()

## 5. Scene Diversity Analysis — Select Validation Scene

In [None]:
# Rank scenes by a composite diversity score built from:
# - how many obstacle classes have at least one point in the scene
# - how evenly the obstacle points are spread over those classes (Shannon entropy)
# The total obstacle point count is reported alongside for context.

print("=" * 60)
print("SCENE DIVERSITY ANALYSIS")
print("=" * 60)

diversity_scores = []
for scene, row in scene_totals.iterrows():
    n_classes = (row > 0).sum()
    total_obs = row.sum()

    # Entropy over the classes that are present (higher = more balanced);
    # a scene without any obstacle point scores 0.
    entropy = 0
    if total_obs > 0:
        probs = row[row > 0].values / total_obs
        entropy = -np.sum(probs * np.log(probs + 1e-10))

    diversity_scores.append(dict(
        scene=scene,
        n_classes=n_classes,
        total_obstacle_pts=total_obs,
        entropy=entropy,
        diversity_score=n_classes * entropy,  # composite score
    ))

div_df = pd.DataFrame(diversity_scores).sort_values("diversity_score", ascending=False)
print(div_df.to_string(index=False))

# The top-ranked scene is the recommended validation scene
top_scene = div_df.iloc[0]
best_val_scene = top_scene["scene"]
print(f"\n>>> RECOMMENDED VALIDATION SCENE: {best_val_scene} <<<")
print(f"    (highest diversity score = {top_scene['diversity_score']:.3f})")

## 6. Calibrate Loss Weights

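Compute inverse-frequency weights from the measured class counts: $w_i = N / (K \cdot n_i)$, where $n_i$ is the total point count of class $i$, $N = \sum_i n_i$ is the total number of points, and $K$ is the number of classes. All weights are then rescaled by a common factor so that the background weight equals 0.1.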

In [None]:
# Dataset-wide point count per class, and the overall total
total_per_class = stats_df[class_cols].sum()
total_all = total_per_class.sum()

print("=" * 60)
print("LOSS WEIGHT CALIBRATION")
print("=" * 60)

# Inverse frequency weighting
# weight_i = total_all / (NUM_CLASSES * count_i)
# Then normalize so background = 0.1 (we don't want to over-penalize bg errors)

raw_weights = {}
for cname in class_cols:
    count = total_per_class[cname]
    if count > 0:
        raw_weights[cname] = total_all / (NUM_CLASSES * count)
    else:
        raw_weights[cname] = 1.0  # default if class never appears

# Normalize: rescale every weight by the factor that maps background to bg_weight
bg_weight = 0.1
scale_factor = bg_weight / raw_weights["background"]

calibrated_weights = {cname: raw_weights[cname] * scale_factor for cname in class_cols}

print("\nRaw inverse-frequency weights:")
for cname in class_cols:
    freq = total_per_class[cname] / total_all * 100
    print(f"  {cname:15s}: freq={freq:>7.3f}%, raw_weight={raw_weights[cname]:.4f}")

print("\nCalibrated weights (background pinned to 0.1):")
weight_list = []
for cname in class_cols:
    w = calibrated_weights[cname]
    # Cast to a built-in float before rounding: the weights are NumPy scalars,
    # and with NumPy >= 2 a list of them prints as [np.float64(0.1), ...],
    # which cannot be pasted into config.py as-is.
    weight_list.append(round(float(w), 2))
    print(f"  {cname:15s}: {w:.2f}")

print(f"\n>>> COPY THIS TO config.py: <<<")
print(f'"class_weights": {weight_list},  # [bg, antenna, cable, pole, turbine]')

In [None]:
# Bar chart of the calibrated loss weights, one bar per class
bar_colors = ['gray', '#2617B4', '#B18430', '#815161', '#428409']

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(class_cols, weight_list, color=bar_colors)
ax.set_title("Calibrated Class Weights for Cross-Entropy Loss")
ax.set_ylabel("Loss Weight")

# Print each weight slightly above the horizontal center of its bar
for bar, w in zip(bars, weight_list):
    x_center = bar.get_x() + bar.get_width()/2
    y_label = bar.get_height() + 0.1
    ax.text(x_center, y_label, f"{w:.2f}", ha='center', fontsize=10)

plt.tight_layout()
plt.show()

## 7. Summary & Config Updates

In [None]:
# Final report: dataset size, per-class totals, the config values to copy into
# src/config.py, and a checklist of the story's acceptance criteria.
print("=" * 60)
print("STORY 1.2 — SUMMARY")
print("=" * 60)

print(f"\nDataset: {len(stats_df)} frames across {stats_df['scene'].nunique()} scenes")
print(f"Total points: {total_all:,.0f}")
print(f"Background ratio: {total_per_class['background']/total_all*100:.2f}%")

print(f"\nObstacle class summary:")
for cname in obstacle_cols:
    n_frames = (stats_df[cname] > 0).sum()
    total = total_per_class[cname]
    print(f"  {cname:15s}: {total:>10,} pts across {n_frames:>4d} frames")

print(f"\n--- CONFIG UPDATES NEEDED ---")
print(f'val_scene: "{best_val_scene}"')
print(f'class_weights: {weight_list}')
print(f"\nUpdate these values in src/config.py before training!")

print(f"\n--- VALIDATION CHECKLIST ---")
# Derive each check from the computed results instead of hard-coding True,
# so the checklist can actually report FAIL (e.g. after an empty scan).
checks = [
    ("Per-class point count table computed", bool(len(stats_df) > 0 and total_all > 0)),
    ("Per-scene breakdown with class presence", not scene_totals.empty),
    ("Background vs labeled ratio computed", bool(total_all > 0)),
    ("Loss weights calibrated from data", len(weight_list) == NUM_CLASSES),
    (f"Validation scene selected: {best_val_scene}", bool(best_val_scene)),
]
for desc, passed in checks:
    print(f"  [{'PASS' if passed else 'FAIL'}] {desc}")

# Only declare the story complete when every acceptance check passed
if all(passed for _, passed in checks):
    print(f"\nStory 1.2 COMPLETE. Next: Story 1.3 — GT Bounding Box Reconstruction")
else:
    print(f"\nStory 1.2 INCOMPLETE. Fix the failed checks above before moving on.")

In [None]:
# Persist the per-frame statistics as CSV under <DRIVE_BASE>/outputs,
# creating that directory first when it does not exist yet.
output_dir = os.path.join(DRIVE_BASE, "outputs")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "frame_stats.csv")
stats_df.to_csv(output_path, index=False)
print(f"Frame stats saved to {output_path}")