# 01 — Data Exploration & Coordinate Validation

**Story 1.1** — Load all 10 HDF5 scenes, validate spherical→Cartesian conversion, understand data structure.

**Acceptance criteria:**
- All 10 files load successfully
- Unique poses extracted correctly (~10 frames/scene = ~100 total)
- Spherical→Cartesian gives x_m, y_m, z_m in meters
- Visual spot-check of 3 frames shows correct point clouds

## 0. Setup — Mount Drive & Install Dependencies

In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset and the
# project source from DRIVE_BASE, which lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# If data is still zipped, unzip it
import os

DRIVE_BASE = "/content/drive/MyDrive/airbus_hackathon"
DATA_DIR = f"{DRIVE_BASE}/data"
ZIP_PATH = f"{DRIVE_BASE}/airbus_hackathon_trainingdata.zip"

if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR, exist_ok=True)

# Check if scenes already extracted
if not os.path.exists(os.path.join(DATA_DIR, "scene_1.h5")):
    print("Extracting training data...")
    !unzip -o "{ZIP_PATH}" -d "{DATA_DIR}"
    print("Done.")
else:
    print(f"Data already extracted in {DATA_DIR}")

# List files
print("\nFiles in data dir:")
for f in sorted(os.listdir(DATA_DIR)):
    size_mb = os.path.getsize(os.path.join(DATA_DIR, f)) / 1e6
    print(f"  {f} — {size_mb:.1f} MB")

In [None]:
# Make the project's own modules and the Airbus toolkit importable in Colab.
# Option A: clone the repository
# !git clone <repo_url> /content/airbus_hackathon

# Option B: use the copy stored on Drive
PROJECT_DIR = f"{DRIVE_BASE}/project"

import sys

# Prepend src/ first and the toolkit second, so the toolkit ends up at
# sys.path[0] with src/ directly behind it.
for subdir in ("src", "airbus_hackathon_toolkit"):
    sys.path.insert(0, os.path.join(PROJECT_DIR, subdir))

print("Python path updated:")
print("\n".join(f"  {entry}" for entry in sys.path[:4]))

In [None]:
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
from collections import defaultdict

# Airbus toolkit
from lidar_utils import load_h5_data, get_unique_poses, filter_by_pose, spherical_to_local_cartesian

# Our config
from config import CLASS_COLORS, CLASS_NAMES, NUM_CLASSES, SCENE_FILES

# Reaching this point means the toolkit and config imports above resolved.
# Echo the class mapping and scene list that the following cells iterate over.
print("Imports OK")
print(f"Class mapping: {CLASS_NAMES}")
print(f"Scenes to load: {SCENE_FILES}")

## 1. Load All Scenes & Extract Poses

In [None]:
# Load every scene listed in SCENE_FILES and record frame/point counts for
# each one in scene_stats (one dict per scene that was found on disk).
scene_stats = []

for scene_file in SCENE_FILES:
    path = os.path.join(DATA_DIR, scene_file)

    # A missing file is reported and skipped so the remaining scenes still load.
    if not os.path.exists(path):
        print(f"WARNING: {scene_file} not found!")
        continue

    df = load_h5_data(path)
    # `poses` is treated as one row per frame; its "num_points" column is
    # used as the per-frame point count.
    poses = get_unique_poses(df)

    stats = {
        "scene": scene_file,
        "total_points": len(df),
        "num_frames": len(poses),
        "columns": list(df.columns),
        "points_per_frame_mean": poses["num_points"].mean(),
        "points_per_frame_min": poses["num_points"].min(),
        "points_per_frame_max": poses["num_points"].max(),
    }
    scene_stats.append(stats)

    print(f"{scene_file}: {stats['num_frames']} frames, "
          f"{stats['total_points']:,} points, "
          f"avg {stats['points_per_frame_mean']:,.0f} pts/frame")

print(f"\n--- TOTAL: {sum(s['num_frames'] for s in scene_stats)} frames across {len(scene_stats)} scenes ---")
# scene_stats is empty when every file was missing; indexing [0] would raise.
if scene_stats:
    print(f"Columns: {scene_stats[0]['columns']}")
else:
    print(f"No scene files were found in {DATA_DIR}; check the extraction step.")

In [None]:
# Summary table: one row per loaded scene, with presentation-friendly headers.
column_labels = {
    "scene": "Scene",
    "num_frames": "Frames",
    "total_points": "Total Points",
    "points_per_frame_mean": "Avg Pts/Frame",
    "points_per_frame_min": "Min Pts/Frame",
    "points_per_frame_max": "Max Pts/Frame",
}
# Select the numeric summary columns (dropping "columns"), then relabel them.
stats_df = pd.DataFrame(scene_stats)[list(column_labels)].rename(columns=column_labels)
print(stats_df.to_string(index=False))

grand_total = stats_df["Total Points"].sum()
print(f"\nGrand total: {grand_total:,} points")

## 2. Validate Coordinate Conversion

Load one frame, convert spherical → Cartesian, check the values make sense.

In [None]:
# Load scene_1 and list its unique poses (each pose is treated as one frame).
scene1_path = os.path.join(DATA_DIR, "scene_1.h5")
df_scene1 = load_h5_data(scene1_path)
poses_scene1 = get_unique_poses(df_scene1)

n_frames_scene1 = len(poses_scene1)
print(f"Scene 1: {n_frames_scene1} frames")
print(poses_scene1.head())

In [None]:
# Take the first pose of scene_1 as "frame 0" and look at its raw columns.
first_pose = poses_scene1.iloc[0]
frame_0 = filter_by_pose(df_scene1, first_pose)
print(f"Frame 0: {len(frame_0)} points")
print(f"\nRaw data sample (first 5 rows):")
print(frame_0.head())

# Value range and dtype of every raw measurement / label column
print(f"\n--- Data ranges ---")
inspected_columns = (
    "distance_cm", "azimuth_raw", "elevation_raw", "reflectivity", "r", "g", "b",
)
for col in inspected_columns:
    series = frame_0[col]
    print(f"  {col}: min={series.min()}, max={series.max()}, dtype={series.dtype}")

In [None]:
# Keep only rows with a positive distance reading.
valid_mask = frame_0["distance_cm"] > 0
frame_valid = frame_0[valid_mask].reset_index(drop=True)
n_valid, n_total = len(frame_valid), len(frame_0)
print(f"Valid points: {n_valid} / {n_total} ({n_valid/n_total*100:.1f}%)")

# Convert spherical → Cartesian and report the extent along each axis
xyz_m = spherical_to_local_cartesian(frame_valid)
print(f"\nCartesian shape: {xyz_m.shape}")
for axis_idx, axis_name in enumerate(("x_m", "y_m", "z_m")):
    axis_vals = xyz_m[:, axis_idx]
    print(f"{axis_name} range: [{axis_vals.min():.2f}, {axis_vals.max():.2f}] meters")

# Sanity check: the farthest Cartesian point must lie at max(distance_cm) / 100,
# i.e. the conversion preserved range and produced meters.
max_dist_expected_m = frame_valid["distance_cm"].max() / 100.0
point_ranges_m = np.sqrt((xyz_m**2).sum(axis=1))
max_dist_actual_m = point_ranges_m.max()
print(f"\nMax distance check: expected={max_dist_expected_m:.2f}m, actual={max_dist_actual_m:.2f}m")

distance_error_m = abs(max_dist_expected_m - max_dist_actual_m)
assert distance_error_m < 0.01, "Distance mismatch!"
print("Distance check PASSED")

## 3. Visual Spot-Check — 3 Frames

Top-down (XY), side (XZ), and front (YZ) views to confirm no mirroring or rotation issues.

We use matplotlib since Open3D doesn't render inline in Colab easily.

In [None]:
def plot_frame_2d(df_frame, title="", max_points=50000):
    """Plot a frame in top-down (XY), side (XZ) and front (YZ) views.

    Rows with distance_cm <= 0 are dropped, the remainder is randomly
    subsampled to at most ``max_points`` rows and converted with
    ``spherical_to_local_cartesian``. Every point is drawn in its own
    r/g/b label colour.

    Args:
        df_frame: DataFrame with distance_cm, azimuth_raw, elevation_raw, r, g, b
        title: plot title prefix
        max_points: subsample if too many points

    Returns:
        None. Shows the figure and prints a one-line summary, or prints a
        notice and returns early when the frame has no valid points.
    """
    # Filter valid points
    valid = df_frame[df_frame["distance_cm"] > 0].reset_index(drop=True)

    # Nothing to draw; the .min()/.max() calls below would raise on empty arrays.
    if valid.empty:
        print(f"  {title}: No valid points in this frame")
        return

    # Subsample for plotting speed (unseeded, so the subset differs per call)
    if len(valid) > max_points:
        idx = np.random.choice(len(valid), max_points, replace=False)
        valid = valid.iloc[idx].reset_index(drop=True)

    # Convert to Cartesian
    xyz_m = spherical_to_local_cartesian(valid)
    x_m, y_m, z_m = xyz_m[:, 0], xyz_m[:, 1], xyz_m[:, 2]

    # Build colors from RGB labels (normalized to [0,1])
    colors = np.column_stack([
        valid["r"].values / 255.0,
        valid["g"].values / 255.0,
        valid["b"].values / 255.0,
    ])

    # (horizontal data, vertical data, x label, y label, view name) per panel
    views = (
        (x_m, y_m, "x_m (forward)", "y_m (left)", "Top-down (XY)"),
        (x_m, z_m, "x_m (forward)", "z_m (up)", "Side (XZ)"),
        (y_m, z_m, "y_m (left)", "z_m (up)", "Front (YZ)"),
    )

    fig, axes = plt.subplots(1, len(views), figsize=(20, 6))
    for ax, (horiz, vert, xlabel, ylabel, view_name) in zip(axes, views):
        ax.scatter(horiz, vert, c=colors, s=0.1, alpha=0.5)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title(f"{title} — {view_name}")
        # Equal aspect keeps distances undistorted so mirroring/rotation shows up.
        ax.set_aspect("equal")
        # Thin cross-hair through the sensor origin
        ax.axhline(0, color='gray', lw=0.5)
        ax.axvline(0, color='gray', lw=0.5)

    plt.tight_layout()
    plt.show()

    print(f"  Points plotted: {len(valid)}, x range: [{x_m.min():.1f}, {x_m.max():.1f}]m, "
          f"z range: [{z_m.min():.1f}, {z_m.max():.1f}]m")

In [None]:
# Spot-check the middle frame of three scenes spread across the dataset.
spot_check_scenes = ["scene_1.h5", "scene_5.h5", "scene_10.h5"]

for scene_file in spot_check_scenes:
    path = os.path.join(DATA_DIR, scene_file)
    df = load_h5_data(path)
    poses = get_unique_poses(df)

    # Middle pose of the scene, looked up once and reused below
    mid_idx = len(poses) // 2
    mid_pose = poses.iloc[mid_idx]
    frame = filter_by_pose(df, mid_pose)

    print(f"\n{'='*60}")
    print(f"{scene_file} — Frame {mid_idx} ({len(frame)} points)")
    print(f"Ego: x={mid_pose['ego_x']}, y={mid_pose['ego_y']}, "
          f"z={mid_pose['ego_z']}, yaw={mid_pose['ego_yaw']}")

    plot_frame_2d(frame, title=f"{scene_file} frame {mid_idx}")

## 4. Identify Labeled Points (Classes)

Map RGB values to class IDs and check which classes appear.

In [None]:
def map_rgb_to_class(df_frame, class_colors=None):
    """Map RGB labels to class IDs.

    Args:
        df_frame: DataFrame with r, g, b columns
        class_colors: optional ``{(r, g, b): class_id}`` mapping; when
            omitted, the ``CLASS_COLORS`` mapping imported from config is used

    Returns:
        numpy int64 array with one class ID per row; rows whose colour is
        not a key of the mapping keep ID 0 (background)
    """
    if class_colors is None:
        class_colors = CLASS_COLORS

    # Pull the channel arrays out once instead of once per class colour.
    red = df_frame["r"].values
    green = df_frame["g"].values
    blue = df_frame["b"].values

    class_ids = np.zeros(len(df_frame), dtype=np.int64)
    for (r, g, b), class_id in class_colors.items():
        # A row matches only when all three channels match exactly.
        class_ids[(red == r) & (green == g) & (blue == b)] = class_id

    return class_ids

In [None]:
# Class distribution for the first frame of scene_1, valid returns only.
# NOTE: despite its name, frame_0_valid holds the valid points of the whole
# scene; the single frame is f0, selected via the first pose of that subset.
frame_0_valid = df_scene1[df_scene1["distance_cm"] > 0]
poses_s1 = get_unique_poses(frame_0_valid)
f0 = filter_by_pose(frame_0_valid, poses_s1.iloc[0])

class_ids = map_rgb_to_class(f0)
n_points = len(class_ids)

print("Class distribution (scene_1, frame 0):")
for cid in range(NUM_CLASSES):
    count = (class_ids == cid).sum()
    pct = count / n_points * 100
    label = CLASS_NAMES[cid]
    print(f"  {label:15s} (ID {cid}): {count:>7,} points ({pct:.2f}%)")

In [None]:
# Count every distinct RGB triple in the frame and report the ones that are
# not keys of CLASS_COLORS. This helps identify labeled classes we may be
# missing from the mapping.
from collections import Counter

# .tolist() yields plain Python ints, so the tuples print as e.g. (38, 23, 180)
# instead of NumPy scalar reprs, and a single pass gives the count per colour
# (no per-colour mask over the whole frame).
color_counts = Counter(
    zip(f0["r"].tolist(), f0["g"].tolist(), f0["b"].tolist())
)
rgb_tuples = set(color_counts)
known_colors = set(CLASS_COLORS.keys())
unknown_colors = rgb_tuples - known_colors

print(f"Total unique RGB values: {len(rgb_tuples)}")
print(f"Known class colors: {len(known_colors)}")
print(f"Unknown/background colors: {len(unknown_colors)}")
print(f"\nFirst 10 unknown colors (these are background/unlabeled):")
for color in sorted(unknown_colors)[:10]:
    print(f"  RGB{color}: {color_counts[color]} points")

## 5. Visualize Labeled Objects Only

Plot only the labeled (obstacle) points to see what we're trying to detect.

In [None]:
# Color map for our classes: class ID -> [r, g, b] with components in [0, 1].
# plot_labeled_points looks colours up here by the IDs that map_rgb_to_class
# returns and passes them to matplotlib's scatter(c=...).
CLASS_PLOT_COLORS = {
    0: [0.7, 0.7, 0.7],  # background — gray
    1: [0.15, 0.09, 0.71],  # antenna — blue (from Airbus RGB)
    2: [0.69, 0.52, 0.18],  # cable — brown
    3: [0.51, 0.32, 0.38],  # electric pole — mauve
    4: [0.26, 0.52, 0.04],  # wind turbine — green
}

def plot_labeled_points(df_frame, title=""):
    """Scatter-plot the obstacle points of one frame, coloured by class.

    Rows with distance_cm <= 0 are dropped, the rest are converted to
    Cartesian coordinates and classified with ``map_rgb_to_class``; only
    points with class ID > 0 are drawn, in a top-down (XY) and a side (XZ)
    panel. Prints a notice and returns without plotting when the frame
    contains no such points; otherwise prints per-class point counts.

    Args:
        df_frame: DataFrame with distance_cm, r, g, b and the columns
            ``spherical_to_local_cartesian`` needs
        title: prefix for the panel titles and the printed notice
    """
    valid = df_frame[df_frame["distance_cm"] > 0].reset_index(drop=True)
    xyz_m = spherical_to_local_cartesian(valid)
    class_ids = map_rgb_to_class(valid)

    # Class 0 is background; everything above it is an obstacle.
    is_obstacle = class_ids > 0
    xyz_obs, cls_obs = xyz_m[is_obstacle], class_ids[is_obstacle]

    if len(xyz_obs) == 0:
        print(f"  {title}: No labeled points in this frame")
        return

    # One plot colour per obstacle point
    colors = np.array([CLASS_PLOT_COLORS[c] for c in cls_obs])

    fig, (ax_top, ax_side) = plt.subplots(1, 2, figsize=(16, 6))
    # (axes, column plotted vertically, y label, view name); x_m is always horizontal
    panels = (
        (ax_top, 1, "y_m", "Obstacles Top-down (XY)"),
        (ax_side, 2, "z_m", "Obstacles Side (XZ)"),
    )
    for ax, vertical_col, ylabel, view_name in panels:
        ax.scatter(xyz_obs[:, 0], xyz_obs[:, vertical_col], c=colors, s=1, alpha=0.8)
        ax.set_xlabel("x_m")
        ax.set_ylabel(ylabel)
        ax.set_title(f"{title} — {view_name}")
        ax.set_aspect("equal")

    plt.tight_layout()
    plt.show()

    # Per-class stats, in ascending class-ID order
    n_obs, n_valid = len(xyz_obs), len(valid)
    print(f"  Obstacle points: {n_obs} / {n_valid} ({n_obs/n_valid*100:.2f}%)")
    present_ids, per_class_counts = np.unique(cls_obs, return_counts=True)
    for cid, n in zip(present_ids, per_class_counts):
        print(f"    {CLASS_NAMES[cid]}: {n} points")

In [None]:
# Labeled-obstacle view of the first frame of three scenes.
separator = "=" * 60
for scene_file in ("scene_1.h5", "scene_5.h5", "scene_10.h5"):
    path = os.path.join(DATA_DIR, scene_file)
    df = load_h5_data(path)
    poses = get_unique_poses(df)

    # First pose of the scene is taken as "frame 0"
    first_pose = poses.iloc[0]
    frame = filter_by_pose(df, first_pose)
    print(f"\n{separator}")
    plot_labeled_points(frame, title=f"{scene_file} frame 0")

## 6. Data Ranges & Reflectivity Analysis

In [None]:
# Reflectivity histograms for the first frame of scene_1 (valid returns only):
# left panel for all points, right panel per obstacle class.
frame_valid = df_scene1[df_scene1["distance_cm"] > 0]
f0 = filter_by_pose(frame_valid, get_unique_poses(frame_valid).iloc[0])
reflectivity = f0["reflectivity"]
refl_values = reflectivity.values

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_all, ax_by_class = axes

# All points
ax_all.hist(refl_values, bins=50, color="steelblue", alpha=0.7)
ax_all.set_xlabel("Reflectivity (0-255)")
ax_all.set_ylabel("Count")
ax_all.set_title("Reflectivity Distribution (all points)")

# Per class; background (ID 0) and classes absent from this frame are skipped.
class_ids = map_rgb_to_class(f0)
for cid in range(1, NUM_CLASSES):
    mask = class_ids == cid
    if not mask.any():
        continue
    # density=True normalises each class so small classes remain comparable
    ax_by_class.hist(refl_values[mask], bins=30, alpha=0.5,
                     label=CLASS_NAMES[cid], density=True)

ax_by_class.set_xlabel("Reflectivity (0-255)")
ax_by_class.set_ylabel("Density")
ax_by_class.set_title("Reflectivity by Class")
ax_by_class.legend()

plt.tight_layout()
plt.show()

print(f"Reflectivity range: [{reflectivity.min()}, {reflectivity.max()}]")
print(f"Reflectivity dtype: {reflectivity.dtype}")

## 7. Summary & Validation Checklist

Before moving to Story 1.2, confirm all acceptance criteria are met.

In [None]:
# Print the Story 1.1 acceptance checklist and an overall verdict.
banner = "=" * 60
print(banner)
print("STORY 1.1 — VALIDATION CHECKLIST")
print(banner)

total_scenes = len(scene_stats)
total_frames = sum(s['num_frames'] for s in scene_stats)

# (description, passed) pairs. Only the first two are computed from
# scene_stats; the entries fixed to True record checks confirmed in earlier
# cells or by eye and are not re-evaluated here.
checks = [
    ("All 10 HDF5 files loaded successfully", total_scenes == 10),
    (f"Unique poses extracted: {total_frames} frames", total_frames > 0),
    ("Spherical→Cartesian produces meters", True),  # validated by distance check above
    ("Visual spot-check of 3 frames completed", True),  # manual check from plots
    ("distance_cm > 0 filtering works", True),  # validated above
    ("RGB → class_id mapping works", True),  # validated in section 4
    ("config.py created with CLASS_COLORS, CLASS_NAMES", True),
]

for desc, passed in checks:
    print(f"  [{'PASS' if passed else 'FAIL'}] {desc}")

all_pass = all(passed for _, passed in checks)
if all_pass:
    verdict, story_state = "ALL CHECKS PASSED", "complete"
else:
    verdict, story_state = "SOME CHECKS FAILED", "needs fixes"

print(f"\n{verdict} — "
      f"Story 1.1 {story_state}")
print("\nNext: Story 1.2 — Class Distribution Analysis")