# Data Ingest and Benchmark (Course 3)

This notebook demonstrates GPU-accelerated telemetry loading with cuDF and Unified Virtual Memory (UVM), and benchmarks pandas against cuDF on the same data. It is inspired by [Speed Up Data Analytics on GPUs](https://developers.google.com/learn/pathways/speed-up-data-analytics-GPUs).

## Generate synthetic telemetry

In [None]:
import sys
from pathlib import Path

# Put the parent of the working directory first on the import path so that
# project packages such as `data` can be imported from this notebook.
sys.path.insert(0, str(Path("..").resolve()))

from data.synthetic.generate_telemetry import (
    generate_telemetry,
    _ensure_full_schema,
)

# Generate the synthetic frame and pass it straight through the schema helper.
df = _ensure_full_schema(
    generate_telemetry(n_rows=500_000, vehicle_count=5, duration_hours=24)
)

# Persist the frame as Parquet, creating the target directory if it is missing;
# `parquet_path` is reused by the cells that follow.
out_dir = Path("../data/synthetic")
out_dir.mkdir(parents=True, exist_ok=True)
parquet_path = out_dir.joinpath("fleet_telemetry.parquet")
df.to_parquet(parquet_path, index=False)
print(f"Generated {len(df)} rows -> {parquet_path}")

## Load with cuDF and UVM spill (Course 3 pattern)

In [None]:
from src.ingest.cudf_loader import load_telemetry

# Read the Parquet file back through the project loader, asking for the cuDF
# path with spilling turned on (flag semantics are defined by load_telemetry).
df = load_telemetry(
    str(parquet_path),
    spill=True,
    use_cudf=True,
)

# Show the first rows followed by the frame's dimensions.
print(df.head(), f"Shape: {df.shape}", sep="\n")

## Benchmark: pandas vs cuDF

In [None]:
from src.ingest.benchmark_loader import (
    run_benchmark,
    benchmark_to_dataframe,
)

# Benchmark the four listed operations against the generated Parquet file,
# then convert the raw results into a tabular frame for printing and plotting.
results = run_benchmark(
    str(parquet_path),
    operations=["load", "groupby", "filter", "sort"],
)
bm_df = benchmark_to_dataframe(results)
print(bm_df)

In [None]:
import matplotlib.pyplot as plt

# Reshape the benchmark table so each operation is a row and each backend a
# column: one grid for the `time_s` values and one for the `memory_mb` values.
pivot_time = bm_df.pivot(index="operation", columns="backend", values="time_s")
pivot_mem = bm_df.pivot(index="operation", columns="backend", values="memory_mb")

# Side-by-side panels: time on the left, memory on the right.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))

pivot_time.plot(kind="bar", ax=ax[0], rot=45)
# Title each panel so the figure is readable without the surrounding cells.
ax[0].set_title("Time per operation")
ax[0].set_ylabel("Time (s)")

pivot_mem.plot(kind="bar", ax=ax[1], rot=45)
ax[1].set_title("Memory per operation")
ax[1].set_ylabel("Memory (MB)")

# Keep the rotated tick labels and titles from overlapping between panels.
plt.tight_layout()
plt.show()