In [None]:
from pathlib import Path
import json

# Resolve the project root. A package.json in the working directory means the
# notebook was started from the repo root; otherwise the working directory is
# taken to be data-pipeline/notebooks and the root is two levels up.
cwd = Path.cwd()
if (cwd / "package.json").exists():
    project_root = cwd
else:
    project_root = cwd.parent.parent

# Input and output locations used by the rest of the notebook.
raw_dir = project_root.joinpath("raw-data")
prepared_dir = project_root.joinpath("prepared-data")

for label, location in (
    ("Project root:", project_root),
    ("Raw data:", raw_dir),
    ("Prepared data:", prepared_dir),
):
    print(label, location)

In [None]:
# Load all raw trip files (raw-data/YYYY/MM.json).
# Each file is either a JSON array of trip objects, or a JSON object holding
# that array under a "data" key (checked first) or a "trips" key.
# Each trip has: started_at, ended_at, duration (seconds), start/end station fields.
#
# Entries that do not fit the YYYY/MM.json layout (e.g. a hidden
# .ipynb_checkpoints directory or a stray non-month JSON file) are skipped and
# reported below instead of aborting the whole load with a ValueError.

records = []  # list of (year, month, trip)
skipped = []  # paths ignored because their name is not an integer year/month
for year_dir in sorted(raw_dir.iterdir()):
    if not year_dir.is_dir():
        continue
    try:
        year = int(year_dir.name)
    except ValueError:
        skipped.append(year_dir)
        continue
    for json_path in sorted(year_dir.glob("*.json")):
        try:
            month = int(json_path.stem)
        except ValueError:
            skipped.append(json_path)
            continue
        with open(json_path, encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            trips = data
        elif isinstance(data, dict):
            trips = data.get("data", data.get("trips", []))
        else:
            # Top-level null/number/string: there is no trip list to read.
            trips = []
        records.extend((year, month, t) for t in trips)

month_count = len({(y, m) for y, m, _ in records})
print(f"Loaded {len(records)} trips from {month_count} month(s).")
if skipped:
    print(f"Skipped {len(skipped)} path(s) not matching YYYY/MM.json:", [str(p) for p in skipped])
if records:
    # Index instead of unpacking into `_`, which would shadow IPython's
    # last-output variable for the rest of the session.
    sample = records[0][2]
    if isinstance(sample, dict):
        print("Sample trip keys:", list(sample.keys()))
    print("Sample trip (first record):", sample)

In [None]:
# Mean trip duration per month, computed from each trip's "duration" field (seconds).
from collections import defaultdict

# Bucket the durations by (year, month); trips without a "duration" value are left out.
by_month = defaultdict(list)  # (year, month) -> [durations]
for year, month, trip in records:
    seconds = trip.get("duration")
    if seconds is None:
        continue
    by_month[(year, month)].append(seconds)

# One summary row per month, ordered by (year, month).
avg_by_month = []
for (yr, mo), durations in sorted(by_month.items()):
    trip_total = len(durations)
    avg_by_month.append(
        {
            "year": yr,
            "month": mo,
            "avg_trip_seconds": sum(durations) / trip_total,
            "trip_count": trip_total,
        }
    )

print("Average trip time by month (first rows):")
for row in avg_by_month[:10]:
    print(row)

In [None]:
# Save the monthly averages as indented JSON under prepared-data/,
# creating that directory (and any missing parents) first.
out_path = prepared_dir / "avg_trip_time_by_month.json"
out_path.parent.mkdir(parents=True, exist_ok=True)
with out_path.open("w", encoding="utf-8") as sink:
    json.dump(avg_by_month, sink, indent=2)

print(f"Wrote {len(avg_by_month)} months to {out_path}")