In [1]:
from pathlib import Path
import json
import sys

# Locate the project root: the current directory when it contains package.json (notebook run
# from the repo root), otherwise two levels up (notebook run from data-pipeline/notebooks).
cwd = Path.cwd()
project_root = cwd if (cwd / "package.json").exists() else cwd.parent.parent
raw_dir = project_root / "raw-data"
prepared_dir = project_root / "prepared-data"

# Make data-pipeline/ importable so that execution_utils resolves. The membership check keeps
# this cell idempotent: re-running it no longer prepends a duplicate entry to sys.path each time.
pipeline_dir = str(project_root / "data-pipeline")
if pipeline_dir not in sys.path:
    sys.path.insert(0, pipeline_dir)
from execution_utils import show_execution_banner, write_with_execution_metadata

print("Project root:", project_root)
print("Raw data:", raw_dir)
print("Prepared data:", prepared_dir)

# Output file of this notebook. The value returned by show_execution_banner is kept so it can be
# passed to write_with_execution_metadata when the result is written (presumably the run's start
# time, going by the variable name -- confirm in execution_utils).
out_path = prepared_dir / "avg_trip_time_by_month.json"
_pipeline_start_time = show_execution_banner(out_path)

Project root: c:\Users\Nicol\Desktop\INF252-Course-Project
Raw data: c:\Users\Nicol\Desktop\INF252-Course-Project\raw-data
Prepared data: c:\Users\Nicol\Desktop\INF252-Course-Project\prepared-data
No previous execution info.


In [2]:
# Load all raw trip files (raw-data/YYYY/MM.json). Each file is either a JSON array of trip
# objects, or a JSON object wrapping that array under a "data" or "trips" key.
# Each trip has: started_at, ended_at, duration (seconds), start/end station fields.

records = []  # list of (year, month, trip)
skipped_paths = []  # entries under raw_dir that do not follow the YYYY/MM.json naming
for year_dir in sorted(raw_dir.iterdir()):
    if not year_dir.is_dir():
        continue
    # A stray non-numeric directory (e.g. a .ipynb_checkpoints folder) would make int() raise
    # and abort the whole load; skip it and report it afterwards instead.
    try:
        year = int(year_dir.name)
    except ValueError:
        skipped_paths.append(year_dir)
        continue
    for json_path in sorted(year_dir.glob("*.json")):
        # Same guard for JSON files whose stem is not a month number.
        try:
            month = int(json_path.stem)
        except ValueError:
            skipped_paths.append(json_path)
            continue
        with open(json_path, encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            trips = data
        else:
            # Wrapped form: prefer "data", fall back to "trips", else treat the file as empty.
            trips = data.get("data", data.get("trips", []))
        records.extend((year, month, t) for t in trips)

print(f"Loaded {len(records)} trips from {len({ (y, m) for y, m, _ in records })} month(s).")
if skipped_paths:
    # Only printed when something was ignored, so a clean raw-data tree gives unchanged output.
    print(f"Skipped {len(skipped_paths)} path(s) not matching YYYY/MM.json:", [str(p) for p in skipped_paths])
if records:
    _, _, sample = records[0]
    print("Sample trip keys:", list(sample.keys()))
    print("Sample trip (first record):", sample)

Loaded 10034294 trips from 80 month(s).
Sample trip keys: ['started_at', 'ended_at', 'duration', 'start_station_id', 'start_station_name', 'start_station_description', 'start_station_latitude', 'start_station_longitude', 'end_station_id', 'end_station_name', 'end_station_description', 'end_station_latitude', 'end_station_longitude']
Sample trip (first record): {'started_at': '2019-04-02 22:18:47.926000+00:00', 'ended_at': '2019-04-02 22:24:25.427000+00:00', 'duration': 337, 'start_station_id': '401', 'start_station_name': 'Schous plass', 'start_station_description': 'nærmest Thorvald Meyers gate', 'start_station_latitude': 59.920259, 'start_station_longitude': 10.760629, 'end_station_id': '381', 'end_station_name': 'Grønlands torg', 'end_station_description': 'ved Tøyenbekken', 'end_station_latitude': 59.91252, 'end_station_longitude': 10.76224}


In [3]:
# Mean trip duration per (year, month), computed from each trip's "duration" field (seconds).
from collections import defaultdict

by_month = defaultdict(list)  # (year, month) -> [durations]
for year, month, trip in records:
    duration = trip.get("duration")
    if duration is None:
        # Trips without a duration are left out of both the mean and the trip count.
        continue
    by_month[(year, month)].append(duration)

# One summary row per month, in chronological order (keys sort as (year, month) tuples).
avg_by_month = []
for month_key in sorted(by_month):
    yr, mo = month_key
    month_durations = by_month[month_key]
    n_trips = len(month_durations)
    avg_by_month.append(
        {
            "year": yr,
            "month": mo,
            "avg_trip_seconds": sum(month_durations) / n_trips,
            "trip_count": n_trips,
        }
    )

print("Average trip time by month (first rows):")
for row in avg_by_month[:10]:
    print(row)

Average trip time by month (first rows):
{'year': 2019, 'month': 4, 'avg_trip_seconds': 905.5163070195102, 'trip_count': 254277}
{'year': 2019, 'month': 5, 'avg_trip_seconds': 790.2466757289264, 'trip_count': 327365}
{'year': 2019, 'month': 6, 'avg_trip_seconds': 868.9450153002736, 'trip_count': 332347}
{'year': 2019, 'month': 7, 'avg_trip_seconds': 880.4446442919834, 'trip_count': 323580}
{'year': 2019, 'month': 8, 'avg_trip_seconds': 813.3075522815592, 'trip_count': 399376}
{'year': 2019, 'month': 9, 'avg_trip_seconds': 721.4497225187121, 'trip_count': 300741}
{'year': 2019, 'month': 10, 'avg_trip_seconds': 622.445439038824, 'trip_count': 201087}
{'year': 2019, 'month': 11, 'avg_trip_seconds': 607.6722377167781, 'trip_count': 95374}
{'year': 2019, 'month': 12, 'avg_trip_seconds': 549.1465765765765, 'trip_count': 11100}
{'year': 2020, 'month': 3, 'avg_trip_seconds': 1037.4355444305381, 'trip_count': 29563}


In [4]:
# Save the per-month averages to out_path (under prepared-data/) through the pipeline helper,
# passing along the value captured from show_execution_banner, then report how many rows went out.
write_with_execution_metadata(out_path, avg_by_month, _pipeline_start_time)
month_total = len(avg_by_month)
print(f"Wrote {month_total} months to {out_path}")

Wrote 80 months to c:\Users\Nicol\Desktop\INF252-Course-Project\prepared-data\avg_trip_time_by_month.json
