# Stations Preparation: Oslo Bysykkel

Extracts unique stations with lat/lon and per-station summaries (trips as origin, trips as destination, total trips).
Output: `prepared-data/stations.json`

## Setup & Imports

In [1]:
# =============================================================================
# SETUP: Project paths and execution utils
# =============================================================================
from pathlib import Path
import json
import sys

import pandas as pd

cwd = Path.cwd()

# The project root is the working directory when it contains package.json;
# otherwise it is taken to be two levels above the working directory.
if (cwd / "package.json").exists():
    project_root = cwd
else:
    project_root = cwd.parent.parent

# Input and output locations, both resolved relative to the project root.
raw_dir, prepared_dir = project_root / "raw-data", project_root / "prepared-data"
out_path = prepared_dir / "stations.json"

# Make <project_root>/data-pipeline importable before pulling in the helpers
# (presumably that is where execution_utils lives — confirm).
sys.path.insert(0, str(project_root / "data-pipeline"))
from execution_utils import show_execution_banner, write_with_execution_metadata

for label, location in (
    ("Project root:", project_root),
    ("Raw data:", raw_dir),
    ("Prepared data:", prepared_dir),
):
    print(label, location)

# The banner's return value is kept and handed to the writer in the final cell.
_pipeline_start_time = show_execution_banner(out_path)

Project root: c:\Users\Nicol\Desktop\INF252-Course-Project
Raw data: c:\Users\Nicol\Desktop\INF252-Course-Project\raw-data
Prepared data: c:\Users\Nicol\Desktop\INF252-Course-Project\prepared-data
No previous execution info (file does not exist yet).


## Load Data

In [2]:
# =============================================================================
# Load trip data from raw-data/YYYY/MM.json
# =============================================================================
# Column order of the trip frame; also guarantees the columns exist when no
# trips were found, so the dropna() below cannot fail on an empty input.
TRIP_COLUMNS = [
    "year",
    "month",
    "start_station_id",
    "start_station_name",
    "end_station_id",
    "end_station_name",
    "start_lat",
    "start_lon",
    "end_lat",
    "end_lon",
]


def _station_id(value):
    """Return the station id as a string, keeping a missing id as None.

    A missing id must stay None (not the string "None") so that dropna()
    removes the trip instead of letting it form a bogus "None" station.
    """
    return None if value is None else str(value)


def _load_trips(json_path):
    """Return the trip dicts stored in one monthly JSON file.

    The file may hold a bare list of trips, or an object carrying the list
    under the "data" key (or, failing that, the "trips" key).
    """
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)
    if isinstance(data, list):
        return data
    return data.get("data", data.get("trips", []))


# Build the flat rows in a single pass so the raw trip dicts of each file can
# be released as soon as that file has been processed.
rows = []
for year_dir in sorted(raw_dir.iterdir()):
    # Only numeric directory names are years; skip anything else that may sit
    # in raw-data (e.g. .ipynb_checkpoints) instead of crashing in int().
    if not (year_dir.is_dir() and year_dir.name.isdecimal()):
        continue
    year = int(year_dir.name)
    for json_path in sorted(year_dir.glob("*.json")):
        # Likewise, only numerically named files are monthly exports.
        if not json_path.stem.isdecimal():
            continue
        month = int(json_path.stem)
        for t in _load_trips(json_path):
            rows.append({
                "year": year,
                "month": month,
                "start_station_id": _station_id(t.get("start_station_id")),
                "start_station_name": t.get("start_station_name"),
                "end_station_id": _station_id(t.get("end_station_id")),
                "end_station_name": t.get("end_station_name"),
                "start_lat": t.get("start_station_latitude"),
                "start_lon": t.get("start_station_longitude"),
                "end_lat": t.get("end_station_latitude"),
                "end_lon": t.get("end_station_longitude"),
            })

df = pd.DataFrame(rows, columns=TRIP_COLUMNS)
# A trip is only usable when both endpoints have an id and coordinates.
df = df.dropna(subset=[
    "start_station_id", "end_station_id",
    "start_lat", "start_lon", "end_lat", "end_lon",
])

print(f"Loaded {len(df)} trips")

Loaded 10034294 trips


## Build Station Catalog with Summaries

In [3]:
# =============================================================================
# Build station catalog: id, name, lat, lon, trips_as_origin, trips_as_dest, total_trips
# =============================================================================
def _summarise_side(trips, side, count_col):
    """Summarise stations seen on one side ("start" or "end") of the trips.

    Returns a frame indexed by station id with the first name/lat/lon found
    for that id plus `count_col`, the number of trips on that side.
    """
    by_station = trips.groupby(f"{side}_station_id")
    summary = by_station.agg(
        name=(f"{side}_station_name", "first"),
        lat=(f"{side}_lat", "first"),
        lon=(f"{side}_lon", "first"),
    )
    summary[count_col] = by_station.size()
    return summary


stations_start = _summarise_side(df, "start", "trips_as_origin")
stations_end = _summarise_side(df, "end", "trips_as_dest")

# Plain dict lookups keyed by station id: {id: {"name", "lat", "lon", count}}.
origin_lookup = stations_start.to_dict(orient="index")
dest_lookup = stations_end.to_dict(orient="index")

all_ids = set(stations_start.index) | set(stations_end.index)
stations = []
# Ids are strings; sorting by (length, value) orders numeric ids numerically.
for sid in sorted(all_ids, key=lambda station_id: (len(str(station_id)), station_id)):
    as_origin = origin_lookup.get(sid)
    as_dest = dest_lookup.get(sid)
    # Name and coordinates come from the origin side when the station has one,
    # otherwise from the destination side.
    info = as_origin if as_origin is not None else as_dest
    n_origin = int(as_origin["trips_as_origin"]) if as_origin is not None else 0
    n_dest = int(as_dest["trips_as_dest"]) if as_dest is not None else 0
    stations.append({
        "id": sid,
        "name": str(info["name"]),
        "lat": float(info["lat"]),
        "lon": float(info["lon"]),
        "trips_as_origin": n_origin,
        "trips_as_dest": n_dest,
        "total_trips": n_origin + n_dest,
    })

print(f"Stations: {len(stations)}")

Stations: 292


In [4]:
# =============================================================================
# Write output
# =============================================================================
# The station list is stored under a single top-level "stations" key and
# passed to the shared writer together with the value returned by the banner.
payload = {"stations": stations}
write_with_execution_metadata(out_path, payload, _pipeline_start_time)
print(f"Wrote {out_path}")

Wrote c:\Users\Nicol\Desktop\INF252-Course-Project\prepared-data\stations.json
