# 01_M101_Findings.ipynb
Reproduces the M101 findings: route-wide statistics, AM/PM rush-hour hotspots, 10th-percentile speeds, and crawl share (trip-weighted share of observations under 5 mph).

In [None]:
import pandas as pd
import numpy as np
import re
from IPython.display import display

In [None]:
# Load dataset
# Parquet is a binary columnar format, so it has to be read with
# pd.read_parquet (needs pyarrow or fastparquet installed); pd.read_csv would
# try to decode the binary bytes as text. Any other extension is still read
# as CSV so the path can be pointed at a CSV export.
DATA_PATH = "data_work/hunter-speeds-ace-labeled.parquet"  # <- update path if needed
if DATA_PATH.lower().endswith((".parquet", ".pq")):
    df = pd.read_parquet(DATA_PATH)
else:
    df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Parse timestamps and create flags
# Unparseable timestamps become NaT instead of raising.
df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
# dayofweek runs Monday=0 .. Sunday=6, so values below 5 are weekdays;
# NaT rows compare False and are therefore not flagged as weekdays.
df["Is Weekday"] = df["Timestamp"].dt.dayofweek < 5
# Corridor label: "<timepoint stop> → <next timepoint stop>".
df["Corridor"] = (
    df["Timepoint Stop Name"]
    + " → "
    + df["Next Timepoint Stop Name"]
)

In [None]:
# Phase windows
# Each boundary is midnight at the start of the given calendar date.
ACE_ANNOUNCE = pd.Timestamp(year=2024, month=6, day=17)
ACE_FINE_START = pd.Timestamp(year=2024, month=8, day=16)  # ~60 days later
CBD_START = pd.Timestamp(year=2025, month=1, day=5)

In [None]:
def phase_label(ts):
    """Return the phase label for a single timestamp.

    The phases are half-open intervals delimited by the module-level
    boundaries ``ACE_ANNOUNCE``, ``ACE_FINE_START`` and ``CBD_START``:

    * ``ts < ACE_ANNOUNCE``                  -> ``"Pre-ACE"``
    * ``ACE_ANNOUNCE <= ts < ACE_FINE_START`` -> ``"ACE Warning (skip)"``
    * ``ACE_FINE_START <= ts < CBD_START``    -> ``"ACE only"``
    * ``ts >= CBD_START``                     -> ``"ACE + CBD"``

    Args:
        ts: A timestamp comparable with ``pd.Timestamp`` (or a missing
            value such as ``pd.NaT``).

    Returns:
        The phase label string, or ``np.nan`` when ``ts`` is missing.
    """
    # NaT compares False against every boundary, so without this guard a
    # timestamp that failed to parse would fall through to the last branch
    # and be counted as "ACE + CBD".
    if pd.isna(ts):
        return np.nan
    if ts < ACE_ANNOUNCE:
        return "Pre-ACE"
    if ts < ACE_FINE_START:
        return "ACE Warning (skip)"
    if ts < CBD_START:
        return "ACE only"
    return "ACE + CBD"

# Label every row with its phase by applying phase_label to each timestamp.
df["Phase"] = df["Timestamp"].apply(phase_label)
# Row count per phase; dropna=False also lists rows whose phase is missing.
df["Phase"].value_counts(dropna=False)


In [None]:
# Weighted helpers
def wavg(x, w):
    """Return the weighted mean of ``x`` with weights ``w``.

    Args:
        x: Array-like of values.
        w: Array-like of weights, same length as ``x``.

    Returns:
        The weighted average, or ``np.nan`` when ``x`` is empty or the
        weights sum to zero.
    """
    x, w = np.asarray(x), np.asarray(w)
    if len(x) == 0:
        return np.nan
    try:
        return np.average(x, weights=w)
    except ZeroDivisionError:
        # np.average raises when the weights sum to zero; an undefined mean
        # is reported as NaN, the same as for empty input.
        return np.nan

def wpercentile(x, w, q):
    """Return a weighted percentile of ``x``.

    Values are sorted ascending and their weights accumulated; the result is
    the first sorted value whose cumulative weight reaches ``q`` times the
    total weight.

    Args:
        x: Array-like of values.
        w: Array-like of weights, same length as ``x``.
        q: Fraction of the total weight to reach (callers in this notebook
            pass 0.10).

    Returns:
        The selected value as a ``float``, or ``np.nan`` for empty input.
    """
    values = np.asarray(x)
    weights = np.asarray(w)
    if not len(values):
        return np.nan

    order = np.argsort(values)
    sorted_values = values[order]
    sorted_weights = weights[order]

    running_total = np.cumsum(sorted_weights)
    threshold = q * sorted_weights.sum()

    # First position whose cumulative weight is >= threshold, clamped so it
    # always indexes a real element.
    position = np.searchsorted(running_total, threshold, side="left")
    last = len(sorted_values) - 1
    position = int(max(0, min(position, last)))
    return float(sorted_values[position])


In [None]:
# ROUTE-WIDE STATS
# Weekday M101 rows, with the ACE warning phase excluded.
m101 = df[
    (df["Route ID"] == "M101")
    & df["Is Weekday"]
    & (df["Phase"] != "ACE Warning (skip)")
]
allday = list(range(6, 22))  # hours 6 through 21 inclusive

sub = m101.loc[m101["Hour of Day"].isin(allday)]
summary = []
# One summary row per phase; every statistic is weighted by bus trip count.
for p, gg in sub.groupby("Phase"):
    summary.append(
        {
            "Phase": p,
            "Avg mph": wavg(gg["Average Road Speed"], gg["Bus Trip Count"]),
            "P10 mph": wpercentile(
                gg["Average Road Speed"], gg["Bus Trip Count"], 0.10
            ),
            # Weighted mean of a 0/1 indicator = weighted share below 5.
            "Crawl <5 share": wavg(
                gg["Average Road Speed"].lt(5).astype(float),
                gg["Bus Trip Count"],
            ),
        }
    )

routewide_df = pd.DataFrame(summary).sort_values(by="Phase")
display(routewide_df.round(decimals=3))


In [None]:
# AM/PM RUSH HOTSPOTS
def corridor_stats_exact(route_id, corridor_name, hours):
    """Return the trip-weighted average speed per phase for one corridor.

    Rows are taken from the module-level ``df`` and restricted to weekdays,
    the given route, the exact corridor label, the given hours, and every
    phase except ``"ACE Warning (skip)"``.

    Args:
        route_id: Value to match in the ``"Route ID"`` column.
        corridor_name: Exact value to match in the ``"Corridor"`` column.
        hours: Iterable of ``"Hour of Day"`` values to keep.

    Returns:
        A dict mapping phase label to the weighted average of
        ``"Average Road Speed"`` (weights: ``"Bus Trip Count"``), or ``None``
        when no rows match.
    """
    mask = (
        (df["Route ID"] == route_id)
        & df["Is Weekday"]
        & (df["Phase"] != "ACE Warning (skip)")
        & (df["Corridor"] == corridor_name)
        & df["Hour of Day"].isin(hours)
    )
    sub = df[mask]
    if sub.empty:
        return None
    # Iterate the groups directly rather than using groupby().apply(), which
    # warns on recent pandas when the frame still holds the grouping column.
    # float() keeps the values as plain Python floats so they print the same
    # way as before.
    return {
        phase: float(wavg(g["Average Road Speed"], g["Bus Trip Count"]))
        for phase, g in sub.groupby("Phase")
    }

# Rush-hour windows, expressed as "Hour of Day" values.
am_hours = list(range(7, 10))
pm_hours = list(range(16, 19))


In [None]:
print("AM Rush:")
# Corridor labels must match the "Corridor" column exactly.
am_corridors = [
    "3 AV/E 99 ST → 3 AV/E 125 ST",
    "3 AV/E 23 ST → AV/ASTOR PL",
    "3 AV/E 23 ST → 3 AV/E 42 ST",
    "LEXINGTON AV/E 96 ST → LEXINGTON AV/E 68 ST",
    "LEXINGTON AV/E 100 ST → LEXINGTON AV/E 96 ST",
]

# Print each corridor with its per-phase weighted average speed
# (None when the corridor has no matching rows).
for c in am_corridors:
    print(
        c,
        corridor_stats_exact(route_id="M101", corridor_name=c, hours=am_hours),
    )


In [None]:
print("PM Rush:")
# Corridor labels must match the "Corridor" column exactly.
pm_corridors = [
    "LEXINGTON AV/E 68 ST → LEXINGTON AV/E 59 ST",
    "3 AV/E 23 ST → 3 AV/E 42 ST",
    "3 AV/E 23 ST → AV/ASTOR PL",
    "3 AV/E 67 ST → 3 AV/E 39 ST",
    "3 AV/E 125 ST → AMSTERDAM AV",
]

# Print each corridor with its per-phase weighted average speed
# (None when the corridor has no matching rows).
for c in pm_corridors:
    print(
        c,
        corridor_stats_exact(route_id="M101", corridor_name=c, hours=pm_hours),
    )

In [None]:
# WORST TRIPS (10th percentile)
def p10_by_corridor(route_id, corridors, hours):
    """Compare the weighted 10th-percentile speed before and after ACE.

    Rows come from the module-level ``df``, restricted to weekdays, the given
    route and hours, and every phase except ``"ACE Warning (skip)"``. For
    each corridor the weighted 10th percentile of ``"Average Road Speed"``
    (weights: ``"Bus Trip Count"``) is computed per phase.

    Args:
        route_id: Value to match in the ``"Route ID"`` column.
        corridors: Iterable of exact ``"Corridor"`` labels.
        hours: Iterable of ``"Hour of Day"`` values to keep.

    Returns:
        A DataFrame with columns ``"Corridor"``, ``"Pre P10"``,
        ``"Post P10"`` and ``"Pct Δ %"``. Corridors with no rows, or lacking
        either the ``"Pre-ACE"`` or ``"ACE only"`` phase, are omitted.
    """
    columns = ["Corridor", "Pre P10", "Post P10", "Pct Δ %"]
    rows = []
    sub = df[(df["Route ID"] == route_id) & df["Is Weekday"] & (df["Phase"] != "ACE Warning (skip)")]
    sub = sub[sub["Hour of Day"].isin(hours)]
    for c in corridors:
        ss = sub[sub["Corridor"] == c]
        if ss.empty:
            continue
        d = {
            p: wpercentile(gg["Average Road Speed"], gg["Bus Trip Count"], 0.10)
            for p, gg in ss.groupby("Phase")
        }
        if "Pre-ACE" not in d or "ACE only" not in d:
            continue
        pre, post = d["Pre-ACE"], d["ACE only"]
        # A membership test against np.nan cannot detect a computed NaN
        # (NaN != NaN), so test for missing / zero baselines explicitly.
        if pd.isna(pre) or pre == 0:
            pct_change = np.nan
        else:
            pct_change = (post - pre) / pre * 100
        rows.append({
            "Corridor": c,
            "Pre P10": pre,
            "Post P10": post,
            "Pct Δ %": pct_change,
        })
    # Passing the columns keeps the frame's schema even when no corridor
    # qualifies.
    return pd.DataFrame(rows, columns=columns)


In [None]:
print("\nPM Rush P10:")
# Pre-ACE vs ACE-only weighted 10th-percentile speeds for the PM corridors.
pm_p10_df = p10_by_corridor(
    route_id="M101",
    corridors=[
        "LEXINGTON AV/E 68 ST → LEXINGTON AV/E 59 ST",
        "3 AV/E 23 ST → 3 AV/E 42 ST",
        "3 AV/E 23 ST → AV/ASTOR PL",
        "3 AV/E 125 ST → AMSTERDAM AV",
    ],
    hours=pm_hours,
)
pm_p10_df = pm_p10_df.round(3)

display(pm_p10_df)


In [None]:
print("\nAM Rush P10:")
# Pre-ACE vs ACE-only weighted 10th-percentile speed for the AM corridor.
am_p10_df = p10_by_corridor(
    route_id="M101",
    corridors=["3 AV/E 99 ST → 3 AV/E 125 ST"],
    hours=am_hours,
)
am_p10_df = am_p10_df.round(3)

display(am_p10_df)
