# 02_M101_vs_Controls.ipynb
Compare M101 (ACE) vs M102/M103 (controls): route-wide and PM-rush worst trips on key corridors.

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Read the labeled speeds export into memory.
DATA_PATH = "hunter-speeds-ace-labeled.csv"  # <- change if needed
df = pd.read_csv(DATA_PATH)

# Derived columns shared by the later cells.
# Values that cannot be parsed as timestamps become NaT instead of raising.
df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
# dayofweek runs Monday=0 .. Sunday=6, so "< 5" selects Monday-Friday;
# NaT rows compare False and are therefore flagged as non-weekday.
df["Is Weekday"] = df["Timestamp"].dt.dayofweek < 5
# Segment label built from the two stop-name columns: "<stop> → <next stop>".
df["Corridor"] = df["Timepoint Stop Name"] + " → " + df["Next Timepoint Stop Name"]


In [None]:
# ACE/CBD phase windows (using 60-day warning for ACE)
ACE_ANNOUNCE = pd.Timestamp("2024-06-17")
ACE_FINE_START = pd.Timestamp("2024-08-16")  # ~60-day warning ends
CBD_START = pd.Timestamp("2025-01-05")

def phase_label(ts):
    """Return the analysis phase that timestamp *ts* falls into.

    Phases are half-open intervals delimited by the module constants:

    * before ``ACE_ANNOUNCE``                    -> "Pre-ACE"
    * ``ACE_ANNOUNCE`` <= ts < ``ACE_FINE_START`` -> "ACE Warning (skip)"
    * ``ACE_FINE_START`` <= ts < ``CBD_START``    -> "ACE only"
    * ``CBD_START`` and later                    -> "ACE + CBD"

    A missing timestamp (NaT/None/NaN) yields NaN.  Every comparison with
    NaT is False, so without this guard such rows would fall through to the
    final branch and be mislabeled as "ACE + CBD".
    """
    if pd.isna(ts):
        return np.nan
    # Boundaries are checked in ascending order, so each test only needs the
    # upper bound of its interval.
    if ts < ACE_ANNOUNCE:
        return "Pre-ACE"
    if ts < ACE_FINE_START:
        return "ACE Warning (skip)"
    if ts < CBD_START:
        return "ACE only"
    return "ACE + CBD"

# Label each row by mapping its timestamp through phase_label.
df["Phase"] = df["Timestamp"].map(phase_label)
# Sanity check: row count per phase, with missing labels counted as well.
df["Phase"].value_counts(dropna=False)


In [None]:
# Weighted helpers
def wavg(x, w):
    """Return the weighted mean of *x* using weights *w*.

    Pairs in which either the value or the weight is NaN are ignored, so a
    single missing observation does not turn the whole result into NaN.

    Returns NaN when no usable pairs remain or when the remaining weights
    sum to zero (``np.average`` would raise ``ZeroDivisionError`` there).
    """
    x = np.asarray(x, dtype=float)
    w = np.asarray(w, dtype=float)
    usable = ~(np.isnan(x) | np.isnan(w))
    x, w = x[usable], w[usable]
    if x.size == 0 or w.sum() == 0:
        return np.nan
    return np.average(x, weights=w)

def wpercentile(x, w, q):
    """Return the weighted *q*-quantile of *x* (``q`` as a fraction, e.g. 0.10).

    The values are sorted ascending and the result is the first value whose
    cumulative weight reaches ``q`` times the total weight (no interpolation).

    Pairs in which either the value or the weight is NaN are ignored;
    otherwise the weight of a missing value would inflate the cutoff, and a
    NaN weight would turn the cumulative sum into NaN.

    Returns NaN when no usable pairs remain or the total weight is not
    positive.
    """
    x = np.asarray(x, dtype=float)
    w = np.asarray(w, dtype=float)
    usable = ~(np.isnan(x) | np.isnan(w))
    x, w = x[usable], w[usable]
    if x.size == 0:
        return np.nan

    order = np.argsort(x)
    x, w = x[order], w[order]
    cumw = np.cumsum(w)
    # Take the total from the cumulative sum itself so that q == 1.0 can
    # never produce a cutoff larger than the last cumulative weight.
    total = cumw[-1]
    if not total > 0:
        return np.nan

    k = np.searchsorted(cumw, q * total, side="left")
    # Clamp defensively so the index is always valid.
    k = int(min(max(k, 0), len(x) - 1))
    return float(x[k])


In [None]:
# Selections reused by the comparison cells below
routes = ["M101", "M102", "M103"]
allday_hours = [*range(6, 22)]  # hours 6 through 21 inclusive
pm_hours = [16, 17, 18]

# Working frame: rows flagged as weekday whose phase is not the ACE warning
# window. The copy keeps it independent of df.
base = df.loc[df["Is Weekday"] & df["Phase"].ne("ACE Warning (skip)")].copy()


In [None]:
# 1) Route-wide (all-day) comparison: Avg mph & P10 mph
# Builds one row per (route, phase).  Every metric is weighted by
# "Bus Trip Count" and restricted to the hours listed in allday_hours.
rows = []
for r in routes:
    sub = base[(base["Route ID"] == r) & (base["Hour of Day"].isin(allday_hours))]
    for p, gg in sub.groupby("Phase"):
        rows.append({
            "Route": r,
            "Phase": p,
            "Avg mph": wavg(gg["Average Road Speed"], gg["Bus Trip Count"]),
            "P10 mph": wpercentile(gg["Average Road Speed"], gg["Bus Trip Count"], 0.10),
            # Weighted share of records whose average speed is below 5
            "Crawl<5 share": wavg((gg["Average Road Speed"] < 5).astype(float), gg["Bus Trip Count"]),
        })

# Explicit columns keep the frame sortable when no rows matched; a
# column-less DataFrame would raise KeyError in sort_values.
routewide = pd.DataFrame(
    rows,
    columns=["Route", "Phase", "Avg mph", "P10 mph", "Crawl<5 share"],
).sort_values(["Route", "Phase"])
display(routewide.round(3))


In [None]:
# Optional: quick pivot to see pre vs post %Δ for route-wide averages
def pct_change_table(df_in, metric):
    """Pivot *metric* to a Route x Phase table and append percent-change columns.

    ``df_in`` needs "Route" and "Phase" columns plus the column named by
    ``metric``.  A percent-change column is added only when both phases it
    compares are present among the pivoted columns:

    * "%Δ (ACE vs Pre)": "ACE only" relative to "Pre-ACE"
    * "%Δ (CBD vs ACE)": "ACE + CBD" relative to "ACE only"
    """
    table = df_in.pivot_table(values=metric, index="Route", columns="Phase")
    # (new column, baseline phase, comparison phase)
    comparisons = (
        ("%Δ (ACE vs Pre)", "Pre-ACE", "ACE only"),
        ("%Δ (CBD vs ACE)", "ACE only", "ACE + CBD"),
    )
    for label, before, after in comparisons:
        if before in table.columns and after in table.columns:
            table[label] = (table[after] - table[before]) / table[before] * 100
    return table


In [None]:
# Show the percent-change pivot for each route-wide metric, paired with the
# number of decimals used when rounding it for display.
for metric, decimals in (("Avg mph", 2), ("P10 mph", 2), ("Crawl<5 share", 3)):
    print(f"\nRoute-wide % changes — {metric}")
    display(pct_change_table(routewide, metric).round(decimals))


In [None]:
# 2) PM-rush “worst trips” (10th percentile) on key corridors
def p10_table(route_list, corridor, hours):
    """Compare the weighted 10th-percentile speed before and after ACE.

    Filters the module-level ``base`` frame to rows whose "Corridor" equals
    *corridor* and whose "Hour of Day" is in *hours*, then computes, for each
    route in *route_list*, the trip-count-weighted P10 of
    "Average Road Speed" per phase.

    Only routes with both a "Pre-ACE" and an "ACE only" value are reported.
    The result is sorted by "Route" and always carries the full set of
    columns, even when no route qualifies (previously an empty result raised
    KeyError in ``sort_values``).  A zero Pre-ACE baseline gives NaN for the
    percent change instead of raising ZeroDivisionError.
    """
    columns = [
        "Route",
        "Corridor",
        "Pre P10 mph",
        "Post P10 mph (ACE only)",
        "%Δ (ACE vs Pre)",
    ]
    out = []
    sub = base[(base["Corridor"] == corridor) & (base["Hour of Day"].isin(hours))]
    for r in route_list:
        ss = sub[sub["Route ID"] == r]
        if ss.empty:
            continue
        phases = {
            p: wpercentile(gg["Average Road Speed"], gg["Bus Trip Count"], 0.10)
            for p, gg in ss.groupby("Phase")
        }
        if "Pre-ACE" in phases and "ACE only" in phases:
            pre, post = phases["Pre-ACE"], phases["ACE only"]
            # Percent change is undefined against a zero baseline.
            pct = (post - pre) / pre * 100 if pre != 0 else float("nan")
            out.append({
                "Route": r,
                "Corridor": corridor,
                "Pre P10 mph": pre,
                "Post P10 mph (ACE only)": post,
                "%Δ (ACE vs Pre)": pct,
            })
    return pd.DataFrame(out, columns=columns).sort_values("Route")


In [None]:
# P10 comparison for the Lexington Av corridor, all three routes, PM hours.
print("\nPM Rush (4–6 PM) — Hunter stop (Lex 68 → 59 SB)")
hunter_p10 = p10_table(
    route_list=routes,
    corridor="LEXINGTON AV/E 68 ST → LEXINGTON AV/E 59 ST",
    hours=pm_hours,
)
display(hunter_p10.round(3))

# Same comparison for the 3 Av corridor, restricted to M101 and M102.
print("\nPM Rush (4–6 PM) — Midtown South (3 AV 23 → 42 NB)")
midtown_p10 = p10_table(
    route_list=["M101", "M102"],
    corridor="3 AV/E 23 ST → 3 AV/E 42 ST",
    hours=pm_hours,
)
display(midtown_p10.round(3))


In [None]:
# 3) (Optional) Quick “choke point” averages for the same corridors
def mean_table(route_list, corridor, hours):
    """Compare the weighted mean speed before and after ACE.

    Filters the module-level ``base`` frame to rows whose "Corridor" equals
    *corridor* and whose "Hour of Day" is in *hours*, then computes, for each
    route in *route_list*, the trip-count-weighted mean of
    "Average Road Speed" per phase.

    Only routes with both a "Pre-ACE" and an "ACE only" value are reported.
    The result is sorted by "Route" and always carries the full set of
    columns, even when no route qualifies (previously an empty result raised
    KeyError in ``sort_values``).  A zero Pre-ACE baseline gives NaN for the
    percent change rather than an infinite value.
    """
    columns = [
        "Route",
        "Corridor",
        "Pre Avg mph",
        "Post Avg mph (ACE only)",
        "%Δ (ACE vs Pre)",
    ]
    out = []
    sub = base[(base["Corridor"] == corridor) & (base["Hour of Day"].isin(hours))]
    for r in route_list:
        ss = sub[sub["Route ID"] == r]
        if ss.empty:
            continue
        phases = {
            p: wavg(gg["Average Road Speed"], gg["Bus Trip Count"])
            for p, gg in ss.groupby("Phase")
        }
        if "Pre-ACE" in phases and "ACE only" in phases:
            pre, post = phases["Pre-ACE"], phases["ACE only"]
            # Percent change is undefined against a zero baseline.
            pct = (post - pre) / pre * 100 if pre != 0 else float("nan")
            out.append({
                "Route": r,
                "Corridor": corridor,
                "Pre Avg mph": pre,
                "Post Avg mph (ACE only)": post,
                "%Δ (ACE vs Pre)": pct,
            })
    return pd.DataFrame(out, columns=columns).sort_values("Route")


In [None]:
# Mean-speed counterpart of the P10 tables: one (heading, routes, corridor)
# entry per corridor, all evaluated over pm_hours.
for heading, route_subset, corridor_name in (
    (
        "\nPM Rush (4–6 PM) — Hunter stop (Lex 68 → 59 SB) — MEAN speeds",
        routes,
        "LEXINGTON AV/E 68 ST → LEXINGTON AV/E 59 ST",
    ),
    (
        "\nPM Rush (4–6 PM) — Midtown South (3 AV 23 → 42 NB) — MEAN speeds",
        ["M101", "M102"],
        "3 AV/E 23 ST → 3 AV/E 42 ST",
    ),
):
    print(heading)
    display(mean_table(route_subset, corridor_name, pm_hours).round(3))
