# 04_M101_CBD_vs_NonCBD.ipynb
Splits M101 into CBD (downtown) vs Non-CBD (uptown) and compares Pre-ACE, ACE-only, ACE+CBD across All-day, AM Rush, PM Rush. Produces tables + optional CSV exports.

In [2]:
import pandas as pd
import numpy as np
import re
from IPython.display import display


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load dataset
DATA_PATH = "hunter-speeds-ace-labeled.csv"  # <- change if needed

# Read the CSV and derive the helper columns in one chained step.
df = pd.read_csv(DATA_PATH).assign(
    # Unparseable timestamps become NaT instead of raising.
    Timestamp=lambda d: pd.to_datetime(d["Timestamp"], errors="coerce"),
    **{
        # dayofweek: Monday=0 ... Friday=4; NaT rows evaluate to False.
        "Is Weekday": lambda d: d["Timestamp"].dt.dayofweek < 5,
        # "<from stop> → <to stop>" label; missing if either stop name is missing.
        "Corridor": lambda d: d["Timepoint Stop Name"] + " → " + d["Next Timepoint Stop Name"],
    },
)



In [None]:
# Phase windows (ACE 60-day warning; CBD start)
ACE_ANNOUNCE   = pd.Timestamp("2024-06-17")
ACE_FINE_START = pd.Timestamp("2024-08-16")
CBD_START      = pd.Timestamp("2025-01-05")

def phase_label(ts):
    """Map a timestamp to the name of the policy phase it falls in.

    Phases are half-open intervals delimited by the constants above:

    * ``ts < ACE_ANNOUNCE``                   -> "Pre-ACE"
    * ``ACE_ANNOUNCE <= ts < ACE_FINE_START`` -> "ACE Warning (skip)"
    * ``ACE_FINE_START <= ts < CBD_START``    -> "ACE only"
    * ``ts >= CBD_START``                     -> "ACE + CBD"

    Parameters
    ----------
    ts : pandas.Timestamp or NaT/None
        The observation time.

    Returns
    -------
    str or None
        The phase label, or ``None`` when ``ts`` is missing.
    """
    # Every comparison against NaT is False, so without this guard a missing
    # timestamp would fall through all branches and be labelled "ACE + CBD".
    if pd.isna(ts):
        return None

    # Boundaries are checked in chronological order, so each test only needs
    # the upper bound of its interval.
    if ts < ACE_ANNOUNCE:
        return "Pre-ACE"
    if ts < ACE_FINE_START:
        return "ACE Warning (skip)"
    if ts < CBD_START:
        return "ACE only"
    return "ACE + CBD"

# Label every row with its phase, then show how many rows landed in each
# phase (missing labels are counted too).
phase_per_row = df["Timestamp"].map(phase_label)
df["Phase"] = phase_per_row
df["Phase"].value_counts(dropna=False)


In [None]:
def wavg(x, w):
    """Weighted arithmetic mean of ``x`` using weights ``w``.

    Pairs in which either the value or the weight is NaN are ignored.

    Parameters
    ----------
    x : array-like of numbers
        Values to average.
    w : array-like of numbers
        Weights, same length as ``x``.

    Returns
    -------
    float
        The weighted mean, or NaN when no usable pair remains or the
        remaining weights sum to zero.
    """
    values = np.asarray(x, dtype=float)
    weights = np.asarray(w, dtype=float)

    # A single NaN would otherwise turn the whole average into NaN.
    usable = ~(np.isnan(values) | np.isnan(weights))
    values, weights = values[usable], weights[usable]

    # np.average raises ZeroDivisionError when the weights sum to zero.
    if values.size == 0 or weights.sum() == 0:
        return np.nan
    return np.average(values, weights=weights)

def wpercentile(x, w, q):
    """Weighted quantile of ``x`` with weights ``w``.

    Values are sorted ascending and the function returns the first value
    whose cumulative weight reaches ``q`` times the total weight. No
    interpolation is done: the result is always one of the input values.

    Observations with a NaN value, a NaN weight or a non-positive weight
    are ignored.

    Parameters
    ----------
    x : array-like of numbers
        Observed values.
    w : array-like of numbers
        Weights, same length as ``x``.
    q : float
        Quantile as a fraction (e.g. ``0.10`` for the 10th percentile).

    Returns
    -------
    float
        The weighted quantile, or NaN when no usable observation remains.
    """
    values = np.asarray(x, dtype=float)
    weights = np.asarray(w, dtype=float)

    # `weights > 0` is False for NaN weights, so this one mask removes NaN
    # values, NaN weights and observations that carry no weight.
    usable = ~np.isnan(values) & (weights > 0)
    values, weights = values[usable], weights[usable]
    if values.size == 0:
        return np.nan

    order = np.argsort(values)
    values, weights = values[order], weights[order]

    cum_weight = np.cumsum(weights)
    cutoff = q * weights.sum()
    pos = np.searchsorted(cum_weight, cutoff, side="left")
    # Clamp so rounding in the cumulative sum (or q outside [0, 1]) can
    # never index past either end.
    pos = int(min(max(pos, 0), values.size - 1))
    return float(values[pos])


In [None]:
# Keep M101 weekdays, drop warning phase
on_m101 = df["Route ID"].eq("M101")
outside_warning = df["Phase"].ne("ACE Warning (skip)")
# .copy() so columns added to m101 later do not write into a view of df.
m101 = df.loc[on_m101 & df["Is Weekday"] & outside_warning].copy()

# Heuristic CBD tag based on corridor text (south of ~59th St cues)
# Each entry is a regular expression; the \b anchors keep e.g. "23" from
# matching inside "123".
# NOTE(review): the list is hand-picked and may not cover every stop name
# south of 59th St -- confirm against the actual M101 timepoints.
CBD_KEYWORDS = [
    r"\b23\b", r"\b42\b", r"ASTOR", r"\b14\b", r"HOUSTON",
    r"\bE 1 ST\b", r"\bE 15 ST\b", r"\bE 34 ST\b", r"\bE 30 ST\b", r"\bE 39 ST\b"
]

def tag_cbd(corridor: str) -> str:
    """Classify a corridor label as "CBD" or "Non-CBD".

    Parameters
    ----------
    corridor : str
        Corridor text to classify. Non-string input (for example NaN from a
        missing stop name) is accepted.

    Returns
    -------
    str
        "CBD" if any pattern in ``CBD_KEYWORDS`` matches (case-insensitive),
        otherwise "Non-CBD". Non-string input returns "Non-CBD".
    """
    # re.search raises TypeError on floats/None; a missing corridor cannot
    # match any keyword, so it is reported as "Non-CBD" instead of crashing.
    if not isinstance(corridor, str):
        return "Non-CBD"
    if any(re.search(k, corridor, flags=re.IGNORECASE) for k in CBD_KEYWORDS):
        return "CBD"
    return "Non-CBD"

# Tag each M101 row, then preview the corridor -> area mapping.
m101["CBD_Area"] = m101["Corridor"].map(tag_cbd)
m101.loc[:, ["Corridor", "CBD_Area"]].head()


In [None]:
# Hour-of-day windows used by the summaries below.
allday = [hour for hour in range(6, 22)]  # hours 6 through 21 inclusive
am_hours = list(range(7, 10))             # 7, 8, 9
pm_hours = list(range(16, 19))            # 16, 17, 18

def summarize_zone(data, hours, label):
    """Trip-weighted speed summary per (CBD_Area, Phase) for one hour window.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "Hour of Day", "CBD_Area", "Phase",
        "Average Road Speed" and "Bus Trip Count".
    hours : list-like
        Values of "Hour of Day" to include.
    label : str
        Written to the "Window" column of every output row.

    Returns
    -------
    pandas.DataFrame
        One row per (CBD_Area, Phase) with columns "Mean mph", "P10 mph",
        "Crawl<5 share", "Trips (weight sum)" and "Window". An empty frame
        with those columns is returned when no usable row falls in the window.

    Notes
    -----
    Rows with a missing speed, a missing or non-positive trip count, or a
    missing grouping key are dropped before aggregating. "Trips (weight sum)"
    is therefore the sum of the weights actually used.
    """
    speed_col, trips_col = "Average Road Speed", "Bus Trip Count"
    group_keys = ["CBD_Area", "Phase"]
    metric_names = ["Mean mph", "P10 mph", "Crawl<5 share", "Trips (weight sum)"]

    in_window = data["Hour of Day"].isin(hours)
    # A NaN speed compares False against `< 5`, so it would count as
    # "not crawling" while still adding weight to the denominator; a group
    # whose weights sum to zero makes np.average raise ZeroDivisionError.
    usable = (
        data[speed_col].notna()
        & (data[trips_col] > 0)
        & data[group_keys].notna().all(axis=1)
    )
    sub = data[in_window & usable]

    if sub.empty:
        return pd.DataFrame(columns=group_keys + metric_names + ["Window"])

    def _zone_metrics(gg):
        """Weighted metrics for one (CBD_Area, Phase) group."""
        speed, trips = gg[speed_col], gg[trips_col]
        return pd.Series({
            "Mean mph": wavg(speed, trips),
            "P10 mph":  wpercentile(speed, trips, 0.10),
            "Crawl<5 share": wavg((speed < 5).astype(float), trips),
            "Trips (weight sum)": trips.sum()
        })

    # Selecting the two value columns keeps the grouping columns out of the
    # frames handed to apply (operating on them is deprecated in recent pandas).
    g = sub.groupby(group_keys)[[speed_col, trips_col]].apply(_zone_metrics).reset_index()
    g["Window"] = label
    return g


In [None]:
# One summary table per time window, built from (hours, label) pairs.
# NOTE(review): "6–22" names an exclusive end (hours 6..21) while "7–9" and
# "16–18" name inclusive ends -- labels are kept as-is; confirm intended wording.
all_tbl, am_tbl, pm_tbl = (
    summarize_zone(m101, window_hours, window_label)
    for window_hours, window_label in (
        (allday, "All-day (6–22)"),
        (am_hours, "AM Rush (7–9)"),
        (pm_hours, "PM Rush (16–18)"),
    )
)

# Stack the three windows into a single long table and show it.
summary = pd.concat([all_tbl, am_tbl, pm_tbl], ignore_index=True)
display(summary.round(3))


In [None]:
def add_pct_changes(tbl: pd.DataFrame, metric: str):
    """Pivot one metric to a column per phase and add percent-change columns.

    Parameters
    ----------
    tbl : pandas.DataFrame
        Long table with "Window", "CBD_Area", "Phase" and ``metric`` columns.
    metric : str
        Name of the column to pivot and compare.

    Returns
    -------
    pandas.DataFrame
        One row per (Window, CBD_Area) with one column per phase, plus:

        * "%Δ {metric} (ACE vs Pre)" when both "Pre-ACE" and "ACE only" exist
        * "%Δ {metric} (CBD vs ACE)" when both "ACE only" and "ACE + CBD" exist

        A percent change is NaN where its baseline is zero or missing.
    """
    piv = tbl.pivot_table(index=["Window","CBD_Area"], columns="Phase", values=metric).reset_index()

    # (suffix, later phase, baseline phase)
    comparisons = [
        ("ACE vs Pre", "ACE only", "Pre-ACE"),
        ("CBD vs ACE", "ACE + CBD", "ACE only"),
    ]
    for suffix, later, earlier in comparisons:
        if {later, earlier}.issubset(piv.columns):
            # A zero baseline (plausible for a share metric) would give
            # +/-inf; mask it so the change is reported as undefined (NaN).
            baseline = piv[earlier].where(piv[earlier] != 0)
            piv[f"%Δ {metric} ({suffix})"] = (piv[later] - baseline) / baseline * 100
    return piv

# Percent-change tables for each summary metric.
mean_changes, p10_changes, crawl_changes = (
    add_pct_changes(summary, metric_name)
    for metric_name in ("Mean mph", "P10 mph", "Crawl<5 share")
)

# Print a heading and show each table; the share table keeps one extra decimal.
for heading, changes, decimals in (
    ("\nMean mph — % changes", mean_changes, 3),
    ("\nP10 mph — % changes (reliability)", p10_changes, 3),
    ("\nCrawl<5 share — % changes", crawl_changes, 4),
):
    print(heading)
    display(changes.round(decimals))


In [None]:
def segment_pm_table(data, hours=None):
    """Per-segment trip-weighted mean speed by phase, with percent changes.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "Hour of Day", "CBD_Area", "Corridor", "Direction",
        "Phase", "Average Road Speed" and "Bus Trip Count".
    hours : list-like, optional
        Values of "Hour of Day" to include. Defaults to the notebook-level
        ``pm_hours``.

    Returns
    -------
    pandas.DataFrame
        One row per (CBD_Area, Corridor, Direction) with one column per phase
        holding the trip-weighted mean speed, plus "%Δ (ACE vs Pre)" and
        "%Δ (CBD vs ACE)". Both delta columns are always present; they are
        NaN where a phase is unavailable or the baseline speed is zero.

    Notes
    -----
    Rows with a missing speed, a missing or non-positive trip count, or a
    missing grouping key are dropped before aggregating.
    """
    if hours is None:
        hours = pm_hours

    speed_col, trips_col = "Average Road Speed", "Bus Trip Count"
    segment_keys = ["CBD_Area", "Corridor", "Direction"]
    group_keys = segment_keys + ["Phase"]
    # (output column, later phase, baseline phase)
    delta_specs = [
        ("%Δ (ACE vs Pre)", "ACE only", "Pre-ACE"),
        ("%Δ (CBD vs ACE)", "ACE + CBD", "ACE only"),
    ]

    in_window = data["Hour of Day"].isin(hours)
    usable = (
        data[speed_col].notna()
        & (data[trips_col] > 0)
        & data[group_keys].notna().all(axis=1)
    )
    sub = data.loc[in_window & usable, group_keys + [speed_col, trips_col]].copy()

    if sub.empty:
        return pd.DataFrame(columns=segment_keys + [spec[0] for spec in delta_specs])

    # Weighted mean = sum(speed * trips) / sum(trips), computed with grouped
    # sums instead of a per-group Python callback.
    sub["_weighted_speed"] = sub[speed_col] * sub[trips_col]
    totals = sub.groupby(group_keys)[["_weighted_speed", trips_col]].sum()
    g = (totals["_weighted_speed"] / totals[trips_col]).reset_index(name="Avg_mph")

    piv = g.pivot_table(index=segment_keys, columns="Phase", values="Avg_mph").reset_index()

    # % deltas
    for col_name, later, earlier in delta_specs:
        if {later, earlier}.issubset(piv.columns):
            baseline = piv[earlier].where(piv[earlier] != 0)  # avoid +/-inf
            piv[col_name] = (piv[later] - baseline) / baseline * 100
        else:
            # Keep the column so callers can sort by it unconditionally.
            piv[col_name] = float("nan")
    return piv

pm_segments = segment_pm_table(m101)

# Within each CBD_Area, list segments from largest to smallest
# "%Δ (CBD vs ACE)" and show the first 20 rows.
pm_ranked = pm_segments.sort_values(
    by=["CBD_Area", "%Δ (CBD vs ACE)"],
    ascending=[True, False],
)
display(pm_ranked.round(3).head(20))


In [None]:
# CSV exports. A single mapping drives both the writes and the printed list,
# so the reported filenames cannot drift from the files actually written.
# filename -> (table, decimals to round to)
exports = {
    "M101_CBD_vs_NonCBD_Summary_ByWindow.csv": (summary, 3),
    "M101_CBD_vs_NonCBD_MeanPctChanges.csv": (mean_changes, 3),
    "M101_CBD_vs_NonCBD_P10PctChanges.csv": (p10_changes, 3),
    "M101_CBD_vs_NonCBD_CrawlPctChanges.csv": (crawl_changes, 4),
    "M101_CBD_vs_NonCBD_PM_SegmentDrilldown.csv": (pm_segments, 3),
}

for export_name, (export_table, export_decimals) in exports.items():
    export_table.round(export_decimals).to_csv(export_name, index=False)

# Printed only after every write has succeeded.
print("\nExports written:")
for export_name in exports:
    print(f" - {export_name}")
