In [None]:
import pandas as pd
import numpy as np
import re
import requests
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from pathlib import Path
import os

In [None]:
# canonical output folders; each is created on first run and reused afterwards
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(exist_ok=True)

FIG_DIR = Path("figures")
FIG_DIR.mkdir(exist_ok=True)

# yearly traffic totals written by Data Check.ipynb
TRAFFIC_TOTALS_CSV = OUT_DIR.joinpath("aadt_davidson_year_totals.csv")

In [None]:
# Loads the per-year traffic totals, normalizes the headers, and validates that
# every year from 2013 through 2023 is present before anything downstream runs.
if not TRAFFIC_TOTALS_CSV.exists():
    # explicit raise rather than `assert`: assertions are stripped under `python -O`
    raise FileNotFoundError(
        f"Missing {TRAFFIC_TOTALS_CSV}. Open Data Check.ipynb and run either downloader first."
    )

t = pd.read_csv(TRAFFIC_TOTALS_CSV)

# normalizes expected columns
if "AADT_YEAR" in t.columns and "year" not in t.columns:
    t = t.rename(columns={"AADT_YEAR": "year"})
if "year" not in t.columns:
    raise ValueError(
        f"{TRAFFIC_TOTALS_CSV} has no 'year' or 'AADT_YEAR' column; found {list(t.columns)}"
    )
if "aadt_total" not in t.columns and "AADT" in t.columns:
    # no pre-computed total: collapse the AADT rows to one sum per year
    t = (t.groupby("year", as_index=False)["AADT"]
           .sum()
           .rename(columns={"AADT": "aadt_total"}))
if "aadt_total" not in t.columns:
    raise ValueError(
        f"{TRAFFIC_TOTALS_CSV} has no 'aadt_total' or 'AADT' column; found {list(t.columns)}"
    )

# coerce to numbers, drop unparseable rows, keep one row per plausible year
traffic_year = (
    t.assign(year=pd.to_numeric(t["year"], errors="coerce"),
             aadt_total=pd.to_numeric(t["aadt_total"], errors="coerce"))
     .dropna(subset=["year","aadt_total"])
     .assign(year=lambda d: d["year"].round().astype(int))
     .query("1990 <= year <= 2035")
     .drop_duplicates(subset=["year"], keep="last")
     .sort_values("year")
     .reset_index(drop=True)
)

years = traffic_year["year"].tolist()
years_present = set(years)  # built once instead of once per candidate year
missing = [y for y in range(2013, 2024) if y not in years_present]
if missing:
    raise ValueError(
        f"Traffic totals missing years {missing}. "
        "Re-run the RAW POINTS (paged) downloader in Data Check.ipynb."
    )

print("traffic year coverage:", years[0], "→", years[-1], "| distinct years:", len(years))
display(traffic_year.head())

In [None]:
# quick echo of the earliest and latest year present in the traffic table
_traffic_years = traffic_year["year"]
print("traffic years:", _traffic_years.min(), "→", _traffic_years.max())

In [None]:
# Settings for building pop_clean = [['year','population_total']]: local CSVs are
# used first, then any missing ACS end-year (2014–2023) is filled from the API.
SEARCH_DIRS = [Path(name) for name in (".", "data", "data_raw", "outputs")]
TARGET_YEARS = [*range(2014, 2024)]                  # ACS 5-year end-years we want
GEOID_DAVIDSON = dict(state="47", county="037")      # Tennessee / Davidson

def _read_csv_safe(p: Path):
    try:
        return pd.read_csv(p, low_memory=False)
    except Exception:
        try:
            return pd.read_csv(p, encoding="latin-1", low_memory=False)
        except Exception:
            return None

def _guess_year_from_filename(p: Path) -> int | None:
    m = re.search(r"(20\d{2}|19\d{2})", p.stem)
    return int(m.group(1)) if m else None

def _find_pop_col(df: pd.DataFrame) -> str | None:
    # prefer standard ACS total-pop columns
    preferred = ["B01003_001E", "DP05_0001E", "S0101_C01_001E"]
    for c in df.columns:
        if c in preferred: 
            return c
    # otherwise any column with 'pop' in the name, else widest numeric column
    poplike = [c for c in df.columns if "pop" in str(c).lower()]
    if poplike: 
        return poplike[0]
    num = df.select_dtypes(include=[np.number]).columns.tolist()
    return num[0] if num else None

def _select_davidson(df: pd.DataFrame) -> pd.DataFrame:
    df_cols = {c.lower(): c for c in df.columns}
    # by GEOID 47037 (often appears as 'state','county' OR 'GEO_ID'/'GEOID')
    if "state" in df_cols and "county" in df_cols:
        st, co = df_cols["state"], df_cols["county"]
        return df[(df[st].astype(str)==GEOID_DAVIDSON["state"]) & (df[co].astype(str)==GEOID_DAVIDSON["county"])]
    for key in ("GEOID","GEO_ID","geoid","geo_id"):
        if key in df.columns:
            mask = df[key].astype(str).str.contains("47037", na=False)
            sub = df[mask]
            if not sub.empty: return sub
    if "NAME" in df.columns:
        mask = df["NAME"].astype(str).str.contains("Davidson County", case=False, na=False) & \
               df["NAME"].astype(str).str.contains("Tennessee", case=False, na=False)
        sub = df[mask]
        if not sub.empty: return sub
    # last resort: return as-is
    return df

In [None]:
def _extract_year_row(p: Path, df: pd.DataFrame) -> dict | None:
    """Reduce one candidate CSV to a single ``{'year', 'population_total'}`` record.

    The frame is first narrowed with ``_select_davidson`` and the population
    column chosen with ``_find_pop_col``. The year is resolved from, in order:
    an explicit year/end_year/acs_year column, the file name, and finally the
    first few NAME values.

    When an explicit year column is used, the year and the population are read
    from the same row (the first row with a year in 1990–2035 and a numeric
    population), so gaps in either column cannot pair values from different rows.

    Returns None when the selection is empty, no population column or numeric
    population value exists, or no year can be determined.
    """
    df = _select_davidson(df.copy())
    if df.empty:
        return None
    pop_col = _find_pop_col(df)
    if pop_col is None:
        return None

    population = pd.to_numeric(df[pop_col], errors="coerce")
    has_pop = population.notna().to_numpy()
    if not has_pop.any():
        return None

    year = None
    pop_value = None
    # try explicit year column
    for cand in df.columns:
        # str() guards against non-string column labels
        if str(cand).lower() not in ("year", "end_year", "acs_year"):
            continue
        col_years = pd.to_numeric(df[cand], errors="coerce")
        usable = has_pop & col_years.between(1990, 2035).to_numpy()
        if usable.any():
            pos = int(np.argmax(usable))  # position of the first usable row
            year = int(col_years.iloc[pos])
            pop_value = population.iloc[pos]
            break

    # try filename year
    if year is None:
        year = _guess_year_from_filename(p)
    # try NAME like "... 2018 5-year"
    if year is None and "NAME" in df.columns:
        m = re.search(r"(20\d{2}|19\d{2})", " ".join(df["NAME"].astype(str).head(3).tolist()))
        if m:
            year = int(m.group(1))
    if year is None:
        return None

    if pop_value is None:
        # year came from the file name / NAME: use the first numeric population
        pop_value = population[has_pop].iloc[0]
    return {"year": year, "population_total": int(round(pop_value))}


In [None]:
# Scans the search folders for CSVs that look like population/ACS extracts and
# keeps at most one record per year (first file in sorted path order wins).
NAME_TOKENS = ("acs", "dp05", "b01003", "population", "usc")

local_rows = []
candidates = []
for base in SEARCH_DIRS:
    if base.exists():
        for p in base.rglob("*.csv"):
            # only file names that hint at ACS / population content are considered
            if any(tok in p.name.lower() for tok in NAME_TOKENS):
                candidates.append(p)

seen_years = set()
for p in sorted(set(candidates)):
    df = _read_csv_safe(p)
    if df is None:
        continue
    rec = _extract_year_row(p, df)
    if rec and rec["year"] in range(2010, 2036) and rec["year"] not in seen_years:
        local_rows.append(rec)
        seen_years.add(rec["year"])

# explicit columns keep the frame well-formed (and sortable) when nothing was found;
# without them an empty result has no 'year' column and sort_values raises KeyError
pop_local = (pd.DataFrame(local_rows, columns=["year", "population_total"])
               .sort_values("year")
               .reset_index(drop=True))
print("local population years found:", pop_local["year"].tolist() if not pop_local.empty else "NONE")

In [None]:
# ACS end-years that no local file covered are requested from the Census API
if pop_local.empty:
    local_years = set()
else:
    local_years = set(pop_local["year"].tolist())
need_years = sorted(set(TARGET_YEARS) - local_years)

api_rows = []
for y in need_years:
    url = f"https://api.census.gov/data/{y}/acs/acs5"
    params = {
        "get": "NAME,B01003_001E",
        "for": f"county:{GEOID_DAVIDSON['county']}",
        "in": f"state:{GEOID_DAVIDSON['state']}",
    }
    # the API key is optional; it is attached only when the variable is set
    api_key = os.getenv("CENSUS_API_KEY")
    if api_key:
        params["key"] = api_key
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    js = r.json()
    # js[0] is the header row; js[1][1] is the B01003_001E value of the data row
    api_rows.append({"year": y, "population_total": int(js[1][1])})

if api_rows:
    pop_api = pd.DataFrame(api_rows)
else:
    pop_api = pd.DataFrame(columns=["year","population_total"])


In [None]:
# local rows first, API rows second; if a year appears twice the later row wins
pop_stacked = pd.concat([pop_local, pop_api], ignore_index=True)
pop_clean = pop_stacked.drop_duplicates(subset=["year"], keep="last")
pop_clean = pop_clean.sort_values("year").reset_index(drop=True)

print("final ACS end-years:", pop_clean["year"].tolist())
display(pop_clean.tail(10))

# save for reuse
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(exist_ok=True)
pop_csv_path = OUT_DIR / "pop_acs5_davidson_2014_2023.csv"
pop_clean.sort_values("year").to_csv(pop_csv_path, index=False)

In [None]:
# Fetches county total population (B01003_001E) for every ACS 5-year end-year
# straight from the Census API and rebuilds pop_clean from the responses.
STATE_FIPS  = "47"   # Tennessee
COUNTY_FIPS = "037"  # Davidson
YEARS = list(range(2014, 2024))  # ACS 5-year end-years

# optional: use your key if you have one set (read once, not once per request)
census_key = os.getenv("CENSUS_API_KEY")

rows = []
for y in YEARS:
    url = f"https://api.census.gov/data/{y}/acs/acs5"
    params = {
        "get": "NAME,B01003_001E",  # total population
        "for": f"county:{COUNTY_FIPS}",
        "in":  f"state:{STATE_FIPS}",
    }
    if census_key:
        params["key"] = census_key
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    js = r.json()
    # js[0] is the header row; js[1][1] is the B01003_001E value of the data row
    rows.append({"year": y, "population_total": int(js[1][1])})

pop_clean = (pd.DataFrame(rows)
               .sort_values("year")
               .reset_index(drop=True))

# persist for reproducibility; letting pandas open the file avoids the doubled
# carriage returns that write_text(to_csv()) produces on Windows
pop_clean.to_csv(OUT_DIR / "pop_acs5_davidson_2014_2023.csv", index=False)

val_2023 = int(pop_clean.loc[pop_clean["year"]==2023,"population_total"].iloc[0])

print("ACS (county) end-years:", pop_clean["year"].tolist())
print("2023 population:", val_2023)
display(pop_clean.tail(10))

# sanity: fail fast if 2023 is not ~700k
assert 650_000 <= val_2023 <= 800_000, f"Unexpected 2023 population {val_2023} — check ACS fetch."

In [None]:
# Population prep — creates pop_clean with ['year','population_total']
# Looks for a CSV under data/, data_raw/, or outputs/ that likely contains population by year.

SEARCH_DIRS = [Path("data"), Path("data_raw"), Path("outputs")]

# sorted + de-duplicated: rglob order depends on the filesystem, and the first
# usable candidate becomes pop_clean, so a stable order keeps re-runs reproducible
candidates = sorted({
    p
    for d in SEARCH_DIRS if d.exists()
    for p in d.rglob("*.csv")
    if "pop" in p.name.lower() or "acs" in p.name.lower()
})

if not candidates:
    raise FileNotFoundError(
        "No candidate population CSVs found under data/, data_raw/, or outputs/. "
        "Put a CSV with 'pop' or 'acs' in the filename, or use the manual fallback (see below)."
    )

def _pick_year_col(df):
    # prefers explicit 'year' column; fallback = a column that looks like years 1990–2035
    for c in df.columns:
        if "year" in str(c).lower():
            return c
    for c in df.columns:
        s = pd.to_numeric(df[c], errors="coerce")
        if s.between(1990, 2035).mean() > 0.5:
            return c

def _pick_pop_col(df):
    # prefers a column name containing 'pop'; fallback = numeric column with the largest value range
    for c in df.columns:
        if "pop" in str(c).lower():
            return c
    best, best_span = None, -1
    for c in df.columns:
        s = pd.to_numeric(df[c], errors="coerce")
        if s.notna().mean() > 0.5:
            span = (s.max() - s.min()) if s.notna().any() else -1
            if span > best_span:
                best, best_span = c, span
    return best

# Walk the candidate files and stop at the first one that yields a usable
# (year, population_total) table.
pop_clean = None
used_path = None
for csv_path in candidates:
    try:
        frame = pd.read_csv(csv_path, low_memory=False)
    except Exception:
        # unreadable file: move on to the next candidate
        continue

    year_col, pop_col = _pick_year_col(frame), _pick_pop_col(frame)
    if year_col is None or pop_col is None:
        continue

    pair = frame[[year_col, pop_col]].copy()
    pair.columns = ["year", "population_total"]
    pair = pair.assign(
        year=pd.to_numeric(pair["year"], errors="coerce"),
        population_total=pd.to_numeric(pair["population_total"], errors="coerce"),
    ).dropna(subset=["year","population_total"])
    if pair.empty:
        continue

    # one row per plausible year, sorted ascending
    pair = pair.assign(year=lambda d: d["year"].round().astype(int))
    pair = pair.query("1990 <= year <= 2035")
    pair = pair.drop_duplicates(subset=["year"], keep="last")
    pop_clean = pair.sort_values("year").reset_index(drop=True)
    used_path = csv_path
    break

if pop_clean is None:
    raise FileNotFoundError(
        "Could not auto-detect a population file with clear 'year' and 'population' columns.\n"
        "Manual fallback: set CSV_PATH below and build pop_clean from it."
    )

print("population source:", used_path)
display(pop_clean.head(10))

In [None]:
# builds the ACS 5-year end-year table expected by the merge
# NOTE: this expects pop_clean to exist with columns ['year','population_total'] (as prepared earlier in this notebook).
if "pop_clean" not in globals():
    raise NameError(
        "pop_clean is not defined. Run your population prep cell first to create pop_clean "
        "(with columns ['year','population_total'])."
    )

population_year = (
    pop_clean.rename(columns={"population_total": "population"})[["year","population"]]
             # the lambda must wrap the to_numeric call: assign() invokes it with the
             # frame, whereas to_numeric(lambda ...) receives a function and raises TypeError
             .assign(year=lambda d: pd.to_numeric(d["year"], errors="coerce"),
                     population=lambda d: pd.to_numeric(d["population"], errors="coerce"))
             .dropna(subset=["year","population"])
             .assign(year=lambda d: d["year"].round().astype(int))
             .query("1990 <= year <= 2035")
             .drop_duplicates(subset=["year"], keep="last")
             .sort_values("year")
             .reset_index(drop=True)
)

print("ACS end-years:", population_year["year"].astype(int).tolist())
display(population_year)

In [None]:
# --- diagnostics: why are the plots a single point? ---
# 1) which years does the traffic table cover?
t_years = sorted(traffic_year["year"].astype(int).tolist())
print("traffic years:", t_years[0], "→", t_years[-1], "| count:", len(t_years))

# 2) which end-years have a full 5-row window behind them?
ty_idx = traffic_year.set_index("year").sort_index()
t5 = ty_idx["aadt_total"].rolling(window=5, min_periods=5).mean()
have_5y = t5.dropna().index.astype(int).tolist()
print("traffic 5y means available for end-years:", have_5y)

# 3) which ACS end-years does the population table carry?
p_years = sorted(population_year["year"].astype(int).tolist())
print("population ACS end-years:", p_years)

# 4) end-years present on both sides are the ones that survive the merge
overlap = sorted(set(have_5y).intersection(p_years))
print("overlap (plotted points):", overlap)

In [None]:
# Aligns yearly traffic to ACS 5-year end-years and derives per-capita traffic.

# keep only the columns the merge needs
traffic_year_fix = traffic_year[["year","aadt_total"]].copy()
population_year_fix = population_year[["year","population"]].copy()

# strict 5-row rolling mean over the year-sorted traffic totals
ty = traffic_year_fix.set_index("year").sort_index()
ty["aadt_total_5y_mean"] = ty["aadt_total"].rolling(window=5, min_periods=5).mean()

# both sides are keyed by the ACS end-year label
traffic_5y = ty[["aadt_total_5y_mean"]].rename_axis("end_year").reset_index()
pop_5y = population_year_fix.rename(columns={"year": "end_year"})

# left-merge onto population, add the derived columns, then drop end-years
# that lack a full 5-year traffic mean
merged_5y = pop_5y.merge(traffic_5y, on="end_year", how="left")
merged_5y = merged_5y.assign(
    period=lambda d: d["end_year"].map(lambda end: f"{end-4}–{end}"),
    aadt_per_capita_5y=lambda d: d["aadt_total_5y_mean"] / d["population"],
)
df5 = (merged_5y.dropna(subset=["aadt_total_5y_mean","population"])
                .sort_values("end_year")
                .reset_index(drop=True))

print("rows after 5-year alignment:", len(df5))
display(df5)


In [None]:
# Re-derives the 5-year traffic means and previews the 2023 per-capita value.
aadt_by_year = traffic_year.set_index("year")["aadt_total"]
t5 = aadt_by_year.rolling(5, min_periods=5).mean().dropna()

t5_end_years = t5.index.astype(int)
pop_end_years = population_year["year"].astype(int)
overlap = sorted(set(t5_end_years) & set(pop_end_years))

print("5y traffic end-years:", t5_end_years.tolist())
print("population end-years:", pop_end_years.tolist())
print("overlap:", overlap)  # should show multiple: [2017, 2018, …, 2023]

population_by_year = population_year.set_index("year")
preview_2023 = float(t5.loc[2023]) / float(population_by_year.loc[2023,"population"])
print("2023 per-capita preview:", preview_2023)

In [None]:
# Writes the merged table and the four figures used in the deck.

# CSV
merged_csv = OUT_DIR / "traffic_population_merged_5y.csv"
df5.to_csv(merged_csv, index=False)
print("saved:", merged_csv)


def _save_line_chart(column, title, ylabel, filename):
    """Plot df5[column] against end_year with point markers and save it under FIG_DIR."""
    fig, ax = plt.subplots()
    ax.plot(df5["end_year"], df5[column], marker="o")
    ax.set_title(title)
    ax.set_xlabel("End year")
    ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig(FIG_DIR/filename, dpi=200)


# figures
# 1) population trend (5y end-years)
_save_line_chart("population", "Population (ACS 5-year end-years)",
                 "Population", "population_trend_5y.png")

# 2) traffic trend (5y mean)
_save_line_chart("aadt_total_5y_mean", "Traffic (5-year mean AADT)",
                 "Total AADT (5-year mean)", "traffic_trend_5y.png")

# 3) population vs traffic (5y), each point labelled with its end-year
fig, ax = plt.subplots()
ax.scatter(df5["population"], df5["aadt_total_5y_mean"])
for end_year, pop, aadt in zip(df5["end_year"], df5["population"], df5["aadt_total_5y_mean"]):
    ax.annotate(str(int(end_year)), (pop, aadt), xytext=(4,4), textcoords="offset points")
ax.set_title("Population vs Traffic (5-year mean)")
ax.set_xlabel("Population (ACS end-year)")
ax.set_ylabel("Total AADT (5-year mean)")
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig(FIG_DIR/"pop_vs_traffic_scatter_5y.png", dpi=200)

# 4) per-capita congestion proxy (5y)
_save_line_chart("aadt_per_capita_5y", "Traffic per Capita (5-year mean)",
                 "AADT per person", "per_capita_congestion_5y.png")

print("saved figures to:", FIG_DIR)

In [None]:
# Summary numbers for 2017→2023; requires df5 with columns
# end_year, population, aadt_total_5y_mean, aadt_per_capita_5y
import numpy as np


def _pct_change_2017_2023(column):
    """Percent change of df5[column] between end-years 2017 and 2023."""
    value_2023 = df5.loc[df5["end_year"].eq(2023), column].iloc[0]
    value_2017 = df5.loc[df5["end_year"].eq(2017), column].iloc[0]
    return (value_2023 / value_2017 - 1) * 100


stats = {
    label: _pct_change_2017_2023(column)
    for label, column in (
        ("Population Δ% (2017→2023)", "population"),
        ("AADT Δ% (2017→2023)", "aadt_total_5y_mean"),
        ("AADT per capita Δ% (2017→2023)", "aadt_per_capita_5y"),
    )
}

# Pearson correlation between population and 5-year mean traffic
corr = np.corrcoef(df5["population"], df5["aadt_total_5y_mean"])[0,1]

print({k: f"{v:.2f}%" for k,v in stats.items()})
print(f"Pop–AADT correlation (5y): {corr:.3f}")


In [None]:
# Scatter of population vs 5-year mean traffic with a least-squares line on top.
x = df5["population"].values
y = df5["aadt_total_5y_mean"].values
line_coeffs = np.polyfit(x, y, 1)
m, b = line_coeffs  # slope, intercept

fig, ax = plt.subplots()
ax.scatter(x, y)
for end_year, pop, aadt in zip(df5["end_year"], df5["population"], df5["aadt_total_5y_mean"]):
    ax.annotate(str(int(end_year)), (pop, aadt), xytext=(4,4), textcoords="offset points")

# draw the fitted line across the observed population range
xx = np.linspace(x.min(), x.max(), 100)
ax.plot(xx, m*xx + b)

ax.set_title("Population vs Traffic (5-year mean) with trend line")
ax.set_xlabel("Population (ACS end-year)")
ax.set_ylabel("Total AADT (5-year mean)")
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig(FIG_DIR/"pop_vs_traffic_scatter_5y_trend2.png", dpi=200)

In [None]:
# Per-capita traffic over time, with the 2020–2021 span shaded.
fig, ax = plt.subplots()
ax.plot(df5["end_year"], df5["aadt_per_capita_5y"], marker="o")
ax.set_title("Traffic per Capita (5-year mean)")
ax.set_xlabel("End year")
ax.set_ylabel("AADT per person")
ax.axvspan(2020, 2021, alpha=0.15)  # lightly highlight COVID period
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig(FIG_DIR/"per_capita_congestion_5y_covidshade2.png", dpi=200)