In [None]:
import pandas as pd
import numpy as np
import re
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from pathlib import Path

In [None]:
# --- Inputs -----------------------------------------------------------------
# Both paths are relative, so they resolve against the working directory.
CSV_PATH = "Traffic_Lines_-5158403106151148634.csv"      # read with pandas
GEO_PATH = "Traffic_Lines_-8205076846245197459.geojson"  # read with geopandas

# --- Join keys --------------------------------------------------------------
# Name of the identifier column in the CSV and in the GeoJSON respectively.
CSV_ID_COL = GEO_ID_COL = "OBJECTID"

# --- Outputs ----------------------------------------------------------------
# Common stem for every file written by this notebook; each artifact swaps in
# its own suffix via Path.with_suffix().
OUT_BASENAME = Path("aadt_davidson")

In [None]:
CANDIDATE_ID_COLS = [
    "STATION","STATION_ID","STATIONID","COUNT_STATION","COUNT_STATION_ID",
    "LOC_ID","LOCATION_ID","SITE_NO","SITENO","POINT_ID","POINTID",
    "SEGMENT_ID","SEGMENTID","GISID","GIS_ID","ID","OBJECTID"
]

def _auto_id(cols):
    low = {c.lower(): c for c in cols}
    for cand in CANDIDATE_ID_COLS:
        if cand.lower() in low:
            return low[cand.lower()]
    return None

def _norm_id(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip()
    s = s.str.replace(r"\s+", "", regex=True)
    s = s.str.upper()
    s = s.str.replace(r"^STATION[_\-]?", "", regex=True)
    return s

In [None]:
YEAR_COL_RE = re.compile(r"^(?:AADT[_\-\s]*)?(19|20)\d{2}$")

def _detect_year_cols(df: pd.DataFrame):
    yc = [c for c in df.columns if YEAR_COL_RE.match(str(c).strip())]
    if not yc:
        yc = [c for c in df.columns if re.fullmatch(r"(19|20)\d{2}", str(c).strip())]
    return yc

def _year_from(colname: str) -> int:
    m = re.search(r"(19|20)\d{2}", str(colname))
    if not m:
        raise ValueError(f"Could not parse year from column: {colname}")
    return int(m.group(0))

In [None]:
def _tidy_from_wide(df: pd.DataFrame, id_col: str, keep_cols=None) -> pd.DataFrame:
    """Reshape an AADT table into one row per (id, year) observation.

    Two input layouts are supported, chosen by ``_detect_year_cols``:

    * Wide: one column per year. The year columns are melted into ``year`` /
      ``aadt`` rows; ``year`` is parsed from the column label.
    * Tall: no year columns. A year-like column (``year``, ``Year``, or the
      first column whose name contains "year") and an AADT column (``AADT``,
      ``aadt``, ``AADT_TOTAL``, ``TOTAL_AADT`` or ``value``) are renamed to
      ``year`` / ``aadt``.

    Parameters
    ----------
    df : pd.DataFrame
        Source table.
    id_col : str
        Name of the identifier column to carry through.
    keep_cols : list of str, optional
        Extra columns to carry through. Names absent from ``df`` are ignored,
        and names that duplicate an already-selected column are dropped so the
        result never has repeated column labels.

    Returns
    -------
    pd.DataFrame
        Columns ``[id_col, "year", "aadt", *kept]``. ``aadt`` is numeric with
        unparseable values coerced to NaN. ``year`` is nullable ``Int64`` in the
        tall path and parsed from the column labels in the wide path.

    Raises
    ------
    ValueError
        In the tall path, when no year-like column or no AADT column is found.
    """
    keep_cols = keep_cols or []
    year_cols = _detect_year_cols(df)

    def _extras(reserved):
        # keep_cols that exist in df, in order, minus anything already taken.
        taken = set(reserved)
        picked = []
        for col in keep_cols:
            if col in df.columns and col not in taken:
                taken.add(col)
                picked.append(col)
        return picked

    # Tall format (e.g., 'AADT Year' + 'AADT')
    if not year_cols:
        year_candidates = [c for c in df.columns if "year" in str(c).lower()]
        year_col = next((c for c in ["year", "Year"] + year_candidates if c in df.columns), None)
        aadt_col = next((c for c in ["AADT", "aadt", "AADT_TOTAL", "TOTAL_AADT", "value"] if c in df.columns), None)
        if not (year_col and aadt_col):
            raise ValueError("Need a year-like column and an AADT column.")
        extras = _extras([id_col, year_col, aadt_col, "year", "aadt"])
        out = df[[id_col, year_col, aadt_col] + extras].copy()
        out = out.rename(columns={year_col: "year", aadt_col: "aadt"})
        out["year"] = pd.to_numeric(out["year"], errors="coerce").astype("Int64")
        out["aadt"] = pd.to_numeric(out["aadt"], errors="coerce")
        return out

    # Wide format (year columns across).
    # Only the columns that survive into the result are used as id_vars, so
    # unused columns are not replicated once per year column during the melt.
    extras = _extras([id_col, "year", "aadt", "year_raw", *year_cols])
    long_df = df.melt(
        id_vars=[id_col] + extras,
        value_vars=year_cols,
        var_name="year_raw",
        value_name="aadt",
    )
    # Parse each year label once, then map it onto the melted rows.
    year_of = {col: _year_from(col) for col in year_cols}
    long_df["year"] = long_df["year_raw"].map(year_of)
    long_df = long_df[[id_col, "year", "aadt"] + extras].copy()
    long_df["aadt"] = pd.to_numeric(long_df["aadt"], errors="coerce")
    return long_df

In [None]:
# Peek at the schema of both inputs to decide which column to join on.
# Only the first five records of each file are loaded: `nrows=5` for the CSV
# and `rows=5` for the GeoJSON, so the full geometry file is not parsed here.
df_head = pd.read_csv(CSV_PATH, nrows=5)
print("CSV columns:", sorted(df_head.columns.tolist()))
csv_ids = [c for c in df_head.columns if c.upper() in CANDIDATE_ID_COLS]
print("Likely CSV IDs:", csv_ids)

gdf_head = gpd.read_file(GEO_PATH, rows=5)
print("GEO columns:", sorted(gdf_head.columns.tolist()))
geo_ids = [c for c in gdf_head.columns if c.upper() in CANDIDATE_ID_COLS]
print("Likely GEO IDs:", geo_ids)

In [None]:
# Load the full CSV and normalise its identifier column in place.
df = pd.read_csv(CSV_PATH)
df[CSV_ID_COL] = _norm_id(df[CSV_ID_COL])

# Descriptive columns to carry along, limited to those the CSV actually has.
keep_cols = [
    name
    for name in ("Route ID", "ROUTE", "ROAD_NAME", "ROUTE_NAME", "ROADWAY", "DIR", "DIRECTION")
    if name in df.columns
]

# Reshape to one row per (station, year) and drop rows without an AADT value.
tidy = _tidy_from_wide(df, id_col=CSV_ID_COL, keep_cols=keep_cols)
tidy = tidy.rename(columns={CSV_ID_COL: "station_id"})
tidy = tidy.dropna(subset=["aadt"]).copy()

# Persist the cleaned table next to OUT_BASENAME.
OUT_BASENAME.parent.mkdir(parents=True, exist_ok=True)
tidy_out = OUT_BASENAME.with_suffix(".clean.csv")
tidy.to_csv(tidy_out, index=False)

print(f"Saved: {tidy_out} | rows: {len(tidy)}")
tidy.head()


In [None]:
# Identifiers that survived cleaning; geometry for anything else is not needed.
needed_ids = set(tidy["station_id"].unique())

gdf = gpd.read_file(GEO_PATH)
# Drop extras early. The explicit .copy() makes the subset an independent
# frame, so the column assignment on the next line does not write into a
# derived selection (which pandas without copy-on-write can flag with
# SettingWithCopyWarning).
gdf = gdf[[GEO_ID_COL, "geometry"]].copy()
gdf[GEO_ID_COL] = _norm_id(gdf[GEO_ID_COL])
# Keep only geometries with a matching station, one row per identifier
# (drop_duplicates keeps the first occurrence).
gdf = gdf[gdf[GEO_ID_COL].isin(needed_ids)].drop_duplicates(subset=[GEO_ID_COL])

print("Geometry rows after filter:", len(gdf))

In [None]:
# Make sure the geometries end up in EPSG:4326:
#   - no CRS recorded  -> label the data as EPSG:4326 (no coordinates change)
#   - some other CRS   -> reproject to EPSG:4326
if gdf.crs is not None:
    _current_epsg = getattr(gdf.crs, "to_epsg", lambda: None)()
    if _current_epsg != 4326:
        gdf = gdf.to_crs(epsg=4326)
else:
    gdf = gdf.set_crs(epsg=4326)

# Align the key name with `tidy`, then attach every tidy row to its geometry.
# Inner join: stations without geometry (and geometries without data) drop out.
gdf = gdf.rename(columns={GEO_ID_COL: "station_id"})
merged = gdf.merge(tidy, how="inner", on="station_id", sort=False)

print(f"Merged rows: {len(merged)}")
merged.iloc[:3].drop(columns=["geometry"])

In [None]:
# Export the merged geometry + attributes as GeoJSON next to OUT_BASENAME.
out_geojson = OUT_BASENAME.with_suffix(".geojson")
merged.to_file(out_geojson, driver="GeoJSON")
print(f"Wrote: {out_geojson}")


In [None]:
# Number of tidy rows available per year, ordered by year.
year_counts = (
    tidy.groupby("year")
    .size()
    .reset_index(name="rows")
    .sort_values("year")
)
year_counts


In [None]:
# Ten tidy rows with the highest AADT (rows, not distinct stations).
tidy.sort_values(by="aadt", ascending=False).iloc[:10]


In [None]:
# Sanity check: list any year values that fall outside 1990-2025 (inclusive).
bad_years = tidy.loc[~tidy["year"].between(1990, 2025), "year"].unique()
print("Out-of-range years:", bad_years)

In [None]:
# Cast year to a plain integer dtype before grouping and plotting.
# Note: this rewrites tidy["year"] in place and raises if any year is missing.
tidy["year"] = tidy["year"].astype(int)

# Sum AADT over all rows of each year.
totals = tidy.groupby("year", as_index=False)["aadt"].sum()

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(totals["year"], totals["aadt"], marker="o")
ax.set_title("Total AADT across stations by year")
ax.set_xlabel("Year")
ax.set_ylabel("Sum of AADT")
ax.set_xticks(totals["year"])  # tick only at the years that are present
fig.tight_layout()

In [None]:
# Same yearly total as the previous plot, restricted to years in 1990-2025
# (inclusive) and with default x ticks.
plausible = tidy.loc[tidy["year"].between(1990, 2025)]
totals = plausible.groupby("year", as_index=False)["aadt"].sum()

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(totals["year"], totals["aadt"], marker="o")
ax.set_title("Total AADT across stations by year")
ax.set_xlabel("Year")
ax.set_ylabel("Sum of AADT")
fig.tight_layout()


In [None]:
# Station with the most rows in `tidy` (ties resolved by value_counts/idxmax),
# and its observations in chronological order.
top_station = tidy["station_id"].value_counts().idxmax()
s = tidy.loc[tidy["station_id"] == top_station].sort_values("year")

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(s["year"], s["aadt"])
ax.set_title(f"AADT trend — station {top_station}")
ax.set_xlabel("Year")
ax.set_ylabel("AADT")
fig.tight_layout()


In [None]:
# Optional tooltip columns as (column name, display alias); only the ones
# actually present in `merged` are shown.
optional = [
    ("Route ID", "Route ID"),
    ("ROUTE", "ROUTE"),
    ("ROAD_NAME", "Road Name"),
    ("ROUTE_NAME", "Route Name"),
    ("ROADWAY", "Roadway"),
]
present_optional = [(col, alias) for col, alias in optional if col in merged.columns]

# Required fields first, then whichever optional ones exist. The two lists are
# built from the same pairs, so they always have equal length.
fields = ["station_id", "year", "aadt"] + [col for col, _ in present_optional]
aliases = ["Station", "Year", "AADT"] + [alias for _, alias in present_optional]

# Every row of `merged` becomes one feature on the map.
m = folium.Map(location=[36.1627, -86.7816], zoom_start=10)
tooltip = folium.features.GeoJsonTooltip(fields=fields, aliases=aliases, sticky=False)
folium.GeoJson(merged.to_json(), name="AADT", tooltip=tooltip).add_to(m)
folium.LayerControl().add_to(m)
m


In [None]:
# Cast the year column to int (the same cast the totals-by-year plotting cell
# applies), then list the distinct years as a sanity check.
# Note: astype(int) raises if any year value is missing.
tidy["year"] = tidy["year"].astype(int)
print("Unique years in tidy:", sorted(tidy["year"].unique()))

In [None]:
topN = 25  # change as needed

# The topN rows of `tidy` with the highest AADT.
# NOTE(review): rows are ranked across whatever years `tidy` contains, so a
# station can appear more than once if several years are present — confirm.
top_segments = tidy.sort_values(by="aadt", ascending=False).iloc[:topN].copy()

# Show the core columns plus any route descriptors that are available.
top_segments[
    ["station_id", "year", "aadt"]
    + [
        name
        for name in ("Route ID", "ROUTE", "ROAD_NAME", "ROUTE_NAME", "ROADWAY")
        if name in top_segments.columns
    ]
]


In [None]:
# Horizontal bar chart of `top_segments`; both series are reversed so the
# highest-AADT row is drawn at the top.
# NOTE(review): the title says 2024, but nothing in this cell restricts the
# data to 2024 — confirm that `top_segments` only holds 2024 rows.
fig, ax = plt.subplots(figsize=(8, 10))
ax.barh(
    top_segments["station_id"].astype(str).iloc[::-1],
    top_segments["aadt"].iloc[::-1],
)
ax.set_title("Top AADT Segments — 2024")
ax.set_xlabel("AADT")
ax.set_ylabel("Station ID")
fig.tight_layout()
plt.show()


In [None]:
# Group by every route-descriptor column that `tidy` has.
route_cols = [
    name
    for name in ("Route ID", "ROUTE", "ROUTE_NAME", "ROAD_NAME", "ROADWAY")
    if name in tidy.columns
]

# Always sum AADT; also sum VMT when that column exists.
agg_cols = ["aadt"] + (["VMT"] if "VMT" in tidy.columns else [])

# dropna=False keeps rows whose route fields are missing as their own group.
# NOTE(review): sums run over all rows of `tidy`, i.e. over every year it
# contains — confirm this is the intended aggregation.
by_route = tidy.groupby(route_cols, dropna=False)[agg_cols].sum(numeric_only=True)
by_route = by_route.reset_index().sort_values("aadt", ascending=False)

by_route.head(15)


In [None]:
# Report how many rows `merged` holds; it must already exist from the earlier
# merge step.
# NOTE(review): the label says "2024 only", but no visible cell filters
# `merged` by year — confirm the input data really contains only 2024.
print("Merged rows (2024 only):", len(merged))


In [None]:
# Optional tooltip columns as (column name, display alias); only those present
# in `merged` are appended after the required station/AADT pair.
optional = [
    ("Route ID", "Route ID"),
    ("ROUTE", "ROUTE"),
    ("ROAD_NAME", "Road Name"),
    ("ROUTE_NAME", "Route Name"),
    ("ROADWAY", "Roadway"),
    ("VMT", "VMT"),
]
present_optional = [(col, alias) for col, alias in optional if col in merged.columns]
fields = ["station_id", "aadt"] + [col for col, _ in present_optional]
aliases = ["Station", "AADT"] + [alias for _, alias in present_optional]

m = folium.Map(location=[36.1627, -86.7816], zoom_start=10)

# Line weight grows with AADT: cut points are the 50th/75th/90th/99th
# percentiles of merged["aadt"]. If computing them fails, every feature gets
# a fixed weight instead.
try:
    q = merged["aadt"].quantile([0, 0.5, 0.75, 0.9, 0.99, 1]).tolist()

    def style_fn(feature):
        value = feature["properties"].get("aadt", 0) or 0  # missing/None -> 0
        for weight, upper in zip((2, 3, 4, 5), q[1:5]):
            if value <= upper:
                return {"weight": weight}
        return {"weight": 6}
except Exception:
    def style_fn(_):
        return {"weight": 3}

# NOTE(review): the layer is named "AADT 2024", but this cell does not filter
# `merged` by year — confirm the data is 2024-only.
tooltip = folium.features.GeoJsonTooltip(fields=fields, aliases=aliases, sticky=False)
folium.GeoJson(
    merged.to_json(),
    name="AADT 2024",
    style_function=style_fn,
    tooltip=tooltip,
).add_to(m)

folium.LayerControl().add_to(m)
m


In [None]:
# Segment-level export: every row of `merged` without its geometry column.
# NOTE(review): file names say 2024, but neither `merged` nor `by_route` is
# filtered by year in the visible cells — confirm the data is 2024-only.
seg2024_out = OUT_BASENAME.with_suffix(".segments_2024.csv")
seg2024 = merged.drop(columns=["geometry"])
seg2024.to_csv(seg2024_out, index=False)
print(f"Wrote: {seg2024_out}")

# Route-level export: the `by_route` aggregate built earlier.
route2024_out = OUT_BASENAME.with_suffix(".routes_2024.csv")
by_route.to_csv(route2024_out, index=False)
print(f"Wrote: {route2024_out}")
