In [1]:
import re
from pathlib import Path
import pandas as pd

try:
    import geopandas as gpd
    HAS_GPD = True
except Exception:
    HAS_GPD = False

In [2]:
CSV_PATH = r"Traffic_Lines_-5158403106151148634.csv"                 # e.g., "data/TDOT_AADT_Davidson.csv"
GEO_PATH = r"Traffic_Lines_-8205076846245197459.geojson"    # e.g., "data/TDOT_Traffic_Lines.geojson"
CSV_ID_COL = "STATION_ID"                       # ID column in CSV (change if needed)
GEO_ID_COL = "STATION_ID"                       # ID column in GeoJSON (change if needed)
OUT_BASENAME = r"aadt_davidson" # without extension; files will be added

In [3]:
CANDIDATE_ID_COLS = [
    "STATION","STATION_ID","STATIONID","COUNT_STATION","COUNT_STATION_ID",
    "LOC_ID","LOCATION_ID","SITE_NO","SITENO","POINT_ID","POINTID",
    "SEGMENT_ID","SEGMENTID","GISID","GIS_ID","ID"
]

In [4]:
def _norm_id(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip()
    s = s.str.replace(r"\s+", "", regex=True)
    s = s.str.upper()
    s = s.str.replace(r"^STATION[_\-]?", "", regex=True)  # optional
    return s

In [5]:
YEAR_COL_RE = re.compile(r"^(?:AADT[_\-\s]*)?(19|20)\d{2}$")

In [6]:
def _detect_year_cols(df: pd.DataFrame):
    yc = [c for c in df.columns if YEAR_COL_RE.match(str(c).strip())]
    if not yc:
        yc = [c for c in df.columns if re.fullmatch(r"(19|20)\d{2}", str(c).strip())]
    return yc

In [7]:

def _year_from(colname: str) -> int:
    m = re.search(r"(19|20)\d{2}", str(colname))
    if not m:
        raise ValueError(f"Could not parse year from column: {colname}")
    return int(m.group(0))

In [8]:
def _tidy_from_wide(df: pd.DataFrame, id_col: str, keep_cols=None) -> pd.DataFrame:
    keep_cols = keep_cols or []
    year_cols = _detect_year_cols(df)
    if not year_cols:
        # assume already tall with 'year' and some aadt column
        year_col = "year" if "year" in df.columns else "Year" if "Year" in df.columns else None
        aadt_col = None
        for cand in ["AADT","aadt","AADT_TOTAL","TOTAL_AADT","value"]:
            if cand in df.columns:
                aadt_col = cand
                break
        if not (year_col and aadt_col):
            raise ValueError("Could not detect year/aadt columns in tall format.")
        out = df[[id_col, year_col, aadt_col] + [c for c in keep_cols if c in df.columns]].copy()
        out = out.rename(columns={year_col: "year", aadt_col: "aadt"})
        out["year"] = pd.to_numeric(out["year"], errors="coerce").astype("Int64")
        out["aadt"] = pd.to_numeric(out["aadt"], errors="coerce")
        return out

    long_df = df.melt(
        id_vars=[c for c in df.columns if c not in year_cols],
        value_vars=year_cols,
        var_name="year_raw",
        value_name="aadt"
    )
    long_df["year"] = long_df["year_raw"].apply(_year_from)
    long_df = long_df.drop(columns=["year_raw"])
    keep = [id_col, "year", "aadt"] + [c for c in keep_cols if c in long_df.columns]
    long_df = long_df[keep].copy()
    long_df["aadt"] = pd.to_numeric(long_df["aadt"], errors="coerce")
    return long_df

In [9]:
def _auto_id(cols):
    low = {c.lower(): c for c in cols}
    for cand in CANDIDATE_ID_COLS:
        if cand.lower() in low:
            return low[cand.lower()]
    return None

In [10]:
csv_path = Path(CSV_PATH)
geo_path = Path(GEO_PATH)
out_base = Path(OUT_BASENAME)

df = pd.read_csv(csv_path)

In [11]:
csv_id = CSV_ID_COL or _auto_id(df.columns)
if not csv_id:
    raise ValueError("Could not auto-detect an ID column in the CSV. Set CSV_ID_COL.")

In [12]:
df[csv_id] = _norm_id(df[csv_id])

keep_cols = [c for c in ["ROUTE","ROAD_NAME","ROUTE_NAME","ROADWAY","DIR","DIRECTION"] if c in df.columns]
tidy = _tidy_from_wide(df, id_col=csv_id, keep_cols=keep_cols).rename(columns={csv_id: "station_id"})
tidy = tidy[tidy["aadt"].notna()].copy()

KeyError: 'STATION_ID'

In [None]:
out_csv = out_base.with_suffix(".clean.csv")
out_csv.parent.mkdir(parents=True, exist_ok=True)
tidy.to_csv(out_csv, index=False)

In [None]:
merged = None
if HAS_GPD:
    gdf = gpd.read_file(geo_path)
    geo_id = GEO_ID_COL or _auto_id(gdf.columns)
    if not geo_id:
        raise ValueError("Could not auto-detect an ID column in the GeoJSON. Set GEO_ID_COL.")
    gdf[geo_id] = _norm_id(gdf[geo_id])
    gdf = gdf.rename(columns={geo_id: "station_id"})
    if gdf.crs is None:
        gdf = gdf.set_crs(epsg=4326)
    else:
        gdf = gdf.to_crs(epsg=4326)
    gdf = gdf.drop_duplicates(subset=["station_id"])
    merged = gdf.merge(tidy, on="station_id", how="inner")


In [None]:
out_gpkg = out_base.with_suffix(".gpkg")
    out_geojson = out_base.with_suffix(".geojson")
    try:
        merged.to_file(out_gpkg, layer="aadt", driver="GPKG")
    except Exception as e:
        print("Warning: could not write GeoPackage:", e)
    try:
        merged.to_file(out_geojson, driver="GeoJSON")
    except Exception as e:
        print("Warning: could not write GeoJSON:", e)

In [None]:
  summary = (
        merged[["station_id", "year", "aadt"]]
        .groupby(["station_id", "year"], as_index=False)
        .agg(aadt=("aadt", "mean"))
        .sort_values(["station_id", "year"])
    )
    out_summary_csv = out_base.with_suffix(".station_year.csv")
    summary.to_csv(out_summary_csv, index=False)

In [None]:
# --- quick sanity output ---
print("Saved:", out_csv)
if merged is not None:
    print("Merged rows:", len(merged))
    display(merged.head(3).drop(columns="geometry"))
else:
    print("Note: geopandas not installed or Geo step skipped. Install with `pip install geopandas` if you want the map-merge outputs.")