In [9]:
import os
from pathlib import Path
import pandas as pd

# ---------- utils ----------
def repo_root(start: Path | None = None) -> Path:
    """Walk upward until we find a folder that looks like the repo (has .git or README.md)."""
    cur = start or Path.cwd()
    for p in [cur, *cur.parents]:
        if (p / ".git").exists() or (p / "README.md").exists():
            return p
    return cur  # fallback: current dir

def read_csv_any(path: Path, **kwargs) -> pd.DataFrame:
    """Read a CSV into a DataFrame, trying the pyarrow engine first.

    Args:
        path: Location of the CSV file.
        **kwargs: Passed through unchanged to ``pd.read_csv``.

    Returns:
        The parsed DataFrame.

    If the pyarrow attempt raises for any reason, the read is retried with
    pandas' default engine; an error from that second attempt propagates.
    """
    try:
        frame = pd.read_csv(path, engine="pyarrow", **kwargs)
    except Exception:
        # Best-effort fallback: retry with the default parser.
        frame = pd.read_csv(path, **kwargs)
    return frame

def find_dataset_root(raw_base: Path) -> Path | None:
    """
    Try common names like 'dft_road_safety_last_5_years', 'road_safety_2023', etc.
    If ambiguous, pick the first dir with Collisions/Accidents/Vehicles/Casualties inside.
    """
    candidates = []
    if raw_base.exists():
        for d in raw_base.iterdir():
            if d.is_dir():
                # quick smell test: does it contain at least one of the expected files?
                has_expected = any(
                    d.joinpath(name).exists()
                    for name in ["Collisions.csv", "Accidents.csv", "Vehicles.csv", "Casualties.csv"]
                )
                looks_like = ("road" in d.name.lower() and "safety" in d.name.lower()) or has_expected
                if looks_like:
                    candidates.append(d)
    # Most users have exactly one relevant folder under data/raw — choose that.
    return candidates[0] if candidates else None

def find_file(root: Path, names: list[str]) -> Path | None:
    """Find the first file matching exact name (case-insensitive), then fallback to substring search."""
    all_csvs = list(root.rglob("*.csv")) + list(root.rglob("*.CSV"))
    # exact match first
    for nm in names:
        for p in all_csvs:
            if p.name.lower() == nm.lower():
                return p
    # substring fallback (handles e.g. Accidents_2021.csv)
    for nm in names:
        for p in all_csvs:
            if nm.lower().replace(".csv","") in p.name.lower():
                return p
    return None

# ---------- locate repo & data ----------
ROOT = repo_root()
RAW_BASE = ROOT / "data" / "raw"

# Optional override via env var if someone wants a custom location
override = os.getenv("UK_RS_DATA_DIR")
DATA_ROOT = Path(override) if override else find_dataset_root(RAW_BASE)

# A Path object is always truthy, so test for None explicitly; is_dir() also
# rejects an override that points at a file rather than a folder.
if DATA_ROOT is None or not DATA_ROOT.is_dir():
    if override:
        # The override was used, so RAW_BASE was never searched — say so
        # instead of blaming data/raw.
        raise FileNotFoundError(
            f"UK_RS_DATA_DIR is set to {override!r}, but that is not an existing folder. "
            "Point it at the folder holding the unzipped DfT files, "
            "or unset it to search data/raw/ instead."
        )
    raise FileNotFoundError(
        f"Couldn’t find road-safety data under {RAW_BASE}. "
        "Make sure you unzipped the DfT files into a folder inside data/raw/ "
        "(e.g., data/raw/dft_road_safety_last_5_years/). "
        "You can also set UK_RS_DATA_DIR to point at your data folder."
    )

print("Repo root:", ROOT)
print("Data root:", DATA_ROOT)

# ---------- detect the three tables ----------
# One lookup per table; the accident-level table may be published under either name.
acc_path, veh_path, cas_path = (
    find_file(DATA_ROOT, wanted)
    for wanted in (
        ["Collisions.csv", "Accidents.csv"],
        ["Vehicles.csv"],
        ["Casualties.csv"],
    )
)

print("\nDetected:")
for caption, located in (
    ("  Accident-level:", acc_path),
    ("  Vehicles:      ", veh_path),
    ("  Casualties:    ", cas_path),
):
    print(caption, located)

# ---------- show head(10) in order ----------
def show_head(path: Path, label: str) -> None:
    """Load a CSV and print a header, its location, its shape and its first 10 rows.

    Args:
        path: CSV file to load (read in full via ``read_csv_any``).
        label: Heading text printed before the file name.

    The location is printed relative to ``ROOT`` when the file lives inside
    it, and as given otherwise.
    """
    df = read_csv_any(path)
    print(f"\n===== {label}: {path.name} =====")
    try:
        shown_path = path.relative_to(ROOT)
    except ValueError:
        # relative_to() raises when path is not under ROOT, which happens
        # whenever UK_RS_DATA_DIR points outside the repository.
        shown_path = path
    print("Path:", shown_path)
    print("Shape:", df.shape)
    try:
        display(df.head(10))  # works in notebooks
    except NameError:
        print(df.head(10).to_string(index=False))  # fallback for terminals

# Preview every table that was found, in a fixed order, and collect the
# names of those that were not so they can be reported together.
missing = []
for table_path, heading, expected_name in (
    (acc_path, "ACCIDENT-LEVEL", "Accidents/Collisions"),
    (veh_path, "VEHICLE-LEVEL", "Vehicles"),
    (cas_path, "CASUALTY-LEVEL", "Casualties"),
):
    if not table_path:
        missing.append(expected_name)
        continue
    show_head(table_path, heading)

if missing:
    print("\n⚠️ Missing expected files:", ", ".join(missing))
    print("Check that the official DfT ZIPs were extracted and file names weren’t changed.")


Repo root: c:\Users\James\Documents\GitHub\uk-feedback-ai
Data root: c:\Users\James\Documents\GitHub\uk-feedback-ai\data\raw\dft_road_safety_last_5_years

Detected:
  Accident-level: c:\Users\James\Documents\GitHub\uk-feedback-ai\data\raw\dft_road_safety_last_5_years\Collisions.csv
  Vehicles:       c:\Users\James\Documents\GitHub\uk-feedback-ai\data\raw\dft_road_safety_last_5_years\Vehicles.csv
  Casualties:     c:\Users\James\Documents\GitHub\uk-feedback-ai\data\raw\dft_road_safety_last_5_years\Casualties.csv

===== ACCIDENT-LEVEL: Collisions.csv =====
Path: data\raw\dft_road_safety_last_5_years\Collisions.csv
Shape: (503475, 44)


Unnamed: 0,collision_index,collision_year,collision_ref_no,location_easting_osgr,location_northing_osgr,longitude,latitude,police_force,collision_severity,number_of_vehicles,...,carriageway_hazards_historic,carriageway_hazards,urban_or_rural_area,did_police_officer_attend_scene_of_accident,trunk_road_flag,lsoa_of_accident_location,enhanced_severity_collision,collision_injury_based,collision_adjusted_severity_serious,collision_adjusted_severity_slight
0,2020170H10890,2020,170H10890,446191.0,534540.0,-1.284731,54.703781,17,3,2,...,0,0,2,1,2,E01011959,-1,0,0.230918,0.769082
1,2021170H10801,2021,170H10801,449617.0,528977.0,-1.232514,54.653469,17,3,2,...,0,0,1,1,2,E01011961,-1,0,0.154276,0.845724
2,2021170M30061,2021,170M30061,454030.0,513371.0,-1.166988,54.512785,17,3,2,...,0,0,2,1,2,E01032593,-1,0,0.048385,0.951615
3,2022170M10952,2022,170M10952,449180.0,520909.0,-1.240623,54.581003,17,2,1,...,0,0,1,2,2,E01033468,-1,0,1.0,0.0
4,202417M114824,2024,17M114824,451027.0,515965.0,-1.21291,54.53639,17,3,2,...,-1,0,1,3,2,E01012049,3,1,0.0,1.0
5,2020170L20690,2020,170L20690,455573.0,519900.0,-1.141929,54.571288,17,3,1,...,0,0,1,1,2,E01033471,-1,0,0.117011,0.882989
6,2023170L30453,2023,170L30453,456584.0,522423.0,-1.125794,54.593835,17,3,3,...,0,13,2,2,2,E01032560,-1,0,0.014215,0.985785
7,2021170S10861,2021,170S10861,444348.0,519418.0,-1.315613,54.568062,17,3,2,...,0,0,1,1,2,E01012266,-1,0,0.091934,0.908066
8,2022170S11032,2022,170S11032,438095.0,516639.0,-1.412665,54.543586,17,2,3,...,0,0,2,1,1,E01012269,-1,0,1.0,0.0
9,2021111128189,2021,111128189,428285.0,514534.0,-1.564506,54.52532,11,3,2,...,0,0,1,3,2,E01012316,3,1,0.0,1.0



===== VEHICLE-LEVEL: Vehicles.csv =====
Path: data\raw\dft_road_safety_last_5_years\Vehicles.csv
Shape: (920692, 32)


Unnamed: 0,collision_index,collision_year,collision_ref_no,vehicle_reference,vehicle_type,towing_and_articulation,vehicle_manoeuvre_historic,vehicle_manoeuvre,vehicle_direction_from,vehicle_direction_to,...,age_of_driver,age_band_of_driver,engine_capacity_cc,propulsion_code,age_of_vehicle,generic_make_model,driver_imd_decile,lsoa_of_driver,escooter_flag,driver_distance_banding
0,2020552100160,2020,552100160,1,2,0,18,19,2,6,...,20,4,125,1,16,-1,2,E01020575,0,-1
1,2020460936785,2020,460936785,3,9,0,3,3,8,4,...,41,7,3956,2,3,-1,10,E01024778,0,-1
2,2020010243652,2020,010243652,1,9,0,18,19,5,1,...,31,6,3956,2,3,-1,2,E01001477,0,-1
3,2020410958331,2020,410958331,1,9,0,18,19,5,1,...,23,5,3956,2,2,-1,7,E01023391,0,-1
4,2020010260928,2020,010260928,1,9,0,99,99,9,9,...,38,7,3956,2,3,-1,5,E01016494,0,-1
5,2020140992131,2020,140992131,2,3,0,18,19,3,7,...,24,5,125,1,4,-1,3,E01007357,0,-1
6,2020210952419,2020,210952419,1,5,0,18,19,3,7,...,61,9,1172,1,17,-1,3,E01029602,0,-1
7,2020470949502,2020,470949502,1,9,0,9,9,3,1,...,41,7,2993,2,16,BMW X5,6,E01031408,0,-1
8,2020010262206,2020,010262206,1,9,0,5,5,1,5,...,34,6,2993,2,12,BMW X5,7,E01001125,0,-1
9,202006A271137,2020,06A271137,2,9,0,18,19,8,4,...,38,7,2993,2,3,BMW X5,1,E01005108,0,-1



===== CASUALTY-LEVEL: Casualties.csv =====
Path: data\raw\dft_road_safety_last_5_years\Casualties.csv
Shape: (640522, 23)


Unnamed: 0,collision_index,collision_year,collision_ref_no,vehicle_reference,casualty_reference,casualty_class,sex_of_casualty,age_of_casualty,age_band_of_casualty,casualty_severity,...,bus_or_coach_passenger,pedestrian_road_maintenance_worker,casualty_type,casualty_imd_decile,lsoa_of_casualty,enhanced_casualty_severity,casualty_injury_based,casualty_adjusted_severity_serious,casualty_adjusted_severity_slight,casualty_distance_banding
0,2020010280094,2020,010280094,1,1,3,2,24,5,3,...,0,0,0,3,E01003536,-1,0,0.0,1.0,-1
1,202031D109620,2020,31D109620,1,1,3,2,95,11,2,...,0,0,0,6,E01028156,-1,0,1.0,0.0,-1
2,2020401003715,2020,401003715,1,1,3,1,39,7,3,...,0,0,0,2,E01017492,3,1,0.0,1.0,-1
3,2021201086330,2021,201086330,1,1,3,1,63,9,3,...,0,2,0,3,E01009140,3,1,0.0,1.0,-1
4,2021371050963,2021,371050963,1,1,3,1,51,8,3,...,0,0,0,9,E01030190,3,1,0.0,1.0,-1
5,2022161255871,2022,161255871,1,1,3,2,60,9,3,...,0,0,0,2,E01033104,3,1,0.0,1.0,-1
6,2023201345434,2023,201345434,1,1,3,1,72,10,3,...,0,0,0,1,E01009409,3,1,0.0,1.0,-1
7,2024041434010,2024,041434010,1,1,3,1,39,7,2,...,0,-1,0,1,E01025154,5,1,1.0,0.0,3
8,2020140992628,2020,140992628,2,1,1,1,17,4,3,...,0,0,1,3,E01007591,3,1,0.0,1.0,-1
9,202031D139820,2020,31D139820,2,1,1,1,11,3,3,...,0,0,1,10,E01028218,-1,0,0.06941,0.93059,-1
