In [2]:
# Replace Cell 2 with this cell.
# Generates ~2000 rows by creating multiple "zones" (districts) per state-year and many features.
import os, math, random
import numpy as np
import pandas as pd

RSEED = 42
np.random.seed(RSEED)
random.seed(RSEED)

OUT_DIR = r"C:\Users\KIIT\Desktop\RoadAccidentAnalysis\RoadAccidentsInIndia\ML_Integration\output\per_year_csvs"
os.makedirs(OUT_DIR, exist_ok=True)

# Real India states + UTs (36)
states = [
    "Andhra Pradesh","Arunachal Pradesh","Assam","Bihar","Chhattisgarh","Goa","Gujarat",
    "Haryana","Himachal Pradesh","Jharkhand","Karnataka","Kerala","Madhya Pradesh",
    "Maharashtra","Manipur","Meghalaya","Mizoram","Nagaland","Odisha","Punjab",
    "Rajasthan","Sikkim","Tamil Nadu","Telangana","Tripura","Uttar Pradesh",
    "Uttarakhand","West Bengal","Andaman and Nicobar Islands","Chandigarh",
    "Dadra and Nagar Haveli and Daman and Diu","Delhi","Jammu and Kashmir","Ladakh",
    "Lakshadweep","Puducherry"
]

years = list(range(2015, 2026))  # 11 years -> 2015..2025

# number of zones (district-like subdivisions) per state to reach ~2000 rows.
zones_per_state = 5  # 36 * 11 * 5 = 1980 rows

rows = []
sl = 1

# helper region mapping
north_states = {"Delhi","Punjab","Haryana","Himachal Pradesh","Uttar Pradesh","Uttarakhand","Jammu and Kashmir","Ladakh","Chandigarh"}
south_states = {"Tamil Nadu","Andhra Pradesh","Telangana","Karnataka","Kerala","Puducherry","Lakshadweep"}
east_states  = {"Odisha","West Bengal","Bihar","Jharkhand"}
north_east_states = {"Assam","Arunachal Pradesh","Nagaland","Manipur","Meghalaya","Mizoram","Tripura","Sikkim"}
west_states = {"Gujarat","Maharashtra","Goa","Dadra and Nagar Haveli and Daman and Diu"}
central_states = {"Chhattisgarh","Madhya Pradesh","Andaman and Nicobar Islands"}

def get_region(name):
    if name in north_states: return "north"
    if name in south_states: return "south"
    if name in east_states: return "east"
    if name in north_east_states: return "north_east"
    if name in west_states: return "west"
    return "central"

# pre-generate stable state-level attributes (population magnitude, vehicles, road length) to be reused across zones
state_population_base = {s: int(5_000_000 + (abs(hash(s)) % 120_000_000)) for s in states}
state_vehicles_base = {s: int(50_000 + (abs(hash(s[::-1])) % 1_000_000)) for s in states}
state_roadlength_base = {s: int(1000 + (abs(hash(s+"road")) % 100000)) for s in states}  # km of roads approx

# Build zones per state: names zone_1..zone_k
for year in years:
    for st in states:
        pop_base = state_population_base[st]
        veh_base = state_vehicles_base[st]
        roadlen_base = state_roadlength_base[st]
        region = get_region(st)

        for z in range(1, zones_per_state+1):
            zone_name = f"{st}_Zone{z}"

            # zone-level population and vehicles (split state's base)
            # ensure variety: zones share state's base but have spread
            pop = int(pop_base / zones_per_state * np.random.uniform(0.6, 1.4))
            vehicles = int(veh_base / zones_per_state * np.random.uniform(0.6, 1.6))
            road_length_km = int(roadlen_base / zones_per_state * np.random.uniform(0.5, 1.5))

            # road condition randomized but biased by region and zone index
            # damaged road probability higher in some zones
            damage_prob = 0.12 + 0.05*(z/ zones_per_state) + (0.05 if region in ("central","north_east") else 0.0)
            dirt_prob = 0.25 + 0.03*(z % 3)
            proper_prob = max(0.05, 1.0 - (damage_prob + dirt_prob))
            road_condition = np.random.choice(["proper road","dirt road","damaged road"], p=[proper_prob, dirt_prob, damage_prob])

            # behavioral and environmental features (vary by zone+year)
            avg_speed_index = float(np.random.uniform(30, 95))  # km/h
            overspeeding = 1 if (avg_speed_index > 60 and (hash(zone_name+str(year)) % 3) != 0) else 0
            bad_weather = 1 if (year % 2 == 0 and (hash(zone_name) % 5) < 2) else 0
            fault_of_driver = 1 if (abs(hash(zone_name+str(year))) % 4) == 0 else 0
            driver_below_18 = 1 if (abs(hash(zone_name+str(year*2))) % 15) == 0 else 0
            fault_in_vehicle = 1 if (abs(hash(zone_name+str(year+7))) % 9) == 0 else 0

            # extra features to add richness
            pct_two_wheelers = float(np.clip(np.random.beta(2,5) * 0.9 + (0.1 if region=="south" else 0.0), 0.05, 0.95))
            enforcement_index = float(np.clip(0.4 + (abs(hash(st+"enf")) % 60)/100.0 + np.random.normal(0,0.05), 0.1, 0.99))
            health_index = float(np.clip(0.3 + (abs(hash(st+"health")) % 60)/100.0 + np.random.normal(0,0.06), 0.05, 0.99))
            urban_flag = 1 if (z % 2 == 0) else 0  # alternate zones urban/rural roughly
            traffic_density = float(np.clip(vehicles / (pop/1000 + 1) * np.random.uniform(0.5,1.5), 5, 500))
            time_to_hospital = float(round(max(2.0, 120*(1.0 - health_index) * (1.0 + 0.2*(1-urban_flag))),2))
            monthality_index = float(((year % 12) + z) % 12) / 12.0  # pseudo-seasonal signal

            # Danger score (engineered)
            danger_score = (
                0.9 * (overspeeding) +
                0.8 * float(bad_weather) +
                0.6 * float(fault_of_driver) +
                0.4 * float(fault_in_vehicle) +
                0.5 * (1.0 - enforcement_index) +
                0.7 * (1.0 - health_index) +
                (0.9 if road_condition=="damaged road" else 0.3 if road_condition=="dirt road" else 0.0)
            )

            # Exposure (vehicles per 100k)
            exposure = vehicles / (pop/100000 + 1)
            exposure_nl = exposure ** 0.95

            # Temporal nonlinearity (mild)
            y_rel = year - 2015
            temporal_nl = 1.0 + 0.02*y_rel + 0.004*(y_rel**2)

            # Interaction/threshold boosts
            combo_boost = 1.0
            if overspeeding and bad_weather:
                combo_boost *= 2.6
            if road_condition == "damaged road" and fault_of_driver:
                combo_boost *= 1.9
            if pct_two_wheelers > 0.6 and avg_speed_index > 60:
                combo_boost *= 1.4

            # Primary target: accidents (deterministic tree-friendly)
            acc_core = exposure_nl * (12 + 25 * danger_score) * temporal_nl * combo_boost
            # piecewise: zones with very high traffic density get multiplier
            if traffic_density > 120:
                acc_core *= 1.45
            # very small noise so trees can learn easily
            accidents = int(max(1, round(acc_core + np.random.normal(0, acc_core*0.01))))

            # Injured: deterministic function of accidents + avg_speed + health_index
            inj_core = accidents * (1.3 + 0.015 * avg_speed_index + 0.5 * danger_score) * (1.0 - 0.4 * health_index)
            injured = int(max(0, round(inj_core + np.random.normal(0, inj_core*0.01))))

            # Killed: fraction of accidents amplified by overspeeding, damaged road and lack of health access
            kill_frac = 0.01 + 0.03 * overspeeding + 0.03 * float(road_condition=="damaged road") + 0.02 * float(bad_weather)
            kill_core = accidents * kill_frac * (1.0 + 0.8 * (1.0 - health_index))
            if accidents > 250:
                kill_core *= 1.35
            killed = int(max(0, round(kill_core + np.random.normal(0, max(0.1, kill_core*0.02)))))

            rows.append({
                "Sl No.": sl,
                "year": year,
                "State": st,
                "zone": zone_name,
                "region": region,
                "state_population_zone": pop,
                "registered_vehicles_zone": vehicles,
                "road_length_km": road_length_km,
                "road_condition": road_condition,
                "avg_speed_index": round(avg_speed_index,2),
                "overspeeding": int(overspeeding),
                "bad_weather": int(bad_weather),
                "fault_of_driver": int(fault_of_driver),
                "driver_below_18": int(driver_below_18),
                "fault_in_vehicle": int(fault_in_vehicle),
                "pct_two_wheelers": round(pct_two_wheelers,3),
                "enforcement_index": round(enforcement_index,3),
                "health_index": round(health_index,3),
                "urban_flag": int(urban_flag),
                "traffic_density": round(traffic_density,3),
                "time_to_hospital": time_to_hospital,
                "monthality_index": round(monthality_index,3),
                "danger_score": round(danger_score,4),
                "no_of_accidents": int(accidents),
                "no_of_injured": int(injured),
                "no_of_killed": int(killed)
            })
            sl += 1

# build DataFrame
df = pd.DataFrame(rows)
print("Generated dataset rows:", df.shape[0])

# Save per-year CSVs and combined CSV
for y in years:
    df_y = df[df["year"] == y].reset_index(drop=True)
    p = os.path.join(OUT_DIR, f"accidents_{y}.csv")
    df_y.to_csv(p, index=False)
    print("Saved:", p, " rows:", len(df_y))

combined_path = os.path.join(OUT_DIR, "accidents_2015_2025_all_states_zones_combined.csv")
df.to_csv(combined_path, index=False)
print("Saved combined CSV:", combined_path)

# show sample
df.head(8)


Generated dataset rows: 1980
Saved: C:\Users\KIIT\Desktop\RoadAccidentAnalysis\RoadAccidentsInIndia\ML_Integration\output\per_year_csvs\accidents_2015.csv  rows: 180
Saved: C:\Users\KIIT\Desktop\RoadAccidentAnalysis\RoadAccidentsInIndia\ML_Integration\output\per_year_csvs\accidents_2016.csv  rows: 180
Saved: C:\Users\KIIT\Desktop\RoadAccidentAnalysis\RoadAccidentsInIndia\ML_Integration\output\per_year_csvs\accidents_2017.csv  rows: 180
Saved: C:\Users\KIIT\Desktop\RoadAccidentAnalysis\RoadAccidentsInIndia\ML_Integration\output\per_year_csvs\accidents_2018.csv  rows: 180
Saved: C:\Users\KIIT\Desktop\RoadAccidentAnalysis\RoadAccidentsInIndia\ML_Integration\output\per_year_csvs\accidents_2019.csv  rows: 180
Saved: C:\Users\KIIT\Desktop\RoadAccidentAnalysis\RoadAccidentsInIndia\ML_Integration\output\per_year_csvs\accidents_2020.csv  rows: 180
Saved: C:\Users\KIIT\Desktop\RoadAccidentAnalysis\RoadAccidentsInIndia\ML_Integration\output\per_year_csvs\accidents_2021.csv  rows: 180
Saved: C:\Us

Unnamed: 0,Sl No.,year,State,zone,region,state_population_zone,registered_vehicles_zone,road_length_km,road_condition,avg_speed_index,...,enforcement_index,health_index,urban_flag,traffic_density,time_to_hospital,monthality_index,danger_score,no_of_accidents,no_of_injured,no_of_killed
0,1,2015,Andhra Pradesh,Andhra Pradesh_Zone1,south,2300679,146115,8962,dirt road,40.14,...,0.921,0.488,0,43.384,73.66,0.0,1.0976,155641,301053,2859
1,2,2015,Andhra Pradesh,Andhra Pradesh_Zone2,south,2786194,69678,5762,proper road,59.64,...,0.879,0.527,1,16.763,56.72,0.083,0.3915,35448,66460,656
2,3,2015,Andhra Pradesh,Andhra Pradesh_Zone3,south,2934275,98007,4525,proper road,32.24,...,0.99,0.592,0,22.867,58.81,0.167,0.2909,41866,61752,748
3,4,2015,Andhra Pradesh,Andhra Pradesh_Zone4,south,2329602,82102,9666,proper road,48.26,...,0.931,0.555,1,43.295,53.41,0.25,0.7462,69616,128740,1267
4,5,2015,Andhra Pradesh,Andhra Pradesh_Zone5,south,3300224,115264,6044,proper road,50.21,...,0.898,0.515,0,42.361,69.77,0.333,1.3901,105939,234675,1983
5,6,2015,Arunachal Pradesh,Arunachal Pradesh_Zone1,north_east,20432557,23590,2546,proper road,32.04,...,0.421,0.818,0,5.0,26.22,0.0,0.4171,2002,2658,31
6,7,2015,Arunachal Pradesh,Arunachal Pradesh_Zone2,north_east,26966004,32328,5494,damaged road,82.24,...,0.507,0.828,1,5.0,20.65,0.083,2.1669,6241,15137,671
7,8,2015,Arunachal Pradesh,Arunachal Pradesh_Zone3,north_east,18741712,18874,3004,proper road,91.29,...,0.529,0.849,0,5.0,21.76,0.167,0.3414,1606,3014,25
