In [27]:
import pandas as pd
import numpy as np

In [28]:
df_cleaned = pd.read_csv("/content/df_cleaned.csv")

  df_cleaned = pd.read_csv("/content/df_cleaned.csv")


In [29]:
print("DECODING CATEGORICAL VARIABLES...")

def decode_all_categoricals(df):
    """
    Replace all numeric codes with descriptive labels
    """
    df_decoded = df.copy()

    # Define comprehensive mapping dictionaries
    mappings = {
        # Collision severity
        'collision_severity': {
            1: 'Fatal', 2: 'Serious', 3: 'Slight', -1: 'Data Missing'
        },

        # Day of week
        'day_of_week': {
            1: 'Sunday', 2: 'Monday', 3: 'Tuesday', 4: 'Wednesday',
            5: 'Thursday', 6: 'Friday', 7: 'Saturday', -1: 'Data Missing'
        },

        # Road class
        'first_road_class': {
            1: 'Motorway', 2: 'A_Road_Motorway', 3: 'A_Road', 4: 'B_Road',
            5: 'C_Road', 6: 'Unclassified', -1: 'Data Missing'
        },

        'second_road_class': {
            1: 'Motorway', 2: 'A_Road_Motorway', 3: 'A_Road', 4: 'B_Road',
            5: 'C_Road', 6: 'Unclassified', -1: 'Data Missing', 0: 'Not_at_Junction'
        },

        # Light conditions
        'light_conditions': {
            1: 'Daylight', 4: 'Darkness_Lights_Lit', 5: 'Darkness_Lights_Unlit',
            6: 'Darkness_No_Lighting', 7: 'Darkness_Lighting_Unknown', -1: 'Data Missing'
        },

        # Weather conditions
        'weather_conditions': {
            1: 'Fine_No_High_Winds', 2: 'Raining_No_High_Winds', 3: 'Snowing_No_High_Winds',
            4: 'Fine_High_Winds', 5: 'Raining_High_Winds', 6: 'Snowing_High_Winds',
            7: 'Fog_or_Mist', 8: 'Other_Weather', 9: 'Unknown_Weather', -1: 'Data Missing'
        },

        # Urban/rural
        'urban_or_rural_area': {
            1: 'Urban', 2: 'Rural', 3: 'Unallocated', -1: 'Data Missing'
        },

        # Junction control
        'junction_control': {
            0: 'Not_at_Junction', 1: 'Authorised_Person', 2: 'Traffic_Signals',
            3: 'Stop_Sign', 4: 'Give_Way_Uncontrolled', -1: 'Data Missing', 9: 'Unknown'
        },

        # Police attendance
        'did_police_officer_attend_scene_of_accident': {
            1: 'Police_Attended', 2: 'Police_Not_Attended',
            3: 'Self_Reported', -1: 'Data Missing'
        },

        # Trunk road
        'trunk_road_flag': {
            1: 'Trunk_Road', 2: 'Non_Trunk_Road', -1: 'Data Missing'
        },

        'road_type': {
            1: 'Roundabout',
            2: 'One_Way_Street',
            3: 'Dual_Carriageway',
            6: 'Single_Carriageway',
            7: 'Slip_Road',
            9: 'Unknown_Road_Type',
            12: 'One_Way_Slip_Road',
            -1: 'Data_Missing'
        },
    }

    # Apply all mappings
    decoded_count = 0
    for column_name, mapping_dict in mappings.items():
        if column_name in df_decoded.columns:
            new_column_name = f"{column_name}_decoded"
            df_decoded[new_column_name] = df_decoded[column_name].map(mapping_dict)
            decoded_count += 1
            print(f"Decoded {column_name} -> {new_column_name}")

    # Speed limit categories
    if 'speed_limit' in df_decoded.columns:
        df_decoded['speed_limit_category'] = pd.cut(
            df_decoded['speed_limit'],
            bins=[0, 20, 30, 40, 50, 60, 70, 1000],
            labels=['20_mph_or_less', '21-30_mph', '31-40_mph', '41-50_mph',
                   '51-60_mph', '61-70_mph', '70_plus_mph']
        )
        print("Created speed_limit_category")

    print(f"Decoded {decoded_count} categorical variables")
    return df_decoded

DECODING CATEGORICAL VARIABLES...


In [30]:
# Apply decoding
df_decoded = decode_all_categoricals(df_cleaned)

Decoded collision_severity -> collision_severity_decoded
Decoded day_of_week -> day_of_week_decoded
Decoded first_road_class -> first_road_class_decoded
Decoded second_road_class -> second_road_class_decoded
Decoded light_conditions -> light_conditions_decoded
Decoded weather_conditions -> weather_conditions_decoded
Decoded urban_or_rural_area -> urban_or_rural_area_decoded
Decoded junction_control -> junction_control_decoded
Decoded did_police_officer_attend_scene_of_accident -> did_police_officer_attend_scene_of_accident_decoded
Decoded trunk_road_flag -> trunk_road_flag_decoded
Decoded road_type -> road_type_decoded
Created speed_limit_category
Decoded 11 categorical variables


In [31]:
print("CREATING FINAL CLEAN DATASET...")

def create_final_dataset(df, output_filename="road_safety_analysis_ready.csv"):
    """
    Create the final clean dataset for download
    """
    df_final = df.copy()

    # 1. Remove original coded columns where we have decoded versions
    print("Removing original coded columns...")
    columns_to_remove = []
    for col in df_final.columns:
        if not col.endswith('_decoded') and not col.endswith('_category'):
            # Check if this column has a decoded version
            decoded_col = f"{col}_decoded"
            if decoded_col in df_final.columns:
                columns_to_remove.append(col)

    # Also remove the intermediate calculated columns
    intermediate_cols = ['number_of_vehicles', 'calculated_vehicle_count',
                        'number_of_casualties', 'calculated_total_casualties',
                        'severity_weight']

    for col in intermediate_cols:
        if col in df_final.columns and col not in columns_to_remove:
            columns_to_remove.append(col)

    df_final = df_final.drop(columns=columns_to_remove)
    print(f"   Removed {len(columns_to_remove)} original coded columns")

    # 2. Select and reorder columns for better organization
    print("Organizing final columns...")

    # Define the ideal column order
    column_order = [
        # Core identifiers
        'collision_index', 'collision_year', 'collision_ref_no',

        # Date and time
        'accident_date', 'accident_hour', 'time_period', 'accident_month',
        'day_of_week_decoded', 'is_weekend',

        # Location
        'longitude', 'latitude', 'local_authority_ons_district',
        'urban_or_rural_area_decoded', 'lsoa_of_accident_location',

        # Severity and risk
        'collision_severity_decoded', 'risk_score', 'risk_category',

        # Vehicle information
        'final_vehicle_count', 'unique_vehicle_types', 'avg_driver_age',
        'male_driver_ratio', 'unique_manoeuvres', 'unique_journey_purposes',

        # Casualty information
        'final_total_casualties', 'fatal_casualties_count', 'pedestrian_casualties_count',
        'avg_casualty_age', 'unique_casualty_types',

        # Road infrastructure
        'first_road_class_decoded', 'speed_limit', 'speed_limit_category',
        'road_type', 'trunk_road_flag_decoded', 'is_major_road', 'is_high_speed',

        # Junction information
        'junction_detail', 'junction_control_decoded', 'has_complex_junction',

        # Environmental conditions
        'light_conditions_decoded', 'weather_conditions_decoded',
        'poor_visibility', 'adverse_weather', 'poor_road_condition',

        # Additional features
        'police_force', 'did_police_officer_attend_scene_of_accident_decoded'
    ]

    # Filter to only existing columns
    existing_columns = [col for col in column_order if col in df_final.columns]

    # Add any remaining columns
    remaining_columns = [col for col in df_final.columns if col not in existing_columns]

    final_columns = existing_columns + remaining_columns
    df_final = df_final[final_columns]

    # 4. Generate summary report
    print("\n FINAL DATASET SUMMARY:")
    print(f"   • Total records: {len(df_final):,}")
    print(f"   • Total columns: {len(df_final.columns)}")
    print(f"   • File saved as: {output_filename}")

    # Count decoded columns
    decoded_cols = [col for col in df_final.columns if col.endswith('_decoded')]
    print(f"   • Decoded columns: {len(decoded_cols)}")

    if 'risk_category' in df_final.columns:
        high_risk = (df_final['risk_category'] == 'High Risk').sum()
        print(f"   • High-risk collisions: {high_risk:,} ({high_risk/len(df_final)*100:.1f}%)")

    return df_final


CREATING FINAL CLEAN DATASET...


In [32]:
# Create final dataset
final_data = create_final_dataset(df_decoded)

Removing original coded columns...
   Removed 12 original coded columns
Organizing final columns...

 FINAL DATASET SUMMARY:
   • Total records: 52,657
   • Total columns: 57
   • File saved as: road_safety_analysis_ready.csv
   • Decoded columns: 11
   • High-risk collisions: 13,597 (25.8%)


In [33]:
final_data.columns

Index(['collision_index', 'collision_year', 'collision_ref_no',
       'accident_date', 'accident_hour', 'time_period', 'accident_month',
       'day_of_week_decoded', 'is_weekend', 'longitude', 'latitude',
       'local_authority_ons_district', 'urban_or_rural_area_decoded',
       'lsoa_of_accident_location', 'collision_severity_decoded', 'risk_score',
       'risk_category', 'final_vehicle_count', 'unique_vehicle_types',
       'avg_driver_age', 'male_driver_ratio', 'unique_manoeuvres',
       'unique_journey_purposes', 'final_total_casualties',
       'fatal_casualties_count', 'pedestrian_casualties_count',
       'avg_casualty_age', 'unique_casualty_types', 'first_road_class_decoded',
       'speed_limit', 'speed_limit_category', 'trunk_road_flag_decoded',
       'is_major_road', 'is_high_speed', 'junction_detail',
       'junction_control_decoded', 'has_complex_junction',
       'light_conditions_decoded', 'weather_conditions_decoded',
       'poor_visibility', 'adverse_weather',

In [34]:
print("CREATING OPTIMIZED DATASET FOR MODEL TRAINING...")

def create_optimized_model_dataset(df, output_filename="road_safety_model_ready.csv"):
    """
    Create a clean dataset optimized for machine learning model training
    """
    df_optimized = df.copy()

    # 1. REMOVE DUPLICATE AND UNNECESSARY COLUMNS
    print("Removing duplicate and unnecessary columns...")

    columns_to_remove = [
        # Duplicate temporal columns (we have accident_date, accident_hour, etc.)
        'date', 'time', 'accident_year',  # duplicate of accident_date components

        # Duplicate location columns
        'location_easting_osgr', 'location_northing_osgr',  # duplicate of longitude/latitude
        'local_authority_district',  # duplicate of local_authority_ons_district

        # Unnecessary for model training
        'collision_index', 'collision_ref_no',  # unique identifiers
        'collision_year',  # redundant with accident_date
        'first_road_number', 'second_road_number',  # road numbers not useful for ML
        'local_authority_highway',  # duplicate authority info
        'police_force',  # not useful for prediction
    ]

    # Remove only columns that exist
    existing_columns_to_remove = [col for col in columns_to_remove if col in df_optimized.columns]
    df_optimized = df_optimized.drop(columns=existing_columns_to_remove)
    print(f"   Removed {len(existing_columns_to_remove)} duplicate/unnecessary columns")

    # 2. SELECT ONLY RELEVANT FEATURES FOR MODEL TRAINING
    print("Selecting relevant features for ML...")

    # Categorize features for ML
    target_variables = [
        'risk_score', 'risk_category', 'collision_severity_decoded'
    ]

    temporal_features = [
        'accident_date', 'accident_hour', 'time_period', 'accident_month',
        'day_of_week_decoded', 'is_weekend'
    ]

    geographic_features = [
        'longitude', 'latitude', 'local_authority_ons_district',
        'urban_or_rural_area_decoded', 'lsoa_of_accident_location'
    ]

    vehicle_features = [
        'final_vehicle_count', 'unique_vehicle_types', 'avg_driver_age',
        'male_driver_ratio', 'unique_manoeuvres', 'unique_journey_purposes'
    ]

    casualty_features = [
        'final_total_casualties', 'fatal_casualties_count', 'pedestrian_casualties_count',
        'avg_casualty_age', 'unique_casualty_types'
    ]

    road_features = [
        'first_road_class_decoded', 'speed_limit', 'speed_limit_category',
        'road_type', 'trunk_road_flag_decoded', 'is_major_road', 'is_high_speed'
    ]

    junction_features = [
        'junction_detail', 'junction_control_decoded', 'has_complex_junction',
        'second_road_class_decoded'
    ]

    environmental_features = [
        'light_conditions_decoded', 'weather_conditions_decoded',
        'poor_visibility', 'adverse_weather', 'poor_road_condition',
        'road_surface_conditions'
    ]

    other_features = [
        'pedestrian_crossing', 'special_conditions_at_site', 'carriageway_hazards',
        'did_police_officer_attend_scene_of_accident_decoded'
    ]

    # Combine all relevant features
    all_relevant_features = (
        temporal_features + geographic_features + vehicle_features +
        casualty_features + road_features + junction_features +
        environmental_features + other_features + target_variables
    )

    # Filter to only existing columns
    existing_relevant_features = [col for col in all_relevant_features if col in df_optimized.columns]

    # Add any remaining decoded columns we might have missed
    remaining_decoded = [col for col in df_optimized.columns
                        if col.endswith('_decoded') and col not in existing_relevant_features]

    final_features = existing_relevant_features + remaining_decoded

    # Create the optimized dataset
    df_model_ready = df_optimized[final_features].copy()

    # 3. FINAL DATA CLEANING
    print("Final data cleaning...")

    # Handle any remaining missing values in numeric columns
    numeric_columns = df_model_ready.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        if df_model_ready[col].isna().sum() > 0:
            df_model_ready[col] = df_model_ready[col].fillna(0)
            print(f"   Filled missing values in: {col}")

    # 4. REORDER COLUMNS FOR BETTER ORGANIZATION
    print("Reordering columns logically...")

    optimal_order = [
        # Temporal features
        'accident_date', 'accident_hour', 'time_period', 'accident_month',
        'day_of_week_decoded', 'is_weekend',

        # Geographic features
        'longitude', 'latitude', 'local_authority_ons_district',
        'urban_or_rural_area_decoded', 'lsoa_of_accident_location',

        # Road and infrastructure
        'first_road_class_decoded', 'speed_limit', 'speed_limit_category',
        'road_type', 'trunk_road_flag_decoded', 'is_major_road', 'is_high_speed',

        # Junction information
        'junction_detail', 'junction_control_decoded', 'has_complex_junction',
        'second_road_class_decoded',

        # Environmental conditions
        'light_conditions_decoded', 'weather_conditions_decoded',
        'road_surface_conditions', 'poor_visibility', 'adverse_weather',
        'poor_road_condition',

        # Vehicle information
        'final_vehicle_count', 'unique_vehicle_types', 'avg_driver_age',
        'male_driver_ratio', 'unique_manoeuvres', 'unique_journey_purposes',

        # Casualty information
        'final_total_casualties', 'fatal_casualties_count', 'pedestrian_casualties_count',
        'avg_casualty_age', 'unique_casualty_types',

        # Additional features
        'pedestrian_crossing', 'special_conditions_at_site', 'carriageway_hazards',
        'did_police_officer_attend_scene_of_accident_decoded',

        # Target variables
        'collision_severity_decoded', 'risk_score', 'risk_category'
    ]

    # Filter to existing columns and reorder
    existing_optimal = [col for col in optimal_order if col in df_model_ready.columns]
    remaining_columns = [col for col in df_model_ready.columns if col not in existing_optimal]

    df_model_ready = df_model_ready[existing_optimal + remaining_columns]

    # 6. GENERATE COMPREHENSIVE SUMMARY
    print("\nOPTIMIZED DATASET SUMMARY:")
    print(f"   • Total records: {len(df_model_ready):,}")
    print(f"   • Total features: {len(df_model_ready.columns)}")
    print(f"   • File: {output_filename}")

    # Feature categories count
    temporal_count = len([col for col in df_model_ready.columns if col in temporal_features])
    geographic_count = len([col for col in df_model_ready.columns if col in geographic_features])
    vehicle_count = len([col for col in df_model_ready.columns if col in vehicle_features])
    road_count = len([col for col in df_model_ready.columns if col in road_features])
    environmental_count = len([col for col in df_model_ready.columns if col in environmental_features])
    decoded_count = len([col for col in df_model_ready.columns if col.endswith('_decoded')])

    print(f"\nFEATURE BREAKDOWN:")
    print(f"   • Temporal features: {temporal_count}")
    print(f"   • Geographic features: {geographic_count}")
    print(f"   • Vehicle features: {vehicle_count}")
    print(f"   • Road features: {road_count}")
    print(f"   • Environmental features: {environmental_count}")
    print(f"   • Decoded columns: {decoded_count}")

    # Target variable info
    if 'risk_category' in df_model_ready.columns:
        high_risk = (df_model_ready['risk_category'] == 'High Risk').sum()
        print(f"   • High-risk cases: {high_risk:,} ({high_risk/len(df_model_ready)*100:.1f}%)")

    if 'collision_severity_decoded' in df_model_ready.columns:
        severity_counts = df_model_ready['collision_severity_decoded'].value_counts()
        print(f"   • Severity distribution: {dict(severity_counts)}")

    print(f" FIRST 5 COLUMNS: {list(df_model_ready.columns[:5])}")
    print(f"LAST 5 COLUMNS: {list(df_model_ready.columns[-5:])}")

    return df_model_ready


CREATING OPTIMIZED DATASET FOR MODEL TRAINING...


In [35]:
# Create the optimized dataset
df_model_ready = create_optimized_model_dataset(final_data, "road_safety_model_ready.csv")

Removing duplicate and unnecessary columns...
   Removed 11 duplicate/unnecessary columns
Selecting relevant features for ML...
Final data cleaning...
   Filled missing values in: unique_manoeuvres
   Filled missing values in: avg_casualty_age
   Filled missing values in: unique_casualty_types
Reordering columns logically...

OPTIMIZED DATASET SUMMARY:
   • Total records: 52,657
   • Total features: 46
   • File: road_safety_model_ready.csv

FEATURE BREAKDOWN:
   • Temporal features: 6
   • Geographic features: 5
   • Vehicle features: 6
   • Road features: 6
   • Environmental features: 6
   • Decoded columns: 11
   • High-risk cases: 13,597 (25.8%)
   • Severity distribution: {'Slight': np.int64(40336), 'Serious': np.int64(11691), 'Fatal': np.int64(630)}
 FIRST 5 COLUMNS: ['accident_date', 'accident_hour', 'time_period', 'accident_month', 'day_of_week_decoded']
LAST 5 COLUMNS: ['did_police_officer_attend_scene_of_accident_decoded', 'collision_severity_decoded', 'risk_score', 'risk_ca

In [36]:
df_model_ready.columns

Index(['accident_date', 'accident_hour', 'time_period', 'accident_month',
       'day_of_week_decoded', 'is_weekend', 'longitude', 'latitude',
       'local_authority_ons_district', 'urban_or_rural_area_decoded',
       'lsoa_of_accident_location', 'first_road_class_decoded', 'speed_limit',
       'speed_limit_category', 'trunk_road_flag_decoded', 'is_major_road',
       'is_high_speed', 'junction_detail', 'junction_control_decoded',
       'has_complex_junction', 'second_road_class_decoded',
       'light_conditions_decoded', 'weather_conditions_decoded',
       'road_surface_conditions', 'poor_visibility', 'adverse_weather',
       'poor_road_condition', 'final_vehicle_count', 'unique_vehicle_types',
       'avg_driver_age', 'male_driver_ratio', 'unique_manoeuvres',
       'unique_journey_purposes', 'final_total_casualties',
       'fatal_casualties_count', 'pedestrian_casualties_count',
       'avg_casualty_age', 'unique_casualty_types', 'pedestrian_crossing',
       'special_condit

In [37]:
cols_to_drop = [
    'lsoa_of_accident_location',
    'special_conditions_at_site',
    'carriageway_hazards',
    'unique_journey_purposes',
    'speed_limit_category'
]

df_finals = df_model_ready.drop(columns=cols_to_drop, errors='ignore')

df_finals.shape

(52657, 41)

In [38]:
df_finals.columns

Index(['accident_date', 'accident_hour', 'time_period', 'accident_month',
       'day_of_week_decoded', 'is_weekend', 'longitude', 'latitude',
       'local_authority_ons_district', 'urban_or_rural_area_decoded',
       'first_road_class_decoded', 'speed_limit', 'trunk_road_flag_decoded',
       'is_major_road', 'is_high_speed', 'junction_detail',
       'junction_control_decoded', 'has_complex_junction',
       'second_road_class_decoded', 'light_conditions_decoded',
       'weather_conditions_decoded', 'road_surface_conditions',
       'poor_visibility', 'adverse_weather', 'poor_road_condition',
       'final_vehicle_count', 'unique_vehicle_types', 'avg_driver_age',
       'male_driver_ratio', 'unique_manoeuvres', 'final_total_casualties',
       'fatal_casualties_count', 'pedestrian_casualties_count',
       'avg_casualty_age', 'unique_casualty_types', 'pedestrian_crossing',
       'did_police_officer_attend_scene_of_accident_decoded',
       'collision_severity_decoded', 'risk_scor

In [39]:
from google.colab import files

output_filename = "df_finals.csv"
df_model_ready.to_csv(output_filename, index=False)
files.download(output_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>