In [4]:
import pandas as pd

In [5]:
# Load the integrated collision dataset.
# on_bad_lines='skip' drops malformed rows without raising, so the row count
# may be lower than the number of lines in the file; low_memory=False parses
# the file in one pass to avoid mixed-type column inference.
df_integrated = pd.read_csv(
    "/content/df_integrated.csv",
    on_bad_lines='skip',
    low_memory=False,
)

In [2]:
import numpy as np

# Banner printed when this cell runs. The cell only defines engineer_features();
# the feature engineering itself happens in the cell that calls the function.
print("FEATURE ENGINEERING...")

def engineer_features(df):
    """
    Create meaningful features for analysis.

    The input frame is copied, so the caller's DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'date', 'time', 'day_of_week',
        'first_road_class', 'speed_limit', 'junction_detail',
        'light_conditions', 'weather_conditions', 'road_surface_conditions',
        'collision_severity', 'fatal_casualties_count' and
        'pedestrian_casualties_count', plus at least one of
        'number_of_vehicles' / 'calculated_vehicle_count' and at least one of
        'number_of_casualties' / 'calculated_total_casualties'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these columns added: 'accident_date',
        'accident_hour', 'accident_month', 'accident_year', 'is_weekend',
        'time_period', 'is_major_road', 'is_high_speed',
        'has_complex_junction', 'poor_visibility', 'adverse_weather',
        'poor_road_condition', 'final_vehicle_count',
        'final_total_casualties', 'severity_weight', 'risk_score' and
        'risk_category'.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    df_engineered = df.copy()

    # 1. Temporal Features
    print("Creating temporal features...")

    # Date parsing with correct format (DD/MM/YYYY); unparseable values become NaT
    df_engineered['accident_date'] = pd.to_datetime(df_engineered['date'], dayfirst=True, errors='coerce')

    # Time parsing (HH:MM); unparseable values yield a missing hour
    df_engineered['accident_hour'] = pd.to_datetime(df_engineered['time'], format='%H:%M', errors='coerce').dt.hour

    # Extract time components
    df_engineered['accident_month'] = df_engineered['accident_date'].dt.month
    df_engineered['accident_year'] = df_engineered['accident_date'].dt.year
    # NOTE(review): presumably day_of_week is coded 1=Sunday ... 7=Saturday -- confirm against the data dictionary
    df_engineered['is_weekend'] = df_engineered['day_of_week'].isin([1, 7]).astype(int)

    # Time of day categories.
    # Hours are whole numbers 0-23, so the bins must be left-closed:
    # [0, 6) Late Night, [6, 10) Morning Rush, [10, 16) Daytime,
    # [16, 20) Evening Rush, [20, 24) Night.
    # With pandas' default right-closed bins the boundary hours 6, 10, 16 and
    # 20 would each be assigned to the preceding period.
    df_engineered['time_period'] = pd.cut(
        df_engineered['accident_hour'],
        bins=[0, 6, 10, 16, 20, 24],
        labels=['Late Night', 'Morning Rush', 'Daytime', 'Evening Rush', 'Night'],
        right=False
    )

    # 2. Road Features
    # NOTE(review): the integer codes below are taken as-is from the original
    # notebook; their meaning is not visible here -- confirm against the data dictionary.
    print("Creating road features...")
    df_engineered['is_major_road'] = df_engineered['first_road_class'].isin([1, 2, 3]).astype(int)
    df_engineered['is_high_speed'] = (df_engineered['speed_limit'] >= 60).astype(int)
    df_engineered['has_complex_junction'] = df_engineered['junction_detail'].isin([16, 17]).astype(int)

    # 3. Environmental Features
    print("Creating environmental features...")
    df_engineered['poor_visibility'] = df_engineered['light_conditions'].isin([4, 5, 6, 7]).astype(int)
    df_engineered['adverse_weather'] = df_engineered['weather_conditions'].isin([2, 3, 4, 5, 6, 7]).astype(int)
    df_engineered['poor_road_condition'] = df_engineered['road_surface_conditions'].isin([2, 3, 4, 5]).astype(int)

    # 4. Resolve Count Discrepancies
    print("Resolving count discrepancies...")

    # Vehicle count - use maximum of reported vs calculated
    df_engineered['final_vehicle_count'] = _resolve_count(
        df_engineered, 'number_of_vehicles', 'calculated_vehicle_count'
    )

    # Casualty count - use maximum of reported vs calculated
    df_engineered['final_total_casualties'] = _resolve_count(
        df_engineered, 'number_of_casualties', 'calculated_total_casualties'
    )

    # 5. Risk Scoring
    print("Creating risk scores...")
    severity_map = {1: 3, 2: 2, 3: 1}  # Fatal=3, Serious=2, Slight=1
    # Severity codes outside the map become NaN and propagate into risk_score.
    df_engineered['severity_weight'] = df_engineered['collision_severity'].map(severity_map)

    df_engineered['risk_score'] = (
        df_engineered['severity_weight'] * 3 +
        df_engineered['fatal_casualties_count'] * 5 +
        df_engineered['pedestrian_casualties_count'] * 4 +
        df_engineered['final_total_casualties'] * 1
    )

    # Risk categories: scores at or above the 75th percentile are 'High Risk'.
    # A NaN risk_score compares False and therefore ends up as 'Low Risk'.
    risk_threshold = df_engineered['risk_score'].quantile(0.75)
    df_engineered['risk_category'] = np.where(
        df_engineered['risk_score'] >= risk_threshold, 'High Risk', 'Low Risk'
    )

    print("Feature engineering complete!")
    return df_engineered


def _resolve_count(frame, reported_col, calculated_col):
    """Return the row-wise max of the two count columns, or whichever one exists."""
    has_reported = reported_col in frame.columns
    has_calculated = calculated_col in frame.columns
    if has_reported and has_calculated:
        return frame[[reported_col, calculated_col]].max(axis=1)
    if has_reported:
        return frame[reported_col]
    # Raises KeyError when neither column is present.
    return frame[calculated_col]

FEATURE ENGINEERING...


In [6]:
# Apply feature engineering. engineer_features() works on a copy, so
# df_integrated is left unchanged and the result is bound to a new name.
df_with_features = engineer_features(df_integrated)

Creating temporal features...
Creating road features...
Creating environmental features...
Resolving count discrepancies...
Creating risk scores...
Feature engineering complete!


In [7]:
# Preview the first rows to check that the engineered columns were added.
display(df_with_features.head())

Unnamed: 0,collision_index,collision_year,collision_ref_no,location_easting_osgr,location_northing_osgr,longitude,latitude,police_force,collision_severity,number_of_vehicles,...,is_high_speed,has_complex_junction,poor_visibility,adverse_weather,poor_road_condition,final_vehicle_count,final_total_casualties,severity_weight,risk_score,risk_category
0,202417H103224,2024,17H103224,448894,532505,-1.24312,54.68523,17,3,2,...,0,0,0,0,0,2.0,1,1,4,Low Risk
1,202417M217924,2024,17M217924,452135,519436,-1.19517,54.56747,17,2,2,...,0,1,1,0,0,2.0,1,2,7,Low Risk
2,202417S204524,2024,17S204524,445427,522924,-1.29837,54.59946,17,3,2,...,0,0,0,1,1,2.0,2,1,5,Low Risk
3,2024481510889,2024,481510889,533587,181174,-0.07626,51.51371,48,2,1,...,0,0,1,0,1,1.0,1,2,11,High Risk
4,2024481563500,2024,481563500,532676,180902,-0.08948,51.51148,48,2,1,...,0,0,0,0,0,1.0,2,2,12,High Risk


In [8]:
# DataFrame.info() prints its summary itself and returns None, so call it
# directly; wrapping it in display() only appends a stray "None" to the output.
df_with_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52657 entries, 0 to 52656
Data columns (total 63 columns):
 #   Column                                       Non-Null Count  Dtype         
---  ------                                       --------------  -----         
 0   collision_index                              52657 non-null  object        
 1   collision_year                               52657 non-null  int64         
 2   collision_ref_no                             52657 non-null  object        
 3   location_easting_osgr                        52657 non-null  int64         
 4   location_northing_osgr                       52657 non-null  int64         
 5   longitude                                    52657 non-null  float64       
 6   latitude                                     52657 non-null  float64       
 7   police_force                                 52657 non-null  int64         
 8   collision_severity                           52657 non-null  int64         


None

In [9]:
# Raw source columns that the engineered features replace
# (accident_date / accident_hour and the final_* counts).
columns_to_drop = [
    'date',
    'time',
    'number_of_vehicles',
    'number_of_casualties',
    'calculated_vehicle_count',
    'calculated_total_casualties'
]

# engineer_features() treats the reported/calculated count columns as optional,
# so tolerate columns that are absent instead of raising KeyError.
df_cleaned = df_with_features.drop(columns=columns_to_drop, errors='ignore')

# Report the number of columns actually removed, not the length of the wish list.
print(f"Dropped {df_with_features.shape[1] - df_cleaned.shape[1]} redundant columns.")
display(df_cleaned.head())

Dropped 6 redundant columns.


Unnamed: 0,collision_index,collision_year,collision_ref_no,location_easting_osgr,location_northing_osgr,longitude,latitude,police_force,collision_severity,day_of_week,...,is_high_speed,has_complex_junction,poor_visibility,adverse_weather,poor_road_condition,final_vehicle_count,final_total_casualties,severity_weight,risk_score,risk_category
0,202417H103224,2024,17H103224,448894,532505,-1.24312,54.68523,17,3,6,...,0,0,0,0,0,2.0,1,1,4,Low Risk
1,202417M217924,2024,17M217924,452135,519436,-1.19517,54.56747,17,2,5,...,0,1,1,0,0,2.0,1,2,7,Low Risk
2,202417S204524,2024,17S204524,445427,522924,-1.29837,54.59946,17,3,5,...,0,0,0,1,1,2.0,2,1,5,Low Risk
3,2024481510889,2024,481510889,533587,181174,-0.07626,51.51371,48,2,5,...,0,0,1,0,1,1.0,1,2,11,High Risk
4,2024481563500,2024,481563500,532676,180902,-0.08948,51.51148,48,2,6,...,0,0,0,0,0,1.0,2,2,12,High Risk


In [10]:
# DataFrame.info() prints its summary itself and returns None, so call it
# directly; wrapping it in display() only appends a stray "None" to the output.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52657 entries, 0 to 52656
Data columns (total 57 columns):
 #   Column                                       Non-Null Count  Dtype         
---  ------                                       --------------  -----         
 0   collision_index                              52657 non-null  object        
 1   collision_year                               52657 non-null  int64         
 2   collision_ref_no                             52657 non-null  object        
 3   location_easting_osgr                        52657 non-null  int64         
 4   location_northing_osgr                       52657 non-null  int64         
 5   longitude                                    52657 non-null  float64       
 6   latitude                                     52657 non-null  float64       
 7   police_force                                 52657 non-null  int64         
 8   collision_severity                           52657 non-null  int64         


None