In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

print("Environment ready ✅")


Environment ready ✅


In [2]:
data_path = "../data/raw/accidents.csv"

df = pd.read_csv(data_path)

print("Dataset loaded successfully ✅")
print("Shape:", df.shape)

df.head()


  df = pd.read_csv(data_path)


Dataset loaded successfully ✅
Shape: (2047256, 34)


Unnamed: 0,Accident_Index,1st_Road_Class,1st_Road_Number,2nd_Road_Class,2nd_Road_Number,Accident_Severity,Carriageway_Hazards,Date,Day_of_Week,Did_Police_Officer_Attend_Scene_of_Accident,Junction_Control,Junction_Detail,Latitude,Light_Conditions,Local_Authority_(District),Local_Authority_(Highway),Location_Easting_OSGR,Location_Northing_OSGR,Longitude,LSOA_of_Accident_Location,Number_of_Casualties,Number_of_Vehicles,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Police_Force,Road_Surface_Conditions,Road_Type,Special_Conditions_at_Site,Speed_limit,Time,Urban_or_Rural_Area,Weather_Conditions,Year,InScotland
0,200501BS00001,A,3218.0,,0.0,Serious,,2005-01-04,Tuesday,1.0,Data missing or out of range,Not at junction or within 20 metres,51.489096,Daylight,Kensington and Chelsea,Kensington and Chelsea,525680.0,178240.0,-0.19117,E01002849,1,1,0.0,1.0,Metropolitan Police,Wet or damp,Single carriageway,,30.0,17:42,Urban,Raining no high winds,2005,No
1,200501BS00002,B,450.0,C,0.0,Slight,,2005-01-05,Wednesday,1.0,Auto traffic signal,Crossroads,51.520075,Darkness - lights lit,Kensington and Chelsea,Kensington and Chelsea,524170.0,181650.0,-0.211708,E01002909,1,1,0.0,5.0,Metropolitan Police,Dry,Dual carriageway,,30.0,17:36,Urban,Fine no high winds,2005,No
2,200501BS00003,C,0.0,,0.0,Slight,,2005-01-06,Thursday,1.0,Data missing or out of range,Not at junction or within 20 metres,51.525301,Darkness - lights lit,Kensington and Chelsea,Kensington and Chelsea,524520.0,182240.0,-0.206458,E01002857,1,2,0.0,0.0,Metropolitan Police,Dry,Single carriageway,,30.0,00:15,Urban,Fine no high winds,2005,No
3,200501BS00004,A,3220.0,,0.0,Slight,,2005-01-07,Friday,1.0,Data missing or out of range,Not at junction or within 20 metres,51.482442,Daylight,Kensington and Chelsea,Kensington and Chelsea,526900.0,177530.0,-0.173862,E01002840,1,1,0.0,0.0,Metropolitan Police,Dry,Single carriageway,,30.0,10:35,Urban,Fine no high winds,2005,No
4,200501BS00005,Unclassified,0.0,,0.0,Slight,,2005-01-10,Monday,1.0,Data missing or out of range,Not at junction or within 20 metres,51.495752,Darkness - lighting unknown,Kensington and Chelsea,Kensington and Chelsea,528060.0,179040.0,-0.156618,E01002863,1,1,0.0,0.0,Metropolitan Police,Wet or damp,Single carriageway,,30.0,21:13,Urban,Fine no high winds,2005,No


In [3]:
columns_needed = [
    'Accident_Severity',
    'Latitude',
    'Longitude',
    'Date',
    'Time',
    'Weather_Conditions',
    'Road_Surface_Conditions',
    'Light_Conditions',
    'Speed_limit',
    'Urban_or_Rural_Area'
]

df = df[columns_needed]

print("Columns selected ✅")
print("New shape:", df.shape)

df.head()


Columns selected ✅
New shape: (2047256, 10)


Unnamed: 0,Accident_Severity,Latitude,Longitude,Date,Time,Weather_Conditions,Road_Surface_Conditions,Light_Conditions,Speed_limit,Urban_or_Rural_Area
0,Serious,51.489096,-0.19117,2005-01-04,17:42,Raining no high winds,Wet or damp,Daylight,30.0,Urban
1,Slight,51.520075,-0.211708,2005-01-05,17:36,Fine no high winds,Dry,Darkness - lights lit,30.0,Urban
2,Slight,51.525301,-0.206458,2005-01-06,00:15,Fine no high winds,Dry,Darkness - lights lit,30.0,Urban
3,Slight,51.482442,-0.173862,2005-01-07,10:35,Fine no high winds,Dry,Daylight,30.0,Urban
4,Slight,51.495752,-0.156618,2005-01-10,21:13,Fine no high winds,Wet or damp,Darkness - lighting unknown,30.0,Urban


In [4]:
print("Missing values before cleaning:\n")
print(df.isnull().sum())

# Drop rows missing critical spatial or severity info
df.dropna(subset=['Latitude', 'Longitude', 'Accident_Severity'], inplace=True)

print("\nAfter cleaning:")
print(df.isnull().sum())

print("\nFinal shape:", df.shape)


Missing values before cleaning:

Accident_Severity            0
Latitude                   174
Longitude                  175
Date                         0
Time                       156
Weather_Conditions           0
Road_Surface_Conditions      0
Light_Conditions             0
Speed_limit                 37
Urban_or_Rural_Area          0
dtype: int64

After cleaning:
Accident_Severity            0
Latitude                     0
Longitude                    0
Date                         0
Time                       156
Weather_Conditions           0
Road_Surface_Conditions      0
Light_Conditions             0
Speed_limit                 37
Urban_or_Rural_Area          0
dtype: int64

Final shape: (2047081, 10)


In [5]:
# Convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Extract useful time features
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day
df['day_of_week'] = df['Date'].dt.dayofweek

df[['Date', 'year', 'month', 'day_of_week']].head()


Unnamed: 0,Date,year,month,day_of_week
0,2005-01-04,2005,1,1
1,2005-01-05,2005,1,2
2,2005-01-06,2005,1,3
3,2005-01-07,2005,1,4
4,2005-01-10,2005,1,0


In [6]:
# Convert Time column to datetime (hour only)
df['hour'] = pd.to_datetime(df['Time'], format='%H:%M', errors='coerce').dt.hour

df[['Time', 'hour']].head()


Unnamed: 0,Time,hour
0,17:42,17.0
1,17:36,17.0
2,00:15,0.0
3,10:35,10.0
4,21:13,21.0


In [7]:
def map_risk(severity):
    if severity == 'Fatal':
        return 2   # High risk
    elif severity == 'Serious':
        return 1   # Medium risk
    else:
        return 0   # Low risk

df['accident_risk_level'] = df['Accident_Severity'].apply(map_risk)

df['accident_risk_level'].value_counts()


accident_risk_level
0    1734402
1     286310
2      26369
Name: count, dtype: int64

In [8]:
# Drop raw columns we no longer need
df.drop(columns=['Date', 'Time', 'Accident_Severity'], inplace=True)

print("Columns after cleanup:")
print(df.columns)

print("\nFinal dataset shape:", df.shape)


Columns after cleanup:
Index(['Latitude', 'Longitude', 'Weather_Conditions',
       'Road_Surface_Conditions', 'Light_Conditions', 'Speed_limit',
       'Urban_or_Rural_Area', 'year', 'month', 'day', 'day_of_week', 'hour',
       'accident_risk_level'],
      dtype='str')

Final dataset shape: (2047081, 13)


In [9]:
output_path = "../data/processed/accidents_cleaned.csv"

df.to_csv(output_path, index=False)

print("Cleaned dataset saved successfully ✅")


Cleaned dataset saved successfully ✅
