## Encoding Categorical Columns

In [108]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Show every column when a DataFrame is displayed (no truncation of wide frames)
pd.options.display.max_columns = None

In [109]:
# Load the aggregated dataset that the rest of the notebook encodes
agg_csv_path = '../data/agg_df.csv'
df = pd.read_csv(agg_csv_path)

In [110]:
# accident_number is no longer needed.
# department is dropped because its granularity is too fine: treated as a
# categorical column it would produce too many dummy columns, and it is
# redundant with the region column.
unused_cols = ['accident_number', 'department']
df = df.drop(columns=unused_cols)

In [111]:
# Get categorical columns: every object-dtype column except date
object_index = df.select_dtypes(include=["object"]).columns
cat_cols = object_index.drop("date").tolist()
cat_cols

['day_of_week',
 'road_category',
 'road_layout',
 'reserved_lane',
 'road_profile',
 'road_shape',
 'surface_condition',
 'infrastructure',
 'road_location',
 'light_conditions',
 'urban_area',
 'intersection_type',
 'weather',
 'collision_type',
 'region']

In [112]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column, so that level becomes the implicit baseline.
df_encoded = pd.get_dummies(
    data=df,
    columns=cat_cols,
    drop_first=True,
)

In [113]:
# Numeric feature names, excluding the target column (injury_severity)
# and hour, which needs cyclical encoding rather than scaling
numeric_index = df.select_dtypes(include=["number"]).columns
num_cols = numeric_index.drop(["injury_severity", "hour"]).tolist()
num_cols


['num_lanes', 'speed_limit', 'users_involved', 'latitude', 'longitude']

In [114]:
# Standardise the numeric columns; the scaler is fitted on all rows of df
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[num_cols])

# Wrap the scaled array in a frame sharing df's index, tagging each column "<name>_scaled"
scaled_df = pd.DataFrame(scaled_features, columns=num_cols, index=df.index).add_suffix("_scaled")

# Append the scaled columns to the right of the one-hot encoded frame
df_encoded = pd.concat([df_encoded, scaled_df], axis=1)

In [115]:
# Move the scaled columns towards the front of df_encoded (positions 5-9)
scaled_front_cols = [
    "num_lanes_scaled",
    "speed_limit_scaled",
    "users_involved_scaled",
    "latitude_scaled",
    "longitude_scaled",
]
for position, col_name in enumerate(scaled_front_cols, start=5):
    # pop() removes the column from its current spot before it is re-inserted
    df_encoded.insert(position, col_name, df_encoded.pop(col_name))

In [116]:
# Cyclical encoding for hour: map the hour onto a 24-step circle and keep
# its sine and cosine components
hour_angle = 2 * np.pi * df_encoded["hour"] / 24
df_encoded["hour_sin"] = np.sin(hour_angle)
df_encoded["hour_cos"] = np.cos(hour_angle)

# Move both components towards the front of df_encoded (positions 2 and 3)
for position, col_name in enumerate(["hour_sin", "hour_cos"], start=2):
    df_encoded.insert(position, col_name, df_encoded.pop(col_name))

In [117]:
# Remove the unscaled originals of three scaled features
# (latitude and longitude are not dropped here)
unscaled_cols = ['num_lanes', 'speed_limit', 'users_involved']
df_encoded = df_encoded.drop(columns=unscaled_cols)

In [118]:
# Move the raw coordinates and injury_severity to positions 2-4 of df_encoded
front_cols = ["latitude", "longitude", "injury_severity"]
for position, col_name in enumerate(front_cols, start=2):
    # pop() removes the column from its current spot before it is re-inserted
    df_encoded.insert(position, col_name, df_encoded.pop(col_name))

In [119]:
# Preview the first rows to check the column order and the encoded values
df_encoded.head()

Unnamed: 0,date,hour,latitude,longitude,injury_severity,hour_sin,hour_cos,num_lanes_scaled,speed_limit_scaled,users_involved_scaled,latitude_scaled,longitude_scaled,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,road_category_Major Roads,road_category_Other / Off-Network,road_category_Secondary Roads,road_layout_One Way,road_layout_Two Way,reserved_lane_No value,reserved_lane_Reserved Lane,road_profile_Slope / Near Slope,road_shape_Straight,surface_condition_Wet / Slippery,infrastructure_No value,infrastructure_Other,infrastructure_Tunnel / Bridge,road_location_Other,road_location_Reserved Lanes,road_location_Road,light_conditions_Night,light_conditions_Twilight,urban_area_Outside urban area,intersection_type_No junction,intersection_type_Other junction,intersection_type_Simple junction,weather_Reduced Traction,weather_Reduced Visibility,collision_type_Multi-car collision,collision_type_No collision,region_Bourgogne-Franche-Comté,region_Bretagne,region_Centre-Val de Loire,region_Grand Est,region_Hauts-de-France,region_Normandie,region_Nouvelle-Aquitaine,region_Occitanie,region_Pays de la Loire,region_Provence-Alpes-Côte d’Azur,region_Île-de-France
0,2019-11-30,1,48.89621,2.47012,4,0.258819,0.965926,5.875965,0.449901,0.618866,0.802672,-0.046186,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
1,2019-11-30,2,48.9307,2.3688,4,0.5,0.866025,-0.368344,0.449901,-1.0479,0.818374,-0.087279,False,True,False,False,False,False,True,False,False,True,False,True,False,True,False,False,True,False,False,False,False,True,True,False,True,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True
2,2019-11-28,15,48.935872,2.319174,4,-0.707107,-0.707107,4.314888,1.366414,1.452249,0.820729,-0.107405,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,True,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True
3,2019-11-30,20,48.817329,2.42815,4,-0.866025,0.5,1.973272,1.366414,1.452249,0.766759,-0.063208,False,True,False,False,False,False,True,False,False,False,False,True,False,False,True,False,True,False,False,False,False,True,True,False,True,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True
4,2019-11-30,4,48.776362,2.433254,4,0.866025,0.5,0.412195,1.366414,0.618866,0.748108,-0.061138,False,True,False,False,False,False,True,False,False,True,False,True,False,False,False,False,False,False,True,False,False,True,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True


In [120]:
# (rows, columns) of the encoded frame
df_encoded.shape

(204145, 55)

In [121]:
# Persist the encoded frame; index=False keeps the row index out of the file
encoded_csv_path = '../data/encoded_df.csv'
df_encoded.to_csv(encoded_csv_path, index=False)