## Encoding Categorical Columns and Scaling Numeric Features

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Show every column when a DataFrame is displayed (None removes the cap).
pd.options.display.max_columns = None

In [2]:
# Read the input table; every later cell derives its data from this frame.
df = pd.read_csv(filepath_or_buffer='../data/agg_df.csv')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,accident_number,date,day_of_week,hour,road_category,road_layout,num_lanes,reserved_lane,road_profile,road_shape,surface_condition,infrastructure,road_location,speed_limit,injury_severity,users_involved,light_conditions,department,urban_area,intersection_type,weather,collision_type,latitude,longitude
0,201900000001,2019-11-30,Saturday,1,Major Roads,Multi Lane,10,No value,Flat,Curved,Normal,Tunnel / Bridge,Road,70,4,3,Night,Seine-Saint-Denis,Outside urban area,No junction,Normal Visibility,2-car collision,48.89621,2.47012
1,201900000002,2019-11-30,Saturday,2,Major Roads,One Way,2,No value,Slope / Near Slope,Curved,Normal,No value,Road,70,4,1,Night,Seine-Saint-Denis,Outside urban area,No junction,Normal Visibility,Multi-car collision,48.9307,2.3688
2,201900000003,2019-11-28,Thursday,15,Major Roads,Multi Lane,8,No value,Flat,Curved,Normal,No value,Road,90,4,4,Day,Hauts-de-Seine,Outside urban area,No junction,Normal Visibility,Multi-car collision,48.935872,2.319174
3,201900000004,2019-11-30,Saturday,20,Major Roads,Multi Lane,5,No value,Flat,Straight,Normal,No value,Road,90,4,4,Night,Val-de-Marne,Outside urban area,No junction,Normal Visibility,Multi-car collision,48.817329,2.42815
4,201900000005,2019-11-30,Saturday,4,Major Roads,One Way,3,No value,Flat,Curved,Normal,Tunnel / Bridge,Road,90,4,3,Night,Val-de-Marne,Outside urban area,No junction,Normal Visibility,2-car collision,48.776362,2.433254


In [5]:
# accident_number is not needed from this point on, so remove it.
#
# The result is reassigned instead of using inplace=True, and errors="ignore"
# is passed, so that executing this cell a second time is a no-op rather than
# a KeyError on the already-removed column.
df = df.drop(columns=["accident_number"], errors="ignore")

In [6]:
# Collect the categorical columns: every object-dtype column except "date".
object_cols = df.select_dtypes(include=["object"]).columns
cat_cols = object_cols.drop("date").tolist()
cat_cols

['day_of_week',
 'road_category',
 'road_layout',
 'reserved_lane',
 'road_profile',
 'road_shape',
 'surface_condition',
 'infrastructure',
 'road_location',
 'light_conditions',
 'department',
 'urban_area',
 'intersection_type',
 'weather',
 'collision_type']

In [7]:
# One-hot encode every categorical column. drop_first=True omits the first
# level of each feature, leaving k-1 indicator columns for k levels.
df_encoded = pd.get_dummies(
    data=df,
    columns=cat_cols,
    drop_first=True,
)

In [8]:
# Numeric feature columns to be scaled.
# Excluded: injury_severity (the target) and hour (gets cyclical encoding instead).
numeric_index = df.select_dtypes(include=["number"]).columns
num_cols = numeric_index.drop(["injury_severity", "hour"]).tolist()
num_cols


['num_lanes', 'speed_limit', 'users_involved', 'latitude', 'longitude']

In [9]:
# Standardise the numeric columns and store them under "<name>_scaled".
# NOTE(review): the scaler is fit on every row of df. If a train/test split is
# made downstream, the held-out rows have influenced the fitted statistics;
# confirm that this is intended.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[num_cols])
scaled_names = [f"{col}_scaled" for col in num_cols]
scaled_df = pd.DataFrame(scaled_features, index=df.index, columns=scaled_names)

# Append the scaled columns to the right-hand side of the encoded frame.
df_encoded = pd.concat([df_encoded, scaled_df], axis=1)

In [10]:
# Move the scaled columns from the end of the frame to positions 5 through 9,
# in the order listed here.
scaled_order = (
    "num_lanes_scaled",
    "speed_limit_scaled",
    "users_involved_scaled",
    "latitude_scaled",
    "longitude_scaled",
)
for target_pos, col_name in enumerate(scaled_order, start=5):
    # pop() removes the column first; insert() then places it at target_pos.
    moved = df_encoded.pop(col_name)
    df_encoded.insert(target_pos, col_name, moved)

In [11]:
# Cyclical encoding for hour: map the 24-hour clock onto a circle so the
# sine/cosine pair treats hour 23 and hour 0 as neighbours.
hour_angle = 2 * np.pi * df_encoded["hour"]/24
df_encoded["hour_sin"] = np.sin(hour_angle)
df_encoded["hour_cos"] = np.cos(hour_angle)

# Relocate the two new columns to positions 2 and 3.
for target_pos, col_name in ((2, "hour_sin"), (3, "hour_cos")):
    df_encoded.insert(target_pos, col_name, df_encoded.pop(col_name))

In [12]:
# Remove the raw versions of three columns that now have *_scaled counterparts.
# The raw latitude/longitude columns are not dropped here; a later cell moves
# them towards the front of the frame.
#
# The result is reassigned instead of using inplace=True, and errors="ignore"
# is passed, so that executing this cell a second time is a no-op rather than
# a KeyError on the already-removed columns.
df_encoded = df_encoded.drop(
    columns=["num_lanes", "speed_limit", "users_involved"],
    errors="ignore",
)

In [13]:
# Move the raw coordinates and the target column to positions 2, 3 and 4.
front_order = ("latitude", "longitude", "injury_severity")
for target_pos, col_name in enumerate(front_order, start=2):
    moved = df_encoded.pop(col_name)
    df_encoded.insert(target_pos, col_name, moved)

In [14]:
# Preview the first five rows of the encoded frame to check the column layout.
df_encoded.head(n=5)

Unnamed: 0,date,hour,latitude,longitude,injury_severity,hour_sin,hour_cos,num_lanes_scaled,speed_limit_scaled,users_involved_scaled,latitude_scaled,longitude_scaled,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,road_category_Major Roads,road_category_Other / Off-Network,road_category_Secondary Roads,road_layout_One Way,road_layout_Two Way,reserved_lane_No value,reserved_lane_Reserved Lane,road_profile_Slope / Near Slope,road_shape_Straight,surface_condition_Wet / Slippery,infrastructure_No value,infrastructure_Other,infrastructure_Tunnel / Bridge,road_location_Other,road_location_Reserved Lanes,road_location_Road,light_conditions_Night,light_conditions_Twilight,department_Hauts-de-Seine,department_Paris,department_Seine-Saint-Denis,department_Seine-et-Marne,department_Val-d'Oise,department_Val-de-Marne,department_Yvelines,urban_area_Outside urban area,intersection_type_No junction,intersection_type_Other junction,intersection_type_Simple junction,weather_Reduced Traction,weather_Reduced Visibility,collision_type_Multi-car collision,collision_type_No collision
0,2019-11-30,1,48.89621,2.47012,4,0.258819,0.965926,4.220637,0.614804,0.721277,0.286293,0.648763,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,True,False,False,False,True,False,False,False,False,True,True,False,False,False,False,False,False
1,2019-11-30,2,48.9307,2.3688,4,0.5,0.866025,-0.513104,0.614804,-1.17996,0.476901,0.020106,False,True,False,False,False,False,True,False,False,True,False,True,False,True,False,False,True,False,False,False,False,True,True,False,False,False,True,False,False,False,False,True,True,False,False,False,False,True,False
2,2019-11-28,15,48.935872,2.319174,4,-0.707107,-0.707107,3.037202,1.523458,1.671896,0.505483,-0.287804,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,True,False,False,False,False,True,False
3,2019-11-30,20,48.817329,2.42815,4,-0.866025,0.5,1.262049,1.523458,1.671896,-0.14964,0.388354,False,True,False,False,False,False,True,False,False,False,False,True,False,False,True,False,True,False,False,False,False,True,True,False,False,False,False,False,False,True,False,True,True,False,False,False,False,True,False
4,2019-11-30,4,48.776362,2.433254,4,0.866025,0.5,0.078613,1.523458,0.721277,-0.376046,0.420021,False,True,False,False,False,False,True,False,False,True,False,True,False,False,False,False,False,False,True,False,False,True,True,False,False,False,False,False,False,True,False,True,True,False,False,False,False,False,False


In [15]:
# Sanity check: (rows, columns) of the encoded frame.
df_encoded.shape

(65093, 51)

In [16]:
# Write the encoded frame to disk. index=False leaves the row index out of the file.
df_encoded.to_csv(path_or_buf='../data/encoded_df.csv', index=False)