# Feature engineering

In [20]:
import geohash
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [21]:
# Empty placeholder frame; no other visible cell of this notebook reads it.
merged_data = pd.DataFrame()

# Read the pre-cleaned accidents table and report how long the read took.
start = time.time()
df = pd.read_csv("./data/accidents_before_eda_cleaned.csv")
end = time.time()

# Elapsed wall-clock seconds between the two clock readings.
total_time = end - start
print(f"Loaded dataset in {round(total_time, 2)}s")

Loaded dataset in 3.29s


In [22]:
# Print the loaded frame's index range, column names, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2505924 entries, 0 to 2505923
Data columns (total 30 columns):
 #   Column   Dtype  
---  ------   -----  
 0   an       int64  
 1   mois     int64  
 2   jour     int64  
 3   hrmn     int64  
 4   col      float64
 5   lum      int64  
 6   agg      int64  
 7   int      int64  
 8   atm      float64
 9   catr     float64
 10  circ     float64
 11  nbv      float64
 12  vosp     float64
 13  prof     float64
 14  plan     float64
 15  surf     float64
 16  infra    float64
 17  situ     float64
 18  senc     float64
 19  catv     int64  
 20  obs      float64
 21  obsm     float64
 22  choc     float64
 23  manv     float64
 24  place    float64
 25  catu     int64  
 26  sexe     int64  
 27  an_nais  float64
 28  trajet   float64
 29  grav     float64
dtypes: float64(20), int64(10)
memory usage: 573.6 MB


## Encode predicted variable 'grav'

In [23]:
# Re-encode the target column 'grav'.
#   "ordinal": severity scale 1 < 2 < 3 < 4 stored as an ordered categorical.
#   "binary" : two classes (0 / 1) stored as an unordered categorical.
# Series.map() leaves NaN for any value that is not a key of the mapping.
#
# NOTE(review): the labels in `current_order` are the ones this notebook assumes
# for the raw codes. The official BAAC coding is believed to be
# 3 = hospitalised (serious) and 4 = light injury, i.e. the reverse of what is
# assumed here. If so, both mappings below rank/group codes 3 and 4 the wrong
# way round — confirm against the data dictionary before trusting the target.
mapping = "ordinal"
if mapping == "ordinal":
    # Documentation only (never read by the code): raw code -> assumed label.
    current_order = {"1": "Uninjured",
                     "2": "Killed",
                     "3": "Minor injuries",
                     "4": "Serious injuries"}

    # raw code -> ordinal rank (1 = uninjured ... 4 = killed, per the labels above)
    new_mapping = {1: 1,
                   2: 4,
                   3: 2,
                   4: 3}

    # The original cell called df["grav"].astype(int) here and threw the result
    # away, so it never had any effect. The cast is not needed: the int keys of
    # the mapping match the float-typed codes (1 == 1.0), so map() works directly.
    df["grav"] = df["grav"].map(new_mapping)

    grav_order = [1, 2, 3, 4]
    df["grav"] = pd.Categorical(df["grav"], categories=grav_order, ordered=True)

elif mapping == "binary":
    # raw code -> binary class (raw 1 and 3 -> 0, raw 2 and 4 -> 1)
    new_mapping = {1: 0,
                   2: 1,
                   3: 0,
                   4: 1}

    df["grav"] = df["grav"].map(new_mapping)
    df["grav"] = pd.Categorical(df["grav"])


## Recreate commune codes

In [24]:
# Disabled placeholder: meant to (re)build a 'code_commune' column for rows whose
# year 'an' is not 2019, 2020 or 2021.
# NOTE(review): the guarded line cannot work as written if the flag is enabled:
#   * `df["an"] not in [...]` applies Python's `in` to a whole Series, which raises
#     ValueError (ambiguous truth value); a row mask such as
#     ~df["an"].isin([2019, 2020, 2021]) was presumably intended;
#   * df[mask]["code_commune"] = ... is chained indexing, so even with a valid mask
#     the assignment would target a temporary copy rather than df (use df.loc);
#   * the right-hand side is a whole DataFrame slice, not a column of codes — the
#     intended source of the codes is not visible here. TODO: confirm and implement.
recreate_commune_codes = False
if recreate_commune_codes == True:
    df[df["an"] not in [2019, 2020, 2021]]["code_commune"] = df[df["an"] not in [2019, 2020, 2021]]

## Create geohash

In [25]:
# Optional feature: geohash of the accident location. Disabled by default.
# The toggle is deliberately NOT named `geohash`: that name is the imported
# geohash module, and rebinding it to a bool would break geohash.encode() below.
# NOTE(review): the frame printed by df.info() after loading has no 'lat', 'long'
# or 'com' columns, so this branch can only be enabled on a dataset that still
# carries them.
enable_geohash = False


def fill_missing_lat_long(row, latitudes=None, longitudes=None):
    """Return a (latitude, longitude) pair for one row, filling gaps by commune code.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series) with keys "lat", "long" and "com".
    latitudes, longitudes : dict, optional
        Commune code -> coordinate lookups. When omitted, the notebook-level
        `codes_communes_latitudes` / `codes_communes_longitudes` dicts are used.

    Returns
    -------
    tuple
        The row's own coordinates when both are present; otherwise the commune's
        coordinates when row["com"] is a key of both lookups; otherwise the row's
        original (partly missing) values.
    """
    if pd.notnull(row["lat"]) and pd.notnull(row["long"]):
        return row["lat"], row["long"]

    if latitudes is None:
        latitudes = codes_communes_latitudes
    if longitudes is None:
        longitudes = codes_communes_longitudes

    # Look up by the row's own code (the previous version indexed the lookups
    # with the literal string "city_code").
    # NOTE(review): row["com"] must use the same type/format as the lookup keys
    # (strings built from 'code_commune_INSEE') — confirm before enabling.
    city_code = row["com"]
    if city_code in latitudes and city_code in longitudes:
        return latitudes[city_code], longitudes[city_code]
    return row["lat"], row["long"]  # commune unknown: leave the values untouched


def create_geohash(row):
    """Return the geohash of row["lat"] / row["long"], or NaN when it cannot be built.

    NaN is returned when either coordinate is missing or lies outside
    [-90, 90] (latitude) / [-180, 180] (longitude).
    """
    latitude = row["lat"]
    longitude = row["long"]

    if pd.notna(latitude) and pd.notna(longitude):
        if -90 <= latitude <= 90 and -180 <= longitude <= 180:
            return geohash.encode(latitude, longitude)
    return np.nan


if enable_geohash:
    codes_communes = pd.read_csv("./data/codes_communes.csv", sep=",")
    codes_communes["code_commune"] = codes_communes["code_commune_INSEE"].astype(str)

    # One coordinate pair per commune code (first occurrence wins).
    commune_coords = (codes_communes[["code_commune", "latitude", "longitude"]]
                      .dropna()
                      .drop_duplicates(subset="code_commune", keep="first")
                      .set_index("code_commune"))
    # Flat {code: value} dicts; the latitude lookup now really holds latitudes
    # (it used to be built from the 'longitude' column).
    codes_communes_latitudes = commune_coords["latitude"].to_dict()
    codes_communes_longitudes = commune_coords["longitude"].to_dict()

    # Fill coordinates in df itself (not in a detached slice), and only for rows
    # that lack a coordinate and whose commune code is known.
    needs_fill = (df["lat"].isna() | df["long"].isna()) & df["com"].isin(commune_coords.index)
    if needs_fill.any():
        filled = df.loc[needs_fill].apply(fill_missing_lat_long, axis=1, result_type="expand")
        # to_numpy() forces positional assignment; the expanded frame's columns
        # are labelled 0/1 and would not align with "lat"/"long".
        df.loc[needs_fill, ["lat", "long"]] = filled.to_numpy()

    # Compute the geohash once, after the coordinates have been completed.
    df["geohash"] = df.apply(create_geohash, axis=1)
    print(df[["lat", "long", "geohash"]].head())

## Create column "age"

In [26]:
# Age of the person in the year of the accident: accident year minus birth year.
# (The operands were previously reversed, which produced negative ages.)
df["age"] = df["an"] - df["an_nais"]
df = df.drop(columns=["an_nais"])

## Create temporal cyclical features (hour, day, month)

In [27]:
# Step 1: Data Preprocessing
df["hrmn"] = df["hrmn"].astype(str)
df["hrmn"] = df["hrmn"].apply(lambda x: x.zfill(4))
df["heure"] = df["hrmn"].str[:2].astype(int)
df["minute"] = df["hrmn"].str[2:].astype(int)

# Step 3: Sine and Cosine Transformations
max_month = 12
max_day_of_week = 7
max_hour = 24

# Sine and cosine transformations for time of the year (month)
df["month_sin"] = np.sin(2 * np.pi * df["mois"] / max_month)
df["month_cos"] = np.cos(2 * np.pi * df["mois"] / max_month)

# Sine and cosine transformations for time of the week (day of the week)
df["DayOfWeek_sin"] = np.sin(2 * np.pi * df["jour"] / max_day_of_week)
df["DayOfWeek_cos"] = np.cos(2 * np.pi * df["jour"] / max_day_of_week)

# Sine and cosine transformations for time of the day
df["TimeOfDay_sin"] = np.sin(2 * np.pi * (df["heure"] * 60 + df["minute"]) / (max_hour * 60))
df["TimeOfDay_cos"] = np.cos(2 * np.pi * (df["heure"] * 60 + df["minute"]) / (max_hour * 60))

## Create column timestamp

In [28]:
# Assemble a "YYYY-MM-DD HH:MM:00" string per row with vectorised string operations
# (same text as the former row-by-row f-string, without a Python call per row).
# Relies on 'hrmn' already being a zero-padded 4-character string.
df["timestamp_str"] = (
    df["an"].astype(str).str.zfill(4)
    + "-" + df["mois"].astype(str).str.zfill(2)
    + "-" + df["jour"].astype(str).str.zfill(2)
    + " " + df["hrmn"].str[:2]
    + ":" + df["hrmn"].str[2:4]
    + ":00"
)

# Convert the "timestamp_str" column to datetime format
df["timestamp"] = pd.to_datetime(df["timestamp_str"], format="%Y-%m-%d %H:%M:%S")

## Reorder

In [29]:
# Put the rows in chronological order.
df = df.sort_values("timestamp")

## Drop date columns

In [30]:
# The raw date/time columns and the helper timestamp columns are no longer needed
# once the cyclical features exist and the rows are sorted.
df = df.drop(
    ["timestamp_str", "timestamp", "an", "mois", "jour", "heure", "minute", "hrmn"],
    axis=1,
)

## Additional cleaning - general EDA

- Most accidents happen because of a side crash, according to column 'col'
- Most accidents happen in broad daylight, according to column 'lum'
- About a third of accidents happen outside of urban areas, according to column 'agg'
- Most accidents happen outside of crossroads, according to column 'int'
- Most accidents happen in normal atmospheric conditions, according to column 'atm'
- Most accidents happen on small roads (departmental roads or municipal roads), according to column 'catr'
- Most accidents happen on two-way roads, according to column 'nbv'
- The vast majority of accidents happen on roads that don't have a reserved way on the side (e.g. bicycle path), according to column 'vosp'
- The vast majority of accidents happen on flat roads, according to column 'prof'
- The vast majority of accidents happen on straight roads, according to column 'plan'
- One accident out of six happens on wet roads, and the vast majority of the remaining five sixths happen on dry roads, according to column 'surf'
- The vast majority of accidents don't happen on a special or specific infrastructure (e.g. bridge, tunnel), according to column 'infra'
- The vast majority of accidents happen right on the road (not on the sidewalk or bicycle path), according to column 'situ'
- The circulation direction is unknown for most of the observations, and in almost equal ascending and descending order for observations that do have this information, according to column 'senc'.
- Most accidents involve cars ("LV" means light vehicle), according to column 'catv'
- According to columns 'obs' and 'obsm'
- The column 'obs' doesn't apply in most of the observations
- Most of the time, accidents happen when hitting another car. The next most frequent occurrence is an accident without hitting anything, and the third most frequent is when hitting a pedestrian, according to the column 'obsm'
- Most crashes are frontal, according to column 'choc'
- Most accidents happen without a specific manoeuvre or change in direction, according to column 'manv'
- The vast majority of people involved in accidents were the drivers, according to the column 'place'
- About two thirds of people involved in an accident are male, according to column 'sexe'

- About 60% of the accidents lead to no or minor injuries:
    - 42.92% of the people involved in accidents end up uninjured
    - 19.20% of the people involved in accidents end up with minor injuries
    - 35.27% of the people involved in accidents end up with major injuries
    - 2.61% of the people involved in accidents end up killed

After EDA, we will drop the columns listed below:
- The 'senc' column seems uninformative, on top of being an unlikely information to obtain during an emergency call.
- The column 'place' is more detailed than the column 'catu', but the majority of the values provide information that is already contained in 'catu'. We are dropping 'place' and keeping 'catu'.
- We are dropping 'trajet' as it contains too many missing values.
- The column 'circ' is not supposed to have zero values, we are dropping the rows where this is the case.

In [31]:
# Remove the columns ruled out by the general EDA (columns already absent are
# skipped), then discard the rows whose 'circ' value equals 0.
df = df.drop(columns=["senc", "place", "trajet"], errors="ignore")
df = df.loc[df["circ"].ne(0)]

In [32]:
# Write the frame to CSV (without the index) and report the write time and row count.
start = time.time()
df.to_csv("./data/accidents_general_eda_engineered.csv", index=False)
end = time.time()

total_time = end - start
print(f"Wrote cleaned dataset in {round(total_time, 2)}s")
print(f"Dataset has {len(df):,} rows after cleaning.")

Wrote cleaned dataset in 154.14s
Dataset has 2,401,090 rows after cleaning.


## Additional cleaning - fatal vs benign crashes EDA
- Frontal crashes are proportionally overrepresented in fatal accidents compared to benign ones, according to the column 'col'
- Crashes happening during the night without public lighting are proportionally overrepresented in fatal accidents compared to benign ones, according to the column 'lum'
- Most benign crashes (one third) happen within urban centers. On the opposite, about three quarters of the fatal crashes happen outside of urban centers, according to column 'agg'.
- The differentiating power of the type of intersection on gravity seems negligible, according to the column 'int'.
- The differentiating power of the atmospheric conditions on gravity seems negligible, according to the column 'atm'.
- The majority of fatal crashes happen on departmental roads, according to column 'catr'.
- Most fatal accidents happen on one-way roads, according to column 'nbv'.
- Bidirectional roads are slightly more represented in fatal crashes than unidirectional or separated roads, according to column 'circ'.
- The differentiating power due to the presence or absence of special ways (e.g. sidewalk, bicycle path) seems negligible, according to the column 'vosp'.
- Curves on the left or right are slightly more represented in fatal crashes than benign ones, according to column 'plan'.
- The differentiating power of the state of the surface on gravity seems negligible, according to the column 'surf'.
- The differentiating power of the presence or absence of special infrastructure (e.g. bridge, tunnel) on gravity seems negligible, according to the column 'infra'.
- Accidents located on the road shoulder are proportionally overrepresented in fatal accidents compared to benign ones, according to column 'situ'.
- Motorcycles, bicycles and mopeds are proportionally overrepresented in fatal accidents compared to benign ones, according to column 'catv'.
- Although most fatal crashes don't involve an obstacle, trees, ditches, embankments, rock faces, buildings, walls, poles and bridge columns are types of obstacles that are proportionally overrepresented in fatal accidents compared to benign ones, according to column 'obs'. The vast majority of accidents leading to no injuries don't involve an obstacle at all.
- Column 'obsm' is hard to interpret. It doesn't seem to bring any differentiating power, so we will drop it.
- Frontal collisions are proportionally overrepresented in fatal accidents compared to benign ones, according to column 'choc'.
- Swerving left is proportionally overrepresented in fatal accidents compared to benign ones, according to column 'manv'. This makes sense as in France people drive on the right side of the road, so to overtake (when you're usually speeding with limited visibility), you have to swerve left. On highways, left lanes are also more likely to have faster vehicles than right lanes, so there is a higher chance to be hit behind or on the side when moving into the left lane.
- Pedestrians collisions are proportionally overrepresented in fatal accidents compared to benign ones, according to column 'catu'. This makes sense as pedestrians don't have a passenger compartment to absorb collisions and protect them.
- Men are slightly proportionally overrepresented in fatal accidents compared to benign ones, according to column 'sexe'. It might be due to chance, but due to the size of our dataset, we doubt it and we do think this column has some differentiating power.
- Older people are proportionally overrepresented in fatal accidents compared to benign ones, according to column 'age'. Younger people might be less experienced, but age being correlated with general health, it is not completely surprising to see that older people are less resistant to serious car crashes than younger ones.
- It is hard to deduce anything visually from the cyclical feature graphs alone.

In [33]:
# Drop the columns the fatal-vs-benign EDA found uninformative
# (columns already absent are skipped).
df = df.drop(
    columns=["int", "atm", "vosp", "surf", "infra", "obsm"],
    errors="ignore",
)

## Save

In [34]:
# Write the ordinal-target dataset to CSV (without the index) and report the write time.
start = time.time()
df.to_csv("./data/accidents_engineered_ordinal.csv", index=False)
end = time.time()

total_time = end - start
print(f"Wrote engineered ordinal dataset in {round(total_time, 2)}s")

Wrote engineered ordinal dataset in 135.22s


In [35]:
# Percentage of missing values in each remaining column.
df.isna().sum().div(len(df)).mul(100)

col              0.157387
lum              0.000000
agg              0.000000
catr             0.000083
circ             0.952526
nbv              0.250511
prof             0.184583
plan             0.223898
situ             0.478658
catv             0.000000
obs              0.047353
choc             0.018533
manv             0.021074
catu             0.000000
sexe             0.000000
grav             0.000000
age              0.220358
month_sin        0.000000
month_cos        0.000000
DayOfWeek_sin    0.000000
DayOfWeek_cos    0.000000
TimeOfDay_sin    0.000000
TimeOfDay_cos    0.000000
dtype: float64

## Save binary

In [36]:
# Collapse the ordinal target produced by the earlier encoding cell
# (1 = uninjured, 2 = minor, 3 = serious, 4 = killed, per that cell's labels)
# into two classes: ranks 1-2 -> 0, ranks 3-4 -> 1.
# Caveat: this overwrites df["grav"] and is not idempotent — running the cell a
# second time maps the existing 0/1 values again (0 is not a key, so it becomes NaN).
mapping = "binary"
if mapping == "binary":
    binary_mapping = {1: 0,
                      2: 0,
                      3: 1,
                      4: 1}

    # Store the result as an unordered categorical; categories are inferred from
    # the mapped values (the unused `grav_order` list was dropped).
    df["grav"] = pd.Categorical(df["grav"].map(binary_mapping))

In [37]:
# Write the binary-target dataset to CSV (without the index) and report the write time.
start = time.time()
df.to_csv("./data/accidents_engineered_binary.csv", index=False)
end = time.time()

total_time = end - start
print(f"Wrote engineered binary dataset in {round(total_time, 2)}s")

Wrote engineered binary dataset in 136.07s


In [38]:
# Stamp the notebook with the local wall-clock time of this run.
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
print("Last run at " + current_time)

Last run at 02:47:32
