In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
import json
with open('../data/processed/feature_decoding.json', 'r') as f:
    features_decoding = json.load(f)
features_decoding

[{'circumstances': {'1': 'Traffic line during the accident',
   '2': 'First',
   '3': 'Second',
   '4': 'Third',
   '5': 'Fourth',
   '6': 'Fifth',
   '7': 'Sixth',
   '8': 'Oncoming traffic',
   '9': 'Roadside',
   '10': 'Action during the accident',
   '11': 'Beginning of traffic',
   '12': 'Direct traffic',
   '13': 'Rearrangement to right',
   '14': 'Rearrangement to left',
   '15': 'Steady condition',
   '16': 'Turn to right',
   '17': 'Backward motion',
   '18': 'Overtaking',
   '19': 'Breaking',
   '20': 'Turn to left',
   '21': 'Reverse',
   '22': 'Detour',
   '23': 'Parking',
   '24': 'Light signals during the accident',
   '25': 'Marker lights',
   '26': 'Alarm signal',
   '27': 'Fog lights',
   '28': 'Long',
   '29': 'Right blinker',
   '30': 'Short',
   '31': 'Left blinker',
   '32': 'Traffic light during the accident (used in traffic accidents at regulated crossroads)',
   '33': 'Red',
   '34': 'Red and yellow',
   '35': 'Green',
   '36': 'Yellow',
   '37': 'Not working',


In [3]:
df = pd.read_csv('../data/processed/data_bivac_filtered.csv')

# data preprocessing
cat_features = df.columns[df.columns.str.contains('id')]
for cat_col in cat_features:
    df[cat_col] = df[cat_col].astype(str)
df.accident_date = pd.to_datetime(df.accident_date)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18444 entries, 0 to 18443
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype                                
---  ------               --------------  -----                                
 0   compensation_status  18444 non-null  object                               
 1   accident_id          18444 non-null  object                               
 2   vehicle_id           18444 non-null  object                               
 3   fd_decision          18444 non-null  object                               
 4   circumstances        15657 non-null  object                               
 5   road_signs           110 non-null    object                               
 6   road_surface         18444 non-null  object                               
 7   visibility           18444 non-null  object                               
 8   road_conditions      18444 non-null  object                               
 9   accide

In [5]:
df.head()

Unnamed: 0,compensation_status,accident_id,vehicle_id,fd_decision,circumstances,road_signs,road_surface,visibility,road_conditions,accident_date,address_id,circumstances_id,road_signs_id,road_surface_id,visibility_id,road_conditions_id
0,VALIDATED_EIF,16141,26245.0,NG,,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,,,1.0,2.0,1.0
1,VALIDATED_EIF,16141,26244.0,FG,First,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,2.0,,1.0,2.0,1.0
2,VALIDATED_EIF,16141,26244.0,FG,Direct traffic,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,12.0,,1.0,2.0,1.0
3,VALIDATED_EIF,16141,26244.0,FG,Green,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,35.0,,1.0,2.0,1.0
4,VALIDATED_EIF,16169,26324.0,NG,First,,Asphalt-concrete,Clear day,Wet,2020-02-01 18:00:00+04:00,141548,2.0,,1.0,1.0,2.0


In [6]:
print(f"Number of accidents: {df.accident_id.nunique()}")
print(f"Period: {df.accident_date.dt.date.min()} - {df.accident_date.dt.date.max()}")

Number of accidents: 4217
Period: 2018-01-11 - 2021-07-19


In [7]:
cat_features = ['circumstances', 'road_signs', 'road_surface', 'visibility', 'road_conditions']

num_cardinality = 0
dummy_col_names = []
for feature in cat_features:
    l_unique_vals = df[feature].unique()
    n_unique_vals = len(l_unique_vals)
    l_range_nunique = [i for i in range(n_unique_vals)] # noqa

    num_cardinality += n_unique_vals
    l_ind_value_name = [f"{feature}_{i}" for i in l_unique_vals]
    dummy_col_names = dummy_col_names + l_ind_value_name


print(f"Cardinality: {num_cardinality}")

Cardinality: 67


In [8]:
# One hot encoding
df_prep = pd.get_dummies(df, columns=cat_features + ['fd_decision'])

# daytime features
df_prep['hour'] = df_prep.accident_date.dt.hour
df_prep['day_of_month'] = df_prep.accident_date.dt.day
df_prep['month'] = df_prep.accident_date.dt.month
df_prep['year'] = df_prep.accident_date.dt.year

cols_to_drop = [f"{col}_id" for col in cat_features] + \
               [
    'compensation_status',
    'address_id',
    'accident_date',
    'accident_id'
               ]

df_prep = df_prep.drop(cols_to_drop, axis=1).set_index('vehicle_id')
print(df_prep.shape)
df_prep.head()

(18444, 71)


Unnamed: 0_level_0,circumstances_Alarm signal,circumstances_Backward motion,circumstances_Beginning of traffic,circumstances_Breaking,circumstances_Detour,circumstances_Direct traffic,circumstances_Fifth,circumstances_First,circumstances_Fog lights,circumstances_Fourth,...,road_conditions_Ice,road_conditions_Other,road_conditions_Snow,road_conditions_Wet,fd_decision_FG,fd_decision_NG,hour,day_of_month,month,year
vehicle_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26245.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,2,2020
26244.0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,1,1,2,2020
26244.0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,1,2,2020
26244.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,2,2020
26324.0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,18,1,2,2020


## Model trials

In [10]:
print(df_prep.shape)
print(df_prep.reset_index().vehicle_id.nunique())

(18444, 71)
9011


In [11]:
df_prep

Unnamed: 0_level_0,circumstances_Alarm signal,circumstances_Backward motion,circumstances_Beginning of traffic,circumstances_Breaking,circumstances_Detour,circumstances_Direct traffic,circumstances_Fifth,circumstances_First,circumstances_Fog lights,circumstances_Fourth,...,road_conditions_Ice,road_conditions_Other,road_conditions_Snow,road_conditions_Wet,fd_decision_FG,fd_decision_NG,hour,day_of_month,month,year
vehicle_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26245.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,2,2020
26244.0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,1,1,2,2020
26244.0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,1,2,2020
26244.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,2,2020
26324.0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,18,1,2,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566531.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,19,7,2021
566529.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,19,7,2021
566529.0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,19,7,2021
566529.0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,19,7,2021


In [12]:
a = df_prep.reset_index()
a[a.vehicle_id == '26244.0']

Unnamed: 0,vehicle_id,circumstances_Alarm signal,circumstances_Backward motion,circumstances_Beginning of traffic,circumstances_Breaking,circumstances_Detour,circumstances_Direct traffic,circumstances_Fifth,circumstances_First,circumstances_Fog lights,...,road_conditions_Ice,road_conditions_Other,road_conditions_Snow,road_conditions_Wet,fd_decision_FG,fd_decision_NG,hour,day_of_month,month,year
1,26244.0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,1,1,2,2020
2,26244.0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,1,1,2,2020
3,26244.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,2,2020


In [39]:
from sklearn.tree import DecisionTreeClassifier


X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

Unnamed: 0,accident_id,vehicle_id,accident_date,circumstances_Alarm signal,circumstances_Backward motion,circumstances_Beginning of traffic,circumstances_Breaking,circumstances_Detour,circumstances_Direct traffic,circumstances_Fifth,...,road_conditions_Ice,road_conditions_Other,road_conditions_Snow,road_conditions_Wet,fd_decision_FG,fd_decision_NG,hour,day_of_month,month,year
0,16141,26245.0,2020-02-01 01:30:00+04:00,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,2,2020
1,16141,26244.0,2020-02-01 01:30:00+04:00,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,2,2020
2,16141,26244.0,2020-02-01 01:30:00+04:00,0,0,0,0,0,1,0,...,0,0,0,0,1,0,1,1,2,2020
3,16141,26244.0,2020-02-01 01:30:00+04:00,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,2,2020
4,16169,26324.0,2020-02-01 18:00:00+04:00,0,0,0,0,0,0,0,...,0,0,0,1,0,1,18,1,2,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18439,188892,566531.0,2021-07-19 01:40:00+04:00,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,19,7,2021
18440,188892,566529.0,2021-07-19 01:40:00+04:00,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,19,7,2021
18441,188892,566529.0,2021-07-19 01:40:00+04:00,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,19,7,2021
18442,188892,566529.0,2021-07-19 01:40:00+04:00,0,1,0,0,0,0,0,...,0,0,0,0,1,0,1,19,7,2021


In [32]:
df_prep_dummy.compensation_status.value_counts()

VALIDATED_EIF    18444
Name: compensation_status, dtype: int64