In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('../data/processed/data_bivac_filtered.csv')

# data preprocessing
cat_features = df.columns[df.columns.str.contains('id')]
for cat_col in cat_features:
    df[cat_col] = df[cat_col].astype(str)
df.accident_date = pd.to_datetime(df.accident_date)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18444 entries, 0 to 18443
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype                                
---  ------               --------------  -----                                
 0   compensation_status  18444 non-null  object                               
 1   accident_id          18444 non-null  object                               
 2   vehicle_id           18444 non-null  object                               
 3   fd_decision          18444 non-null  object                               
 4   circumstances        15657 non-null  object                               
 5   road_signs           110 non-null    object                               
 6   road_surface         18444 non-null  object                               
 7   visibility           18444 non-null  object                               
 8   road_conditions      18444 non-null  object                               
 9   accide

In [5]:
df.head()

Unnamed: 0,compensation_status,accident_id,vehicle_id,fd_decision,circumstances,road_signs,road_surface,visibility,road_conditions,accident_date,address_id,circumstances_id,road_signs_id,road_surface_id,visibility_id,road_conditions_id
0,VALIDATED_EIF,16141,26245.0,NG,,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,,,1.0,2.0,1.0
1,VALIDATED_EIF,16141,26244.0,FG,First,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,2.0,,1.0,2.0,1.0
2,VALIDATED_EIF,16141,26244.0,FG,Direct traffic,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,12.0,,1.0,2.0,1.0
3,VALIDATED_EIF,16141,26244.0,FG,Green,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,35.0,,1.0,2.0,1.0
4,VALIDATED_EIF,16169,26324.0,NG,First,,Asphalt-concrete,Clear day,Wet,2020-02-01 18:00:00+04:00,141548,2.0,,1.0,1.0,2.0


In [6]:
print(f"Number of accidents: {df.accident_id.nunique()}")
print(f"Period: {df.accident_date.dt.date.min()} - {df.accident_date.dt.date.max()}")

Number of accidents: 4217
Period: 2018-01-11 - 2021-07-19


In [7]:
cat_features = ['circumstances', 'road_signs', 'road_surface', 'visibility', 'road_conditions']

num_cardinality = 0
dummy_col_names = []
for feature in cat_features:
    l_unique_vals = df[feature].unique()
    n_unique_vals = len(l_unique_vals)
    l_range_nunique = [i for i in range(n_unique_vals)] # noqa

    num_cardinality += n_unique_vals
    l_ind_value_name = [f"{feature}_{i}" for i in l_unique_vals]
    dummy_col_names = dummy_col_names + l_ind_value_name


print(f"Cardinality: {num_cardinality}")

Cardinality: 67


In [8]:
# One hot encoding
df_prep = pd.get_dummies(df, columns=cat_features + ['fd_decision'])

# daytime features
df_prep['hour'] = df_prep.accident_date.dt.hour
df_prep['day_of_month'] = df_prep.accident_date.dt.day
df_prep['month'] = df_prep.accident_date.dt.month
df_prep['year'] = df_prep.accident_date.dt.year

cols_to_drop = [f"{col}_id" for col in cat_features] + \
               [
    'compensation_status',
    'address_id',
    'accident_date',
    'accident_id'
               ]

df_prep = df_prep.drop(cols_to_drop, axis=1).set_index('vehicle_id')
print(df_prep.shape)
df_prep.head()

(18444, 71)


Unnamed: 0_level_0,circumstances_Alarm signal,circumstances_Backward motion,circumstances_Beginning of traffic,circumstances_Breaking,circumstances_Detour,circumstances_Direct traffic,circumstances_Fifth,circumstances_First,circumstances_Fog lights,circumstances_Fourth,...,road_conditions_Ice,road_conditions_Other,road_conditions_Snow,road_conditions_Wet,fd_decision_FG,fd_decision_NG,hour,day_of_month,month,year
vehicle_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26245.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,2,2020
26244.0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,1,1,2,2020
26244.0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,1,2,2020
26244.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,2,2020
26324.0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,18,1,2,2020


## Model trials

In [9]:
print(df_prep.shape)
print(df_prep.reset_index().vehicle_id.nunique())

(18444, 71)
9011


In [10]:
df_prep

Unnamed: 0_level_0,circumstances_Alarm signal,circumstances_Backward motion,circumstances_Beginning of traffic,circumstances_Breaking,circumstances_Detour,circumstances_Direct traffic,circumstances_Fifth,circumstances_First,circumstances_Fog lights,circumstances_Fourth,...,road_conditions_Ice,road_conditions_Other,road_conditions_Snow,road_conditions_Wet,fd_decision_FG,fd_decision_NG,hour,day_of_month,month,year
vehicle_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26245.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,2,2020
26244.0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,1,1,2,2020
26244.0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,1,2,2020
26244.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,2,2020
26324.0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,18,1,2,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566531.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,19,7,2021
566529.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,19,7,2021
566529.0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,19,7,2021
566529.0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,19,7,2021


In [11]:
a = df_prep.reset_index()
a[a.vehicle_id == '26244.0']


Unnamed: 0,vehicle_id,circumstances_Alarm signal,circumstances_Backward motion,circumstances_Beginning of traffic,circumstances_Breaking,circumstances_Detour,circumstances_Direct traffic,circumstances_Fifth,circumstances_First,circumstances_Fog lights,...,road_conditions_Ice,road_conditions_Other,road_conditions_Snow,road_conditions_Wet,fd_decision_FG,fd_decision_NG,hour,day_of_month,month,year
1,26244.0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,1,1,2,2020
2,26244.0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,1,1,2,2020
3,26244.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,2,2020


In [13]:
from random import shuffle
l_unique_vehicles = list(df_prep.index.unique())
shuffle(l_unique_vehicles)

df_train = df_prep[df_prep.index.isin(l_unique_vehicles[:int(len(l_unique_vehicles)*0.7)])]
df_val = df_prep[df_prep.index.isin(l_unique_vehicles[int(len(l_unique_vehicles)*0.7):int(len(l_unique_vehicles)*0.8)])]
df_test = df_prep[df_prep.index.isin(l_unique_vehicles[int(len(l_unique_vehicles)*0.8):])]

x_train, y_train = df_train.drop('fd_decision_FG')

print(f"N Total vehicles: {df_prep.index.nunique()}")
print(f"N vehicles in train: {df_train.index.nunique()}")
print(f"N vehicles in test: {df_test.index.nunique()}")
print(f"N vehicles in val: {df_val.index.nunique()}")

print(f"Checkings!!!!")
print(df_prep.index.nunique() == df_train.index.nunique() + df_test.index.nunique() + df_val.index.nunique())
print(set(df_train.index).intersection(set(df_val.index)) == set())
print(set(df_train.index).intersection(set(df_test.index)) == set())
print(set(df_test.index).intersection(set(df_val.index)) == set())

N Total vehicles: 9011
N vehicles in train: 6307
N vehicles in test: 1803
N vehicles in val: 901
Checkings!!!!
True
True
True
True


In [7]:
df_fault = pd.read_excel('../data/raw/fault.xlsx')

In [8]:
df_fault.accident_vehicle_id = df_fault.accident_vehicle_id.astype('str')
df_fault.accident_id = df_fault.accident_id.astype('str')

In [9]:
df_fault

Unnamed: 0,accident_vehicle_id,accident_id,type,case
0,34467,16141,FP,FG
1,34468,16141,FP,FG
2,34521,16169,FP,FG
3,34522,16169,FP,NG
4,34534,16176,FP,FG
...,...,...,...,...
2488,127993,60624,FA,FG
2489,128049,60649,FA,FG
2490,128050,60649,FA,NG
2491,128190,60717,FP,FG


In [14]:
df_fault_g = df_fault.set_index(['accident_vehicle_id', 'accident_id', 'type'])['case'].unstack().reset_index()

In [16]:
df_fault_g

type,accident_vehicle_id,accident_id,FA,FP,FS
0,100003,47319,,NG,
1,100004,47319,,FG,
2,100025,47330,,,NG
3,100026,47330,,,FG
4,100052,47343,,,NG
...,...,...,...,...,...
2488,99851,47248,,FG,
2489,99871,47258,,FG,
2490,99872,47258,,NG,
2491,99932,47288,,FG,


In [25]:
df_fault_g[df_fault_g.accident_id=='16141']

type,accident_vehicle_id,accident_id,FA,FP,FS
807,34467,16141,,FG,
808,34468,16141,,FG,


In [24]:
df[df.accident_id=='16141']

Unnamed: 0,compensation_status,accident_id,vehicle_id,fd_decision,circumstances,road_signs,road_surface,visibility,road_conditions,accident_date,address_id,circumstances_id,road_signs_id,road_surface_id,visibility_id,road_conditions_id
0,VALIDATED_EIF,16141,26245.0,NG,,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,,,1.0,2.0,1.0
1,VALIDATED_EIF,16141,26244.0,FG,First,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,2.0,,1.0,2.0,1.0
2,VALIDATED_EIF,16141,26244.0,FG,Direct traffic,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,12.0,,1.0,2.0,1.0
3,VALIDATED_EIF,16141,26244.0,FG,Green,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,35.0,,1.0,2.0,1.0


In [22]:
df_fault_g.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2493 entries, 0 to 2492
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   accident_vehicle_id  2493 non-null   object
 1   accident_id          2493 non-null   object
 2   FA                   51 non-null     object
 3   FP                   2087 non-null   object
 4   FS                   355 non-null    object
dtypes: object(5)
memory usage: 97.5+ KB


In [29]:
df.vehicle_id = df.vehicle_id.str.replace('.0', '')
df

  df.vehicle_id = df.vehicle_id.str.replace('.0', '')


Unnamed: 0,compensation_status,accident_id,vehicle_id,fd_decision,circumstances,road_signs,road_surface,visibility,road_conditions,accident_date,address_id,circumstances_id,road_signs_id,road_surface_id,visibility_id,road_conditions_id
0,VALIDATED_EIF,16141,26245,NG,,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,,,1.0,2.0,1.0
1,VALIDATED_EIF,16141,26244,FG,First,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,2.0,,1.0,2.0,1.0
2,VALIDATED_EIF,16141,26244,FG,Direct traffic,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,12.0,,1.0,2.0,1.0
3,VALIDATED_EIF,16141,26244,FG,Green,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,35.0,,1.0,2.0,1.0
4,VALIDATED_EIF,16169,26324,NG,First,,Asphalt-concrete,Clear day,Wet,2020-02-01 18:00:00+04:00,141548,2.0,,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18439,VALIDATED_EIF,188892,566531,NG,,,Asphalt-concrete,Clear day,Dry,2021-07-19 01:40:00+04:00,1968163,,,1.0,1.0,1.0
18440,VALIDATED_EIF,188892,566529,FG,Roadside,,Asphalt-concrete,Clear day,Dry,2021-07-19 01:40:00+04:00,1968163,9.0,,1.0,1.0,1.0
18441,VALIDATED_EIF,188892,566529,FG,Beginning of traffic,,Asphalt-concrete,Clear day,Dry,2021-07-19 01:40:00+04:00,1968163,11.0,,1.0,1.0,1.0
18442,VALIDATED_EIF,188892,566529,FG,Backward motion,,Asphalt-concrete,Clear day,Dry,2021-07-19 01:40:00+04:00,1968163,17.0,,1.0,1.0,1.0


In [35]:
df.accident_date = df.accident_date.dt.date

In [36]:
df.merge(df_fault_g, on='accident_id', how='outer').to_excel('../data/processed/three_types_merged.xlsx', index=False)

In [38]:
df_fault_g.accident_id.nunique()

1173

In [None]:
df_fault.

In [39]:
df_fault_g.accident_id.nunique()/df.accident_id.nunique()

0.2781598292625089

In [39]:
from sklearn.tree import DecisionTreeClassifier


X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

Unnamed: 0,accident_id,vehicle_id,accident_date,circumstances_Alarm signal,circumstances_Backward motion,circumstances_Beginning of traffic,circumstances_Breaking,circumstances_Detour,circumstances_Direct traffic,circumstances_Fifth,...,road_conditions_Ice,road_conditions_Other,road_conditions_Snow,road_conditions_Wet,fd_decision_FG,fd_decision_NG,hour,day_of_month,month,year
0,16141,26245.0,2020-02-01 01:30:00+04:00,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,2,2020
1,16141,26244.0,2020-02-01 01:30:00+04:00,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,2,2020
2,16141,26244.0,2020-02-01 01:30:00+04:00,0,0,0,0,0,1,0,...,0,0,0,0,1,0,1,1,2,2020
3,16141,26244.0,2020-02-01 01:30:00+04:00,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,2,2020
4,16169,26324.0,2020-02-01 18:00:00+04:00,0,0,0,0,0,0,0,...,0,0,0,1,0,1,18,1,2,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18439,188892,566531.0,2021-07-19 01:40:00+04:00,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,19,7,2021
18440,188892,566529.0,2021-07-19 01:40:00+04:00,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,19,7,2021
18441,188892,566529.0,2021-07-19 01:40:00+04:00,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,19,7,2021
18442,188892,566529.0,2021-07-19 01:40:00+04:00,0,1,0,0,0,0,0,...,0,0,0,0,1,0,1,19,7,2021


In [32]:
df_prep_dummy.compensation_status.value_counts()

VALIDATED_EIF    18444
Name: compensation_status, dtype: int64