## Data Preparation stage

##### Join raw accident data with fault DB, update parameters

In [1]:
!export PYTHONPATH=$PYTHONPATH:~/src

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
# from src.data.data_prep import convert_cols_type

In [3]:
def convert_cols_type(df: pd.DataFrame, features_l: list, type_: str):
    for feature in features_l:
        df[feature] = df[feature].astype(type_)
    return df

In [4]:
cat_features = [
    'circumstances',
    'road_signs',
    'road_surface',
    'visibility',
    'road_conditions'
]

In [5]:
df_fil = pd.read_csv('../data/interim/data_bivac_filtered.csv')
print(f"Data shape: {df_fil.shape}")
print(f"N accidents after filtering: {df_fil.accident_id.nunique()}")

Data shape: (18444, 11)
N accidents after filtering: 4217


In [6]:
df_fau = pd.read_excel('../data/raw/fault with param.xlsx')
print(f"Data shape: {df_fau.shape}")
print(f"N accidents after in Fault DB: {df_fau.accident_id.nunique()}")

Data shape: (6252, 9)
N accidents after in Fault DB: 1173


In [7]:
cat_features_fil = [
    'accident_id', 
    'vehicle_id', 
    'circumstances', 
    'road_signs',
    'road_surface', 
    'visibility', 
    'road_conditions',
    'address_id'
    ]

df_fil.accident_date = pd.to_datetime(df_fil.accident_date)

df_fil = convert_cols_type(
    df=df_fil,
    features_l=cat_features_fil,
    type_='str'
)

In [8]:
cat_features_fau = [
    'accident_id', 
    'accident_vehicle_id', 
    'circumstances', 
    'road_signs',
    'road_surface', 
    'visibility', 
    'road_conditions'
    ]

df_fau = convert_cols_type(
    df=df_fau,
    features_l=cat_features_fau,
    type_='str'
)

In [9]:
print(f"{df_fau.accident_id.nunique()} accidents in Fault DB")
print(f"All accidents from Fault DB are in our filtered data: {df_fil[df_fil.accident_id.isin(df_fau.accident_id.unique())].accident_id.unique() in df_fau.accident_id.unique()}")

1173 accidents in Fault DB
All accidents from Fault DB are in our filtered data: True


In [10]:
df_fau.type.value_counts()

FP    5246
FS     868
FA     138
Name: type, dtype: int64

In [11]:
df_fau.columns

Index(['accident_vehicle_id', 'accident_id', 'type', 'case', 'circumstances',
       'road_signs', 'road_surface', 'visibility', 'road_conditions'],
      dtype='object')

In [12]:
df_fau[df_fau.accident_id=="16141"]

Unnamed: 0,accident_vehicle_id,accident_id,type,case,circumstances,road_signs,road_surface,visibility,road_conditions
0,34467,16141,FP,FG,2.0,,,,
1,34467,16141,FP,FG,12.0,,,,
2,34468,16141,FP,FG,2.0,,,,
3,34468,16141,FP,FG,12.0,,,,


In [13]:
df_fil[df_fil.accident_id=="16141"]

Unnamed: 0,compensation_status,accident_id,vehicle_id,fd_decision,circumstances,road_signs,road_surface,visibility,road_conditions,accident_date,address_id
0,VALIDATED_EIF,16141,26245.0,NG,,,1.0,2.0,1.0,2020-02-01 01:30:00+04:00,141147
1,VALIDATED_EIF,16141,26244.0,FG,2.0,,1.0,2.0,1.0,2020-02-01 01:30:00+04:00,141147
2,VALIDATED_EIF,16141,26244.0,FG,12.0,,1.0,2.0,1.0,2020-02-01 01:30:00+04:00,141147
3,VALIDATED_EIF,16141,26244.0,FG,35.0,,1.0,2.0,1.0,2020-02-01 01:30:00+04:00,141147


In [14]:
nan_features_vehicles = []
for vehicle in tqdm(df_fau.accident_vehicle_id.unique()):
    if (df_fau[df_fau.accident_vehicle_id==vehicle][['road_surface', 'visibility', 'road_conditions']]=='nan').all().all():
        nan_features_vehicles.append(vehicle)
        
nan_features_accidents = list(df_fau[df_fau.accident_vehicle_id.isin(nan_features_vehicles)].accident_id.unique())

100%|█████████████████████████████████████| 2493/2493 [00:02<00:00, 1166.24it/s]


In [15]:
df_fil[df_fil.accident_id.isin(nan_features_accidents)][['road_signs', 'road_surface', 'visibility', 'road_conditions']]

Unnamed: 0,road_signs,road_surface,visibility,road_conditions
0,,1.0,2.0,1.0
1,,1.0,2.0,1.0
2,,1.0,2.0,1.0
3,,1.0,2.0,1.0
12,,1.0,1.0,1.0
...,...,...,...,...
6718,,1.0,1.0,1.0
6719,,1.0,1.0,1.0
6720,,1.0,1.0,1.0
6721,,1.0,1.0,1.0


In [16]:
print(f'We have {len(nan_features_accidents)} accidents that dont have features info in fault db but have in initial table')

We have 854 accidents that dont have features info in fault db but have in initial table


In [17]:
len(nan_features_accidents)

854

In [18]:
# I'm filterinf out those accidnets

df_fau = df_fau[~df_fau.accident_id.isin(nan_features_accidents)]

print(f"{df_fau.accident_id.nunique()} accidents from Fault DB are being considered")

319 accidents from Fault DB are being considered


In [19]:
# getting the last fg-ng status per vehicle from fault db

df_fau_p = df_fau.pivot_table(['case',
                                ], ['accident_vehicle_id', 'accident_id', 'circumstances',
                                'road_signs',
                                'road_surface',
                                'visibility',
                                'road_conditions'], 'type', aggfunc='first').reset_index()
df_fau_p.columns = df_fau_p.columns.map(''.join)
df_fau_p.caseFA = df_fau_p.caseFA.fillna(df_fau_p.caseFP)
df_fau_p.caseFS = df_fau_p.caseFS.fillna(df_fau_p.caseFA)

df_fau_p['fd_decision'] = df_fau_p.caseFS

df_fau_p

Unnamed: 0,accident_vehicle_id,accident_id,circumstances,road_signs,road_surface,visibility,road_conditions,caseFA,caseFP,caseFS,fd_decision
0,100569,47590,12.0,,1.0,2.0,1.0,FG,FG,FG,FG
1,100569,47590,15.0,,1.0,2.0,1.0,FG,FG,FG,FG
2,100569,47590,2.0,,1.0,2.0,1.0,FG,FG,FG,FG
3,100569,47590,21.0,,1.0,2.0,1.0,FG,FG,FG,FG
4,100569,47590,25.0,,1.0,2.0,1.0,FG,FG,FG,FG
...,...,...,...,...,...,...,...,...,...,...,...
2126,99933,47288,35.0,163.0,1.0,1.0,1.0,NG,NG,NG,NG
2127,99933,47288,35.0,31.0,1.0,1.0,1.0,NG,NG,NG,NG
2128,99933,47288,4.0,13.0,1.0,1.0,1.0,NG,NG,NG,NG
2129,99933,47288,4.0,163.0,1.0,1.0,1.0,NG,NG,NG,NG


In [20]:
print(f"There are {df_fau_p[pd.isna(df_fau_p.caseFS)].accident_id.nunique()} accidents in Fault DB with unknown FG/NG status")

There are 0 accidents in Fault DB with unknown FG/NG status


In [21]:
df_faulted_accidents = df_fil[df_fil.accident_id.isin(df_fau_p.accident_id.unique())]
dict_acc_date = dict(zip(df_faulted_accidents.accident_id, df_faulted_accidents.accident_date))

df_fau_p['accident_date'] = df_fau_p.accident_id.map(dict_acc_date)
df_fau_p = df_fau_p.rename(columns={'accident_vehicle_id': 'vehicle_id'})

df = pd.concat([
    df_fil[~df_fil.accident_id.isin(df_fau_p.accident_id.unique())], 
    df_fau_p[['accident_id', 'vehicle_id', 'fd_decision',
              'circumstances', 'road_signs', 'road_surface', 'visibility',
              'road_conditions', 'accident_date']]])

df = df.sort_values(by=['accident_date', 'accident_id'])
df

Unnamed: 0,compensation_status,accident_id,vehicle_id,fd_decision,circumstances,road_signs,road_surface,visibility,road_conditions,accident_date,address_id
11123,VALIDATED_EIF,84752,138906.0,FG,2.0,,5.0,1.0,1.0,2018-01-11 08:10:00+04:00,754074
11124,VALIDATED_EIF,84752,138906.0,FG,12.0,,5.0,1.0,1.0,2018-01-11 08:10:00+04:00,754074
13205,VALIDATED_EIF,101310,168010.0,FG,4.0,,1.0,1.0,1.0,2018-06-21 12:30:00+04:00,910393
13206,VALIDATED_EIF,101310,168010.0,FG,13.0,,1.0,1.0,1.0,2018-06-21 12:30:00+04:00,910393
13207,VALIDATED_EIF,101310,168010.0,FG,25.0,,1.0,1.0,1.0,2018-06-21 12:30:00+04:00,910393
...,...,...,...,...,...,...,...,...,...,...,...
18439,VALIDATED_EIF,188892,566531.0,NG,,,1.0,1.0,1.0,2021-07-19 01:40:00+04:00,1968163
18440,VALIDATED_EIF,188892,566529.0,FG,9.0,,1.0,1.0,1.0,2021-07-19 01:40:00+04:00,1968163
18441,VALIDATED_EIF,188892,566529.0,FG,11.0,,1.0,1.0,1.0,2021-07-19 01:40:00+04:00,1968163
18442,VALIDATED_EIF,188892,566529.0,FG,17.0,,1.0,1.0,1.0,2021-07-19 01:40:00+04:00,1968163


In [22]:
df.accident_id.value_counts()

16310    28
41262    26
39311    24
32270    24
59635    24
         ..
93689     2
92843     2
83297     2
74967     2
84752     2
Name: accident_id, Length: 4217, dtype: int64

In [23]:
df[df.accident_id=='59635'].accident_date.nunique()

# so there's no accident id for diffent dates

1

In [24]:
df.fd_decision.isna().any()

# there's no unknown fd decision

False

In [25]:
df.shape

(19091, 11)

In [26]:
from random import randint

def random_with_N_digits(n):
    range_start = 10**(n-1)
    range_end = (10**n)-1
    return randint(range_start, range_end)

In [27]:
for accident in tqdm(df.accident_id.unique()):
    for vehicle in df[df.accident_id==accident].vehicle_id.unique():
        current_v_id = df[(df.accident_id==accident) & (df.vehicle_id==vehicle)].vehicle_id.unique()[0]
        new_v_id = f"v_{random_with_N_digits(6)}"
        df.loc[(df.accident_id==accident) & 
               (df.vehicle_id==vehicle), 'vehicle_id'] = new_v_id

100%|███████████████████████████████████████| 4217/4217 [00:43<00:00, 96.73it/s]


In [28]:
df

Unnamed: 0,compensation_status,accident_id,vehicle_id,fd_decision,circumstances,road_signs,road_surface,visibility,road_conditions,accident_date,address_id
11123,VALIDATED_EIF,84752,v_496172,FG,2.0,,5.0,1.0,1.0,2018-01-11 08:10:00+04:00,754074
11124,VALIDATED_EIF,84752,v_496172,FG,12.0,,5.0,1.0,1.0,2018-01-11 08:10:00+04:00,754074
13205,VALIDATED_EIF,101310,v_479807,FG,4.0,,1.0,1.0,1.0,2018-06-21 12:30:00+04:00,910393
13206,VALIDATED_EIF,101310,v_479807,FG,13.0,,1.0,1.0,1.0,2018-06-21 12:30:00+04:00,910393
13207,VALIDATED_EIF,101310,v_479807,FG,25.0,,1.0,1.0,1.0,2018-06-21 12:30:00+04:00,910393
...,...,...,...,...,...,...,...,...,...,...,...
18439,VALIDATED_EIF,188892,v_295944,NG,,,1.0,1.0,1.0,2021-07-19 01:40:00+04:00,1968163
18440,VALIDATED_EIF,188892,v_307092,FG,9.0,,1.0,1.0,1.0,2021-07-19 01:40:00+04:00,1968163
18441,VALIDATED_EIF,188892,v_307092,FG,11.0,,1.0,1.0,1.0,2021-07-19 01:40:00+04:00,1968163
18442,VALIDATED_EIF,188892,v_307092,FG,17.0,,1.0,1.0,1.0,2021-07-19 01:40:00+04:00,1968163


### Get categories names

In [29]:
l_category_decoding = []
for sheet_name in tqdm(cat_features):
    df_temp = pd.read_excel('../data/raw/data with translations.xlsx', sheet_name=sheet_name)
    dict_temp = dict(zip(df_temp.iloc[:, 0].astype(str) + '.0', df_temp.iloc[:, 1]))
    df[f"{sheet_name}_id"] = df[sheet_name]
    df[sheet_name] = df[f"{sheet_name}_id"].map(dict_temp)

    l_category_decoding.append({sheet_name: dict_temp})
    
print(df.shape)
df.head()

100%|█████████████████████████████████████████████| 5/5 [00:06<00:00,  1.28s/it]

(19091, 16)





Unnamed: 0,compensation_status,accident_id,vehicle_id,fd_decision,circumstances,road_signs,road_surface,visibility,road_conditions,accident_date,address_id,circumstances_id,road_signs_id,road_surface_id,visibility_id,road_conditions_id
11123,VALIDATED_EIF,84752,v_496172,FG,First,,Other,Clear day,Dry,2018-01-11 08:10:00+04:00,754074,2.0,,5.0,1.0,1.0
11124,VALIDATED_EIF,84752,v_496172,FG,Direct traffic,,Other,Clear day,Dry,2018-01-11 08:10:00+04:00,754074,12.0,,5.0,1.0,1.0
13205,VALIDATED_EIF,101310,v_479807,FG,Third,,Asphalt-concrete,Clear day,Dry,2018-06-21 12:30:00+04:00,910393,4.0,,1.0,1.0,1.0
13206,VALIDATED_EIF,101310,v_479807,FG,Rearrangement to right,,Asphalt-concrete,Clear day,Dry,2018-06-21 12:30:00+04:00,910393,13.0,,1.0,1.0,1.0
13207,VALIDATED_EIF,101310,v_479807,FG,Marker lights,,Asphalt-concrete,Clear day,Dry,2018-06-21 12:30:00+04:00,910393,25.0,,1.0,1.0,1.0


In [30]:
import json
with open('../data/processed/feature_decoding.json', 'w') as f:
    json.dump(l_category_decoding, f)

### Feature Engineering

In [31]:
num_cardinality = 0
dummy_col_names = []
for feature in cat_features:
    l_unique_vals = df[feature].unique()
    n_unique_vals = len(l_unique_vals)
    l_range_nunique = [i for i in range(n_unique_vals)] # noqa

    num_cardinality += n_unique_vals
    l_ind_value_name = [f"{feature}_{i}" for i in l_unique_vals]
    dummy_col_names = dummy_col_names + l_ind_value_name


print(f"Cardinality: {num_cardinality}")

Cardinality: 75


In [32]:
# One hot encoding
df_prep = pd.get_dummies(df, columns=cat_features + ['fd_decision'])

# daytime features
df_prep['hour'] = df_prep.accident_date.dt.hour
df_prep['day_of_month'] = df_prep.accident_date.dt.day
df_prep['month'] = df_prep.accident_date.dt.month
df_prep['year'] = df_prep.accident_date.dt.year

# accident level features
df_temp = df[['accident_id', 'vehicle_id']].groupby('accident_id').agg({"vehicle_id": ["nunique"]}).reset_index()
df_temp.columns = ["".join(col).strip().replace("nunique", "") for col in df_temp.columns.values]
dict_acc_veh_count = dict(zip(df_temp.accident_id, df_temp.vehicle_id))

df_prep['n_vehicles_left_in_accident'] = df_prep.accident_id.map(dict_acc_veh_count)

cols_to_drop = [f"{col}_id" for col in cat_features] + \
               [
    'compensation_status',
    'address_id',
    'accident_date',
    'accident_id'
               ]


df_prep = df_prep.drop(cols_to_drop, axis=1).set_index('vehicle_id')
print(df_prep.shape)
df_prep.head()

(19091, 78)


Unnamed: 0_level_0,circumstances_Alarm signal,circumstances_Backward motion,circumstances_Beginning of traffic,circumstances_Breaking,circumstances_Detour,circumstances_Direct traffic,circumstances_Fifth,circumstances_First,circumstances_Fog lights,circumstances_Fourth,...,road_conditions_Other,road_conditions_Snow,road_conditions_Wet,fd_decision_FG,fd_decision_NG,hour,day_of_month,month,year,n_vehicles_left_in_accident
vehicle_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
v_496172,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,8,11,1,2018,1
v_496172,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,8,11,1,2018,1
v_479807,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,12,21,6,2018,2
v_479807,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,12,21,6,2018,2
v_479807,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,12,21,6,2018,2


In [33]:
df_prep.index.value_counts()

v_321308    22
v_687097    22
v_793117    22
v_101134    20
v_738428    20
            ..
v_229638     1
v_306943     1
v_966545     1
v_863759     1
v_564705     1
Name: vehicle_id, Length: 8968, dtype: int64

In [34]:
df_prep[df_prep.index=='v_866230']

Unnamed: 0_level_0,circumstances_Alarm signal,circumstances_Backward motion,circumstances_Beginning of traffic,circumstances_Breaking,circumstances_Detour,circumstances_Direct traffic,circumstances_Fifth,circumstances_First,circumstances_Fog lights,circumstances_Fourth,...,road_conditions_Other,road_conditions_Snow,road_conditions_Wet,fd_decision_FG,fd_decision_NG,hour,day_of_month,month,year,n_vehicles_left_in_accident
vehicle_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [35]:
df_prep.to_csv('../data/processed/data_prepared.csv')

### Checkings

In [42]:
df_fil[df_fil.vehicle_id=='1396']

Unnamed: 0,compensation_status,accident_id,vehicle_id,fd_decision,circumstances,road_signs,road_surface,visibility,road_conditions,accident_date,address_id


In [60]:
df_fil[df_fil.vehicle_id=='7']

Unnamed: 0,compensation_status,accident_id,vehicle_id,fd_decision,circumstances,road_signs,road_surface,visibility,road_conditions,accident_date,address_id
461,VALIDATED_EIF,18633,7,FG,3.0,,1,2,1,2020-02-09 21:20:00+04:00,1046
462,VALIDATED_EIF,18633,7,FG,11.0,,1,2,1,2020-02-09 21:20:00+04:00,1046
463,VALIDATED_EIF,18633,7,FG,31.0,,1,2,1,2020-02-09 21:20:00+04:00,1046
548,VALIDATED_EIF,19128,7,FG,2.0,,1,1,1,2020-02-11 12:25:00+04:00,1681
549,VALIDATED_EIF,19128,7,FG,12.0,,1,1,1,2020-02-11 12:25:00+04:00,1681
550,VALIDATED_EIF,19128,7,FG,,,1,1,1,2020-02-11 12:25:00+04:00,1681
1606,VALIDATED_EIF,27628,7,FG,3.0,,1,2,1,2020-03-01 23:30:00+04:00,218443
1607,VALIDATED_EIF,27628,7,FG,12.0,,1,2,1,2020-03-01 23:30:00+04:00,218443
1615,VALIDATED_EIF,27639,7,FG,2.0,,1,1,1,2020-03-02 15:45:00+04:00,218648
1616,VALIDATED_EIF,27639,7,FG,15.0,,1,1,1,2020-03-02 15:45:00+04:00,218648


In [57]:
df_fil[df_fil.index=='1396']

Unnamed: 0_level_0,circumstances_Alarm signal,circumstances_Backward motion,circumstances_Beginning of traffic,circumstances_Breaking,circumstances_Detour,circumstances_Direct traffic,circumstances_Fifth,circumstances_First,circumstances_Fog lights,circumstances_Fourth,...,road_conditions_Other,road_conditions_Snow,road_conditions_Wet,fd_decision_FG,fd_decision_NG,hour,day_of_month,month,year,n_vehicles_left_in_accident
vehicle_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1396,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,20,13,9,2020,2
1396,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,20,13,9,2020,2
1396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,20,13,9,2020,2
1396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,14,30,8,2020,2
1396,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,13,27,9,2020,7
1396,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,13,27,9,2020,7


In [61]:
df_fil

Unnamed: 0,compensation_status,accident_id,vehicle_id,fd_decision,circumstances,road_signs,road_surface,visibility,road_conditions,accident_date,address_id
0,VALIDATED_EIF,16141,26245,NG,,,1,2,1,2020-02-01 01:30:00+04:00,141147
1,VALIDATED_EIF,16141,26244,FG,2,,1,2,1,2020-02-01 01:30:00+04:00,141147
2,VALIDATED_EIF,16141,26244,FG,12,,1,2,1,2020-02-01 01:30:00+04:00,141147
3,VALIDATED_EIF,16141,26244,FG,35,,1,2,1,2020-02-01 01:30:00+04:00,141147
4,VALIDATED_EIF,16169,26324,NG,2,,1,1,2,2020-02-01 18:00:00+04:00,141548
...,...,...,...,...,...,...,...,...,...,...,...
18439,VALIDATED_EIF,188892,566531,NG,,,1,1,1,2021-07-19 01:40:00+04:00,1968163
18440,VALIDATED_EIF,188892,566529,FG,9,,1,1,1,2021-07-19 01:40:00+04:00,1968163
18441,VALIDATED_EIF,188892,566529,FG,11,,1,1,1,2021-07-19 01:40:00+04:00,1968163
18442,VALIDATED_EIF,188892,566529,FG,17,,1,1,1,2021-07-19 01:40:00+04:00,1968163
