## Data Preparation stage

##### Join raw accident data with fault DB, update parameters

In [1]:
import json

import pandas as pd
from tqdm import tqdm

from src.data.data_prep import convert_cols_type

In [2]:
# Load the filtered accident data (interim CSV) and report its size.
df_fil = pd.read_csv('../data/interim/data_bivac_filtered.csv')
print(f"Data shape: {df_fil.shape}")
# Distinct accident ids, since one accident spans several rows.
print(f"N accidents after filtering: {df_fil['accident_id'].nunique()}")

Data shape: (18444, 11)
N accidents after filtering: 4217


In [3]:
# Load the Fault DB workbook (raw Excel) and report its size.
df_fau = pd.read_excel('../data/raw/fault with param.xlsx')
print(f"Data shape: {df_fau.shape}")
# Distinct accident ids present in the Fault DB.
print(f"N accidents after in Fault DB: {df_fau['accident_id'].nunique()}")

Data shape: (6252, 9)
N accidents after in Fault DB: 1173


In [5]:
# Identifier and category-code columns of the filtered data that are cast to 'str'.
cat_features_fil = [
    'accident_id', 'vehicle_id',
    'circumstances', 'road_signs', 'road_surface', 'visibility', 'road_conditions',
    'address_id',
]

# Parse the accident timestamp, then cast the listed columns with the project helper.
df_fil['accident_date'] = pd.to_datetime(df_fil['accident_date'])
df_fil = convert_cols_type(df=df_fil, features_l=cat_features_fil, type_='str')

In [8]:
# Identifier and category-code columns of the Fault DB that are cast to 'str'.
cat_features_fau = [
    'accident_id', 'accident_vehicle_id',
    'circumstances', 'road_signs', 'road_surface', 'visibility', 'road_conditions',
]

# Same project helper as used for the filtered data.
df_fau = convert_cols_type(df=df_fau, features_l=cat_features_fau, type_='str')

In [9]:
# Sanity check: every accident in the Fault DB must also exist in the filtered data.
# The previous `array in array` form is evaluated by NumPy as an elementwise
# equality followed by any(), and its left-hand side was already restricted to
# Fault DB ids, so it could not detect a missing accident. isin(...).all() is a
# real containment test: True only if each Fault DB accident_id occurs in df_fil.
fault_ids_covered = df_fau.accident_id.isin(df_fil.accident_id).all()

print(f"{df_fau.accident_id.nunique()} accidents in Fault DB")
print(f"All accidents from Fault DB are in our filtered data: {fault_ids_covered}")

1060 accidents in Fault DB
All accidents from Fault DB are in our filtered data: True


In [10]:
# Row count per value of the 'type' column in the Fault DB.
df_fau['type'].value_counts()

FP    5246
FS     868
FA     138
Name: type, dtype: int64

In [11]:
# List the Fault DB column names.
df_fau.columns

Index(['accident_vehicle_id', 'accident_id', 'type', 'case', 'circumstances',
       'road_signs', 'road_surface', 'visibility', 'road_conditions'],
      dtype='object')

In [12]:
# Pivot the Fault DB: one row per vehicle/accident/parameter combination,
# one 'case' column per fault type, keeping the first case seen in each group.
df_fau.pivot_table(
    values=['case'],
    index=[
        'accident_vehicle_id', 'accident_id', 'circumstances',
        'road_signs', 'road_surface', 'visibility', 'road_conditions',
    ],
    columns='type',
    aggfunc='first',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,case,case,case
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,type,FA,FP,FS
accident_vehicle_id,accident_id,circumstances,road_signs,road_surface,visibility,road_conditions,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
,51632,12,,1,1,1,,NG,
,51632,3,,1,1,1,,NG,
,51642,12,,1,1,1,,FG,
,51642,4,,1,1,1,,FG,
,51651,12,,,,,,NG,
...,...,...,...,...,...,...,...,...,...
99933,47288,35,163,1,1,1,,NG,
99933,47288,35,31,1,1,1,,NG,
99933,47288,4,13,1,1,1,,NG,
99933,47288,4,163,1,1,1,,NG,


In [13]:
# Fault DB rows for a single example accident (ids are strings after the cast above).
df_fau.loc[df_fau['accident_id'] == '47319']

Unnamed: 0,accident_vehicle_id,accident_id,type,case,circumstances,road_signs,road_surface,visibility,road_conditions
3787,3,47319,FP,NG,3,,,,
3788,3,47319,FP,NG,12,,,,
3789,4,47319,FP,FG,2,,,,
3790,4,47319,FP,FG,11,,,,


In [14]:
# The same example accident in the filtered data, for side-by-side comparison.
df_fil.loc[df_fil['accident_id'] == '47319']

Unnamed: 0,compensation_status,accident_id,vehicle_id,fd_decision,circumstances,road_signs,road_surface,visibility,road_conditions,accident_date,address_id
4713,VALIDATED_EIF,47319,74265,NG,3.0,,1,1,1,2020-05-31 12:11:00+04:00,6146
4714,VALIDATED_EIF,47319,74265,NG,12.0,,1,1,1,2020-05-31 12:11:00+04:00,6146
4715,VALIDATED_EIF,47319,74265,NG,25.0,,1,1,1,2020-05-31 12:11:00+04:00,6146
4716,VALIDATED_EIF,47319,74265,NG,,,1,1,1,2020-05-31 12:11:00+04:00,6146
4717,VALIDATED_EIF,47319,74263,FG,4.0,,1,1,1,2020-05-31 12:11:00+04:00,6146
4718,VALIDATED_EIF,47319,74263,FG,12.0,,1,1,1,2020-05-31 12:11:00+04:00,6146
4719,VALIDATED_EIF,47319,74263,FG,19.0,,1,1,1,2020-05-31 12:11:00+04:00,6146
4720,VALIDATED_EIF,47319,74263,FG,35.0,,1,1,1,2020-05-31 12:11:00+04:00,6146


In [15]:
# Display the Fault DB frame for inspection.
df_fau

Unnamed: 0,accident_vehicle_id,accident_id,type,case,circumstances,road_signs,road_surface,visibility,road_conditions
0,34467,16141,FP,FG,2,,,,
1,34467,16141,FP,FG,12,,,,
2,34468,16141,FP,FG,2,,,,
3,34468,16141,FP,FG,12,,,,
4,34521,16169,FP,FG,2,,1,1,1
...,...,...,...,...,...,...,...,...,...
6247,1281,717,FP,FG,3,269,1,1,1
6248,1281,717,FP,FG,12,269,1,1,1
6249,1281,717,FP,FG,13,269,1,1,1
6250,128191,717,FP,NG,2,269,1,1,1


In [16]:
# Work on a copy so that df_fil itself is not modified by the steps below.
df = df_fil.copy()
# TODO: in the future `df` should hold the features updated from the Fault DB
# together with the FINAL decision on fd_status.

### Get category names

In [17]:
# Categorical feature columns; each name is also used as a sheet name
# when the translation workbook is read.
cat_features = ['circumstances', 'road_signs', 'road_surface', 'visibility', 'road_conditions']

In [18]:
# Decode the category codes of every feature in cat_features into readable names.
# Passing a list of sheet names makes pd.read_excel return {sheet_name: DataFrame},
# so the workbook is opened once instead of once per feature.
translation_sheets = pd.read_excel(
    '../data/raw/data with translations.xlsx',
    sheet_name=cat_features,
)

l_category_decoding = []
for sheet_name in tqdm(cat_features):
    df_temp = translation_sheets[sheet_name]
    # First sheet column (cast to str) is the lookup key, second column the value.
    dict_temp = dict(zip(df_temp.iloc[:, 0].astype(str), df_temp.iloc[:, 1]))

    id_col = f"{sheet_name}_id"
    # Keep the raw code only the first time: re-running the cell must not
    # overwrite the codes with already-decoded names (which would then map to NaN).
    if id_col not in df.columns:
        df[id_col] = df[sheet_name]
    df[sheet_name] = df[id_col].map(dict_temp)

    # Collected as a list of {feature: {code: name}} entries.
    l_category_decoding.append({sheet_name: dict_temp})

print(df.shape)
df.head()

100%|██████████| 5/5 [00:09<00:00,  1.99s/it]

(18444, 16)





Unnamed: 0,compensation_status,accident_id,vehicle_id,fd_decision,circumstances,road_signs,road_surface,visibility,road_conditions,accident_date,address_id,circumstances_id,road_signs_id,road_surface_id,visibility_id,road_conditions_id
0,VALIDATED_EIF,16141,26245,NG,,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,,,1,2,1
1,VALIDATED_EIF,16141,26244,FG,First,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,2.0,,1,2,1
2,VALIDATED_EIF,16141,26244,FG,Direct traffic,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,12.0,,1,2,1
3,VALIDATED_EIF,16141,26244,FG,Green,,Asphalt-concrete,Clear night,Dry,2020-02-01 01:30:00+04:00,141147,35.0,,1,2,1
4,VALIDATED_EIF,16169,26324,NG,First,,Asphalt-concrete,Clear day,Wet,2020-02-01 18:00:00+04:00,141548,2.0,,1,1,2


In [19]:
# Persist the list of {feature: {code: name}} mappings built by the decoding loop.
# `json` is imported in the notebook's import cell rather than mid-notebook.
with open('../data/processed/feature_decoding.json', 'w') as f:
    json.dump(l_category_decoding, f)

### Feature Engineering

In [20]:
# Count the distinct values of each categorical feature and collect the
# matching "<feature>_<value>" names.
num_cardinality = 0
dummy_col_names = []
for feature in cat_features:
    l_unique_vals = df[feature].unique()
    # NOTE(review): unique() keeps NaN as a value, whereas pd.get_dummies with its
    # default dummy_na=False creates no column for it — confirm which count is intended.
    num_cardinality += len(l_unique_vals)
    dummy_col_names.extend(f"{feature}_{value}" for value in l_unique_vals)

print(f"Cardinality: {num_cardinality}")

Cardinality: 63


In [36]:
# One-hot encode the categorical features and the fault decision.
# dtype is pinned because the get_dummies default changed from uint8 to bool in
# pandas 2.0; pinning keeps 0/1 indicator columns whatever version is installed.
df_prep = pd.get_dummies(df, columns=cat_features + ['fd_decision'], dtype='uint8')

# Date/time features taken from the accident timestamp.
df_prep['hour'] = df_prep['accident_date'].dt.hour
df_prep['day_of_month'] = df_prep['accident_date'].dt.day
df_prep['month'] = df_prep['accident_date'].dt.month
df_prep['year'] = df_prep['accident_date'].dt.year

# Accident-level feature: number of distinct vehicle ids per accident,
# looked up for every row through its accident_id.
dict_acc_veh_count = df.groupby('accident_id')['vehicle_id'].nunique().to_dict()
df_prep['n_vehicles_left_in_accident'] = df_prep['accident_id'].map(dict_acc_veh_count)

# Columns removed from the prepared frame: the raw "<feature>_id" code columns
# plus the identifier/metadata columns listed below.
cols_to_drop = [f"{col}_id" for col in cat_features] + [
    'compensation_status',
    'address_id',
    'accident_date',
    'accident_id',
]

# vehicle_id becomes the index; it is not unique because a vehicle spans several rows.
df_prep = df_prep.drop(columns=cols_to_drop).set_index('vehicle_id')
print(df_prep.shape)
df_prep.head()

(18444, 68)


Unnamed: 0_level_0,circumstances_Alarm signal,circumstances_Backward motion,circumstances_Beginning of traffic,circumstances_Breaking,circumstances_Detour,circumstances_Direct traffic,circumstances_Fifth,circumstances_First,circumstances_Fog lights,circumstances_Fourth,...,road_conditions_Other,road_conditions_Snow,road_conditions_Wet,fd_decision_FG,fd_decision_NG,hour,day_of_month,month,year,n_vehicles_left_in_accident
vehicle_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,2,2020,2
26244,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,1,2,2020,2
26244,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,1,1,2,2020,2
26244,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,2,2020,2
26324,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,1,18,1,2,2020,2


In [38]:
# Write the prepared frame to disk; the vehicle_id index is written as well
# (to_csv writes the index by default).
df_prep.to_csv('../data/processed/data_prepared.csv')