In [1]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import requests
import os
import shutil
import folium
import shapely
import matplotlib
from ipywidgets import interact
import seaborn as sns
import math
import stats
import scipy
import sklearn 

# Preprocessing 

## database n.1 : ROAD RISK ~ Shape file

The first dataset we clean is a shape file containing road segments in Porto and Lisbon. 
In the next steps we prepare this dataset for the analysis that follows.

### Extracting Data

In [4]:
# link = 'https://wdl-data.fra1.digitaloceanspaces.com/pse/m_risk_prfile.zip'
# s = requests.get(link).content

In [11]:
def first_df(path):
    """Load the road-segment file at `path` as a DataFrame with readable column names.

    The file is read with `gpd.read_file`, wrapped in a plain pandas
    DataFrame, the 'Link_ID' column is dropped, and the five abbreviated
    traffic columns are renamed to descriptive names.

    Parameters
    ----------
    path : str
        Location of the geographic file to read.

    Returns
    -------
    pandas.DataFrame
        The renamed table, without the 'Link_ID' column.
    """
    # Abbreviated source column name -> descriptive name used in the notebook.
    readable_names = {
        'Daily_Aver': 'Daily_Average_Traffic_Intensity',
        'Average_Ve': 'Average_Velocity_of_Vehicle_Traffic',
        'Median_of_': 'Median_of_velocity_of_Vehicle_Traffic',
        'First_Quar': 'FirstQuartil_of_velocity_of_Vehicle_Traffic',
        'Third_Quar': 'ThirdQuartil_of_velocity_of_Vehicle_Traffic',
    }
    segments = gpd.read_file(path)
    table = pd.DataFrame(segments)
    table = table.drop(columns='Link_ID')
    return table.rename(columns=readable_names)
    
df = first_df('wdl_data/m_risk_prfile.geojson') 

- Now we have a dataset containing the same columns but renamed. 
- We also drop the 'Link_ID' column, as it contains the unique id of the street, information already present in the linkid column.

To better understand our data and to avoid errors during the analysis, we first inspect it with general summary statistics.

### Remove outliers: 

In [12]:
df.describe()

Unnamed: 0,linkid,Daily_Average_Traffic_Intensity,Average_Velocity_of_Vehicle_Traffic,Median_of_velocity_of_Vehicle_Traffic,FirstQuartil_of_velocity_of_Vehicle_Traffic,ThirdQuartil_of_velocity_of_Vehicle_Traffic,Func_Class,Speed_Cat
count,34678.0,34678.0,34678.0,34678.0,34678.0,34678.0,34678.0,34678.0
mean,895820600.0,3340.417942,56.816834,56.463409,43.822041,68.091844,2.684613,4.904781
std,235591000.0,2725.873982,51.98367,26.240876,24.442204,30.985191,0.538658,1.520568
min,80216820.0,14.435864,-401.703724,1.0,-392.5,1.0,1.0,2.0
25%,736483200.0,1903.398108,38.315321,38.25,26.0,48.0,2.0,4.0
50%,906737700.0,2644.529317,49.966126,50.0,38.875,60.333333,3.0,6.0
75%,1154997000.0,3897.886608,69.511585,71.0,56.0,85.0,3.0,6.0
max,1223731000.0,49309.806935,6357.022296,1326.25,143.0,2605.0,3.0,7.0


- We know that the velocity columns report values in km/h, so many of the minima and maxima shown by `describe` do not make sense (for example, an average velocity of about -402 and another above 6,000). 
- We need to act on these values, as they are **outliers**.

In [13]:
def rm_out(df, lb=0, ub=180, exclude=('linkid', 'Daily_Average_Traffic_Intensity', 'geometry')):
    """Return a copy of `df` with out-of-range values replaced by NaN.

    Every numeric column that is not listed in `exclude` is checked:
    values below `lb` or above `ub` become NaN, values equal to a bound
    are kept. Rows are never removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. It is not modified.
    lb, ub : number, optional
        Inclusive lower and upper bounds of the accepted range
        (defaults: 0 and 180).
    exclude : iterable of str, optional
        Column names that must be left untouched. Names that are not
        present in `df` are simply ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns as `df`.
    """
    skipped = set(exclude)
    # Work on a copy so the caller's frame keeps its original values and
    # re-running the cell always starts from the same input.
    cleaned = df.copy()
    # Only numeric columns can be compared against the bounds.
    for col in cleaned.select_dtypes(include='number').columns:
        if col in skipped:
            continue
        # `between` is inclusive on both ends, so lb and ub themselves survive.
        cleaned[col] = cleaned[col].where(cleaned[col].between(lb, ub))
    return cleaned

data = rm_out(df)

**If we run describe again, we will see that the velocity columns now contain only feasible values.**

In [15]:
data.describe()

Unnamed: 0,linkid,Daily_Average_Traffic_Intensity,Average_Velocity_of_Vehicle_Traffic,Median_of_velocity_of_Vehicle_Traffic,FirstQuartil_of_velocity_of_Vehicle_Traffic,ThirdQuartil_of_velocity_of_Vehicle_Traffic,Func_Class,Speed_Cat
count,34678.0,34678.0,34633.0,34675.0,34677.0,34674.0,34678.0,34678.0
mean,895820600.0,3340.417942,56.112805,56.402803,43.834624,67.959767,2.684613,4.904781
std,235591000.0,2725.873982,24.346245,25.054113,24.329987,26.706718,0.538658,1.520568
min,80216820.0,14.435864,1.0,1.0,0.0,1.0,1.0,2.0
25%,736483200.0,1903.398108,38.317003,38.25,26.0,48.0,2.0,4.0
50%,906737700.0,2644.529317,49.961538,50.0,38.875,60.333333,3.0,6.0
75%,1154997000.0,3897.886608,69.447459,71.0,56.0,85.0,3.0,6.0
max,1223731000.0,49309.806935,179.691892,143.25,143.0,164.0,3.0,7.0


### Feature creation:

Looking at our data, we now need to choose a target to use in our model in the next step.
Most common causes of accidents:
- Over Speeding.
- Drunken Driving.
- Distractions to Driver.
- Red Light Jumping.
- Avoiding Safety Gears like Seat belts and Helmets.
- Non-adherence to lane driving and overtaking in a wrong manner.

The first cause is always **over-speeding**, which can also be connected with any of the other causes above. 
For this reason we decide to investigate the velocity information and use it as our target.

- Speed_Cat (described in the Excel data dictionary loaded below)
- Average Velocity of Vehicle Traffic 
- Median of velocity of Vehicle Traffic

We will create a dictionary that, using the information contained in the Excel file, gives the maximum velocity allowed on each type of street. 


In [17]:
speed_explanation = pd.read_excel('wdl_dict/Dictionary_Risk_Profiles.xlsx', sheet_name='SpeedCat')
speed_explanation

Unnamed: 0,Speed Cat,Speed range in km/h
0,1,>130 km/h
1,2,101-130 km/h
2,3,91-100 km/h
3,4,71-90 km/h
4,5,51-70 km/h
5,6,31-50 km/h
6,7,11-30 km/h
7,8,<11 km/h


From the table above we can create a dictionary:
1. count the values in each category
2. translate each speed range into an actual number

In [18]:
data.Speed_Cat.value_counts()

6    20312
4     5273
2     5123
3     1843
5     1770
7      357
Name: Speed_Cat, dtype: int64

**No mapping is needed for labels 1 and 8, since they do not appear in the data.**

In [19]:
# Speed_Cat label -> upper bound (km/h) of its speed range in the 'SpeedCat' sheet.
# Labels 1 and 8 are left out because they do not occur in data.Speed_Cat.
max_speed_dict = {2:130,3:100,4:90,5:70,6:50,7:30}

In [21]:
def target_creation(data, speed_limits=None):
    """Return a copy of `data` with the maximum-speed and speed-difference columns added.

    Three columns are appended (or overwritten if they already exist):

    - 'Max_speed': 'Speed_Cat' mapped through `speed_limits`
      (NaN for categories missing from the mapping).
    - 'Speed_Diff_Mean': 'Max_speed' minus 'Average_Velocity_of_Vehicle_Traffic'.
    - 'Speed_Diff_Median': 'Max_speed' minus 'Median_of_velocity_of_Vehicle_Traffic'.

    A negative difference means the observed velocity is above the
    category's maximum speed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Speed_Cat' and the two velocity columns named above.
        It is not modified.
    speed_limits : mapping, optional
        Speed category -> maximum speed. Defaults to the notebook-level
        `max_speed_dict`.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three extra columns.
    """
    if speed_limits is None:
        # Fall back to the mapping defined in the notebook.
        speed_limits = max_speed_dict
    max_speed = data['Speed_Cat'].map(speed_limits)
    # `assign` builds a new frame, so re-running the cell never alters its input.
    return data.assign(
        Max_speed=max_speed,
        Speed_Diff_Mean=max_speed - data['Average_Velocity_of_Vehicle_Traffic'],
        Speed_Diff_Median=max_speed - data['Median_of_velocity_of_Vehicle_Traffic'],
    )
data = target_creation(data)

In [23]:
data[['Speed_Cat','Max_speed', 'Speed_Diff_Mean','Speed_Diff_Median']].head(10)

Unnamed: 0,Speed_Cat,Max_speed,Speed_Diff_Mean,Speed_Diff_Median
0,6,50,4.791284,5.535714
1,4,90,10.436692,7.25
2,6,50,-15.955069,-17.333333
3,6,50,2.466089,6.0
4,6,50,-1.375291,-3.25
5,6,50,17.667821,18.333333
6,6,50,19.613591,22.9375
7,6,50,26.714286,39.0
8,6,50,6.476409,4.0
9,6,50,-11.897045,1.346154


**Our first target will be `Speed_Diff_Mean`: the max speed minus the mean velocity (a negative value means the mean velocity is above the max speed).**

### Handling duplicates:

In [24]:
# True when dropping fully duplicated rows leaves the row count unchanged, i.e. there are none.
len(data) == len(data.drop_duplicates())

True

There are ***no duplicates*** in our dataset

### Handling missing values: