# Notebook for linking a searched path to a risk

We would like to find the risk of going from a point A to a point B in Geneva. This notebook will implement this idea!

In [117]:
# import libraries
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from pandas.io.json import json_normalize
import copy
import csv

# for string processing
import unidecode 
import collections
import string

# Merging names. Installation: pip install fuzzywuzzy and pip install fuzzywuzzy[speedup]
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from IPython.core import display as ICD

# for string processing
import re

%matplotlib inline 

## 1. Setting things up:

In [119]:
# GO and fetch the google API keys from a local file
CENTRAL_FOLDER = '../'
API_keys = pd.read_csv(CENTRAL_FOLDER + 'API_google_key',index_col=0,header=None, sep='\=', engine='python')

#############   Google API KEYS ##################
nearest_roads_API_KEY = API_keys.iloc[0].values[0] # accepts up to 100 points
placesID_API_KEY = API_keys.iloc[1].values[0]
API_KEY = API_keys.iloc[2].values[0] # google directions

# load risk df:
FINAL_FOLDER = '../final_csv/'
risk_df = pd.read_csv(FINAL_FOLDER + 'risk.csv',index_col=0, sep='\t')
risk_df.head(3)

Unnamed: 0,Risk_NB_ACC,Risk_NB_PIETONS,Risk_NB_VOITURES_TOURISME,Risk_MOTOS,Risk_BIKES,Risk_INJURY,Risk_GRAVE_INJURY,Risk_PUBLIC_TRANSP,Risk_DRIVEN_BY_EMPLOYEES
avenue de france,1.677023,1.659233,1.775024,1.713112,1.516683,1.775024,1.155005,1.910585,3.473321
avenue de la paix,2.34291,1.0,2.806056,1.0,2.986662,2.103701,1.397332,3.334152,2.585001
avenue de thonex,2.116569,2.060669,1.906888,3.294716,1.415657,1.692761,1.249394,2.46508,3.984578


### 1.1 Functions:

In [138]:
# function to make requests
def request_get_json(url):
    """make a request to the specified url"""
    r = requests.get(url)
    if r.status_code != 200:
        print (r.status_code)
    page_body = r.text
    return json.loads(page_body)

In [141]:
# function that define a table that we later used for string processing
def define_table():
    """Function that return a table for string processing"""
    table = collections.defaultdict(lambda: None)
    table.update({
    ord('é'):'e',
    ord('ô'):'o',
    ord(' '):' ',
    ord('-'):' ',
    ord('_'):' ',
    ord('\N{NO-BREAK SPACE}'): ' ',
    ord('\N{EN SPACE}'): ' ',
    ord('\N{EM SPACE}'): ' ',
    ord('\N{THREE-PER-EM SPACE}'): ' ',
    ord('\N{FOUR-PER-EM SPACE}'): ' ',
    ord('\N{SIX-PER-EM SPACE}'): ' ',
    ord('\N{FIGURE SPACE}'): ' ',
    ord('\N{PUNCTUATION SPACE}'): ' ',
    ord('\N{THIN SPACE}'): ' ',
    ord('\N{HAIR SPACE}'): ' ',
    ord('\N{ZERO WIDTH SPACE}'): ' ',
    ord('\N{NARROW NO-BREAK SPACE}'): ' ',
    ord('\N{MEDIUM MATHEMATICAL SPACE}'): ' ',
    ord('\N{IDEOGRAPHIC SPACE}'): ' ',
    ord('\N{IDEOGRAPHIC HALF FILL SPACE}'): ' ',
    ord('\N{ZERO WIDTH NO-BREAK SPACE}'): ' ',
    ord('\N{TAG SPACE}'): ' ',
    })
    table.update(dict(zip(map(ord,string.ascii_uppercase), string.ascii_lowercase)))
    table.update(dict(zip(map(ord,string.ascii_lowercase), string.ascii_lowercase)))
    table.update(dict(zip(map(ord,string.digits), string.digits)))
    return table

# function to remove accents
def remove_accents(s):
    """remove accents of a string s"""
    unaccented_string = unidecode.unidecode(s)
    return unaccented_string

# function that reformat string flattening it: remove accents, lower case and remove undesired characters
def reformat_string(s):
    """reformat a string s stripping the strange characters"""
    s = str(s)
    s = remove_accents(s)
    s = s.lower()
    s = s.strip(',.-_')
    table = define_table()
    s = s.translate(table,)
    return s

# this function is NOT USED!!
def find_road_match(road_name,prob_limit,df):
    """Find a match between the searched road_name and the ones in the dataframe df"""
    road_name_found, prob=process.extractOne(road_name, name_df, scorer=fuzz.token_sort_ratio)
    if prob>=prob_limit: 
        return road_name_found
    else:
        print('road not found')
        return '?'
    
# In the following function the name matching is again (as for the risk) exact once we 
# have "flattened" the strings since we have seen that very similar names of road can 
# be two different roads (like replacing "chemin" with "route" gives already mismatches)
def get_road_risk(road_name, df,risk_type_col='Risk_NB_VOITURES_TOURISME'):
    """Given a road name, search fo its equivalent in the dataframe df
    and return it with the risk associated to the specified risk column."""
    road_name = reformat_string(road_name)
    if road_name in df.index:
        risk = df.loc[[road_name]][risk_type_col][0]
    else:
        risk = 0
    return [road_name,risk]

# see below for example of use
def find_streets_in_indications(s):
    """Extract the name of the roads from a tipical indication string"""
    opening = [m.start() for m in re.finditer('<b>', s)]
    closing = [m.start() for m in re.finditer('</b>', s)]
    streets = []
    for i in range (len(opening)):
        indication = s[opening[i]+3 : closing[i]]
        if indication[0].isupper(): # The name of the roads begin with upper case letter, not directions
            streets.append(indication)
    return streets

Example of use:

In [150]:
origin = 'Chemin+terroux' 
destination= 'Quai+du+Mont-Blanc+1201+Genève'
get_multiples_routes = 'True'

url = 'https://maps.googleapis.com/maps/api/directions/json?'+\
        'origin=' + origin  +\
        '&destination='+ destination  +\
        '&alternatives=' + get_multiples_routes +\
        '&key='+ API_KEY
        
my_json = request_get_json(url)
tmp2 = []
for i in range(len(my_json['routes'][0]['legs'][0]['steps'])):
    tmp = my_json['routes'][0]['legs'][0]['steps'][i]['html_instructions']
    tmp2.append(tmp)
    print (tmp)

print('\n ',len(my_json['routes']))
print('\nNow with the function:')
find_streets_in_indications(tmp2[0])

Head <b>northeast</b> on <b>Chemin Terroux</b> toward <b>Chemin Bonvent</b>
Continue onto <b>Chemin Edouard-Sarasin</b>
Turn <b>right</b> onto <b>Route de Ferney</b>
Continue onto <b>Avenue de France</b>
Keep <b>left</b> to stay on <b>Avenue de France</b>
Continue onto <b>Quai Wilson</b>
Continue onto <b>Quai Wald-Wilson</b>
Continue onto <b>Quai du Mont-Blanc</b>

  1

Now with the function:


['Chemin Terroux', 'Chemin Bonvent']

### 1.2 Wrapping up:

In [121]:
def find_routes_from_origin_to_destination(origin,destination):
    """Given the origin and destination, the function returns all the routes 
    that Google API tells us to take in order to reach destination"""
    # url of the request
    url = 'https://maps.googleapis.com/maps/api/directions/json?'+\
        'origin=' + origin  +\
        '&destination='+ destination  +\
        '&alternatives=' + get_multiples_routes +\
        '&key='+ API_KEY
    my_json = request_get_json(url)
    num_routes = len(my_json['routes'])
    print('Number of routes found: ', num_routes)
    if num_routes <1:
        return 0
    routes = []
    for k in range(0,num_routes):
        indications = []
        for i in range(len(my_json['routes'][k]['legs'][0]['steps'])):
            indication = my_json['routes'][k]['legs'][0]['steps'][i]['html_instructions']
            indication = find_streets_in_indications(indication)
            if len(indication)>1: # if multiple road names in indication, extract them one by one
                for x in indication:
                    indications.append(x)
            else:
                indications.append(indication[0])
        indications = list(set(indications)) # to remove duplications
        routes.append(indications)
    return routes

Example of use:

In [124]:
origin = 'Chemin+terroux' 
destination= 'Quai+du+Mont-Blanc+1201+Genève'

my_routes = find_routes_from_origin_to_destination(origin,destination)
print('Google API found ',len(my_routes),' different routes possibles')
print('Here is the look of the first:\n',my_routes[0])

Number of routes found:  3
Google API found  3  different routes possibles
Here is the look of the first:
 ['Chemin Edouard-Sarasin', 'Chemin Terroux', 'Quai Wald-Wilson', 'Quai du Mont-Blanc', 'Avenue de France', 'Quai Wilson', 'Chemin Bonvent', 'Route de Ferney']


Where we have supposed that google will indicate all the roads taken. From some trials it look like the case exect for "exessively" small roads and thus all the roads that a use would have to take are listed!

## 2. THE function:

In [147]:
def get_path_risk(origin,destination,risk_df,risk_col='Risk_NB_VOITURES_TOURISME'):
    """
    Compute the risk of an itinerary
    Inputs:
    :origin: Starting point of our itinerary as is should be given to google maps API
    One could find it by looking at the url after having made a request to google maps
    or one could just write key words separated by '+' and it should find the correct place
    :destination: Destination point of our itinerary as it should be given to google maps API
    :risk_df: The dataframe containing the risk per road
    :risk_col: Column with respect to which we wanna compute the risk. By default it is for the 
    'Voitures de tourisme' since it is the one that makes more sense.
    Output: Dataframe containing the risk for the itinerary and several other informations
    """
    d = {'Risk': [], 'mean risk': [],'zeros':[], 'all routes':[],'all risks':[],\
        'number of roads':[]}
    out = pd.DataFrame(data=d)
    list_of_trajectories = find_routes_from_origin_to_destination(origin=origin,destination=destination)
    for i,traj in enumerate(list_of_trajectories):
        routes = []
        risks = []
        count = 0
        for x in traj:
            route_risk = get_road_risk(road_name=x, df=risk_df,risk_type_col=risk_col)
            routes.append(route_risk[0])
            risks.append(route_risk[1])
            if route_risk[1]==0:
                count +=1
        filtered_risks = list(filter(lambda a: a != 0, risks))
        out = out.append({'Risk': sum(risks)+count, 'mean risk': np.mean(filtered_risks),'zeros':count,\
                          'all routes':routes,'all risks':risks,'number of roads':len(routes)}, ignore_index=True)
    cols = ['Risk','mean risk','number of roads','zeros','all routes','all risks']
    out = out[cols]
    return out

Example of use:

In [148]:
origin = 'Vermont,+1202+Geneva'
destination= 'Quai+du+Mont-Blanc+1201+Genève'
get_path_risk(origin,destination,risk_df=risk_df,risk_col='Risk_NB_VOITURES_TOURISME')#['all routes'][0]

Number of routes found:  1


Unnamed: 0,Risk,mean risk,number of roads,zeros,all routes,all risks
0,10.948168,1.982723,8.0,5.0,"[rue du vidollet, rue du valais, quai wald wil...","[0, 0, 0, 2.5023691597, 1.77502396276, 1.67077..."


**Notes about this function: **
It uses the risk computed previously which, as discussed, is already very uncertain!
Morevoer, as it can be seen in the table, only a small fraction of the roads has an equivalent in the risk table (they should not have a traffic detector in thoses roads) and here we did not take into account that mislead.
We should note how we have dealt with missing values:
* We counted them as 0 for the total risk
* We extract them for the computation of the mean
  

How we could go around this problem:
* taking data over more years (we recall that we took only data from 2015 for the limitations of requests imposed by Google maps)
* computing a risk per road without the traffic (it will thus not be scaled) but it will be available for more roads. We could then interpolate to "predict" the normalized risk for the roads where we did not have the traffic data.

**Other problem:** At some hours of the day (e.g. late at night), the Google API doesn't give multiple routes (or at least not to us)