# Run:
Only notebook to run ? 

In [1]:
%%configure
{"pyFiles": ["/user/gottraux/dijkstra_algorithms.py"],
 "conf": {
    "spark.app.name": "dslab-group_final"
}}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
8983,application_1589299642358_3520,pyspark,idle,Link,Link,
8996,application_1589299642358_3534,pyspark,idle,Link,Link,
9028,application_1589299642358_3573,pyspark,idle,Link,Link,
9035,application_1589299642358_3582,pyspark,idle,Link,Link,
9054,application_1589299642358_3599,pyspark,idle,Link,Link,
9055,application_1589299642358_3601,pyspark,idle,Link,Link,
9059,application_1589299642358_3605,pyspark,idle,Link,Link,
9065,application_1589299642358_3611,pyspark,idle,Link,Link,
9070,application_1589299642358_3618,pyspark,idle,Link,Link,
9071,application_1589299642358_3620,pyspark,idle,Link,Link,


### Imports and helper functions:

In [2]:
import pickle
import json
import time
import networkx as nx
import pandas as pd
from pyspark.sql.functions import col

"""
To load (or reload) into hdfs:
hdfs dfs -rm /user/${JUPYTERHUB_USER}/dijkstra_algorithms.py 2>/dev/null
hdfs dfs -copyFromLocal notebooks/dijkstra_algorithms.py /user/${JUPYTERHUB_USER}/
"""
from dijkstra_algorithms import *

MAX_TRIP_DURATION = 2 #duration in hour 

days_dict = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday'}
def day_trips(*day_ids):
    """
    day_trips: gives the trip_ids whose service operates on all of the given days
    input: a variable number of day ids (keys of `days_dict`, 0 = monday ... 4 = friday)
    output: spark dataframe with a single `trip_id` column
    """
    # The day names are used as column names of `calendar`; and-ing them builds a
    # filter expression that only keeps the services running on every requested day.
    filter_expr = " and ".join([days_dict[day_id] for day_id in day_ids])
    service_ids = calendar.where(filter_expr).select('service_id')

    trips_on_days = service_ids.join(trips, on='service_id')
    return trips_on_days.select('trip_id')

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
9099,application_1589299642358_3661,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
%%local
import os
import pandas as pd
username = 'gottraux'

In [4]:
%%send_to_spark -i username -t str -n username

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'username' as 'username' to Spark kernel

## Load graph data

In [5]:
def convertToMinute(s):
    """Convert an 'HH:MM' time string into the number of minutes since midnight."""
    hours, minutes = [int(part) for part in s.split(':')]
    return 60 * hours + minutes

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Timetable tables, read from the shared ORC files under /data/sbb/timetables.
# `trips` and `calendar` are used by `day_trips` to select the trips of a given week day.
trips = spark.read.format('orc').load('/data/sbb/timetables/orc/trips/000000_0')
calendar = spark.read.format('orc').load('/data/sbb/timetables/orc/calendar/000000_0')
stop_times = spark.read.format('orc').load('/data/sbb/timetables/orc/stop_times/000000_0')
stops = spark.read.format('orc').load('/data/sbb/timetables/orc/stops/000000_0')

# Graph nodes and edges stored in the HDFS home directory of `username`
# (`username` is sent to the Spark kernel by an earlier cell).
nodes_df = spark.read.orc("/user/{}/nodes.orc".format(username))
edges_df = spark.read.orc("/user/{}/edges_with_mean_and_std_sec.orc".format(username))

# Disabled: loading of the pre-computed duration dictionaries.
#durations_dicts = json.loads(sc.textFile('/user/{}/durations_for_confidence_.json'.format(username)).collect()[0])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
edges_df.show(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-------+----------+------------+--------------+---------+-------------+----+----+
|             trip_id|stop_id|train_type|arrival_time|departure_time|next_stop|trip_duration|mean| std|
+--------------------+-------+----------+------------+--------------+---------+-------------+----+----+
|30.TA.30-24-Y-j19...|8503111|       Bus|         496|           496|  8503102|          4.0|null|null|
|34.TA.30-24-Y-j19...|8503111|       Bus|         539|           539|  8503102|          4.0|null|null|
|31.TA.30-31-Y-j19...|8503111|       Bus|         532|           532|  8503103|          5.0|null|null|
+--------------------+-------+----------+------------+--------------+---------+-------------+----+----+
only showing top 3 rows

In [8]:
# Swap the `stop_id` and `next_stop` column names (through the temporary name `temp`),
# so that every edge points in the opposite direction (presumably because the search
# is run backwards from the arrival stop, see `dijkstra_reversed`).
# NOTE(review): this overwrites `edges_df`; running the cell twice swaps the columns back.
edges_df = edges_df.withColumnRenamed('stop_id', 'temp').withColumnRenamed('next_stop', 'stop_id').withColumnRenamed('temp', 'next_stop')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
edges_df.show(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------+----------+------------+--------------+-------+-------------+----+----+
|             trip_id|next_stop|train_type|arrival_time|departure_time|stop_id|trip_duration|mean| std|
+--------------------+---------+----------+------------+--------------+-------+-------------+----+----+
|30.TA.30-24-Y-j19...|  8503111|       Bus|         496|           496|8503102|          4.0|null|null|
|34.TA.30-24-Y-j19...|  8503111|       Bus|         539|           539|8503102|          4.0|null|null|
|31.TA.30-31-Y-j19...|  8503111|       Bus|         532|           532|8503103|          5.0|null|null|
+--------------------+---------+----------+------------+--------------+-------+-------------+----+----+
only showing top 3 rows

In [10]:
# Collect the nodes to the driver as (first column, attribute dict) tuples; this list is
# later passed to `graph.add_nodes_from`.
# Presumably the first column of `nodes_df` is the stop id - TODO confirm.
nodes = nodes_df.rdd.map(lambda r: (r[0], {'name': r['stop_name'],
                                              'lat': r['stop_lat'],
                                              'lon': r['stop_lon']})).collect()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
%%local
walking_times = pd.read_pickle('walking_edges.pickle')

In [12]:
%send_to_spark -i walking_times -t df -m 20000

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'walking_times' as 'walking_times' to Spark kernel

In [13]:
# Reverse the walking edges by swapping the `source` and `target` column names.
# NOTE(review): this overwrites `walking_times`; running the cell twice swaps the columns back.
walking_times = (walking_times.withColumnRenamed('source', 'temp').withColumnRenamed('target', 'source').withColumnRenamed('temp', 'target'))

# Turn the walking edges into (source, target, attributes) entries that can be appended to
# the transport edges. Walking edges have no timetable: their `time` is the marker value -1,
# which `dijkstra_reversed` uses to recognise them.
edges_walking = walking_times.toPandas()
edges_walking['attrs'] = edges_walking.apply(lambda x: {'time': -1, 'duration': x['walk_duration']}, axis=1)
edges_walking = list(edges_walking[['source', 'target', 'attrs']].to_numpy())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
walking_times.show(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------+---------------+-------------+
|         target|         source|walk_duration|
+---------------+---------------+-------------+
|8503000:0:41/42|8503000:0:43/44|          7.0|
|8503000:0:41/42|   8503000:0:14|          7.0|
|8503000:0:41/42|   8503000:0:16|          7.0|
+---------------+---------------+-------------+
only showing top 3 rows

**TODO:** remove the dicts from the Dijkstra-with-time function for the moment, and make it also return the mean and std of each edge, because we need them for the validation.

## Choose time of arrival

In [15]:
day_id, arrival_hour, arrival_minute = 4, 12, 30

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Create graph

In [16]:
def create_edges_for_trip(edges_df, day_id, arrival_time):
    """
    create_edges_for_trip: constructs the edges (and thus trips) that exist in a window of
    MAX_TRIP_DURATION hours before a given arrival time
    @input:
    - edges_df: df from which we construct the edges
    - day_id: id of week-day (e.g. wednesday is day id 2, see `days_dict`)
    - arrival_time: time at which we want to arrive, in minutes since midnight (e.g. 11:30 -> 690)
    @output: list of (stop_id, next_stop, attributes) tuples for the selected transport edges,
             followed by the walking edges of `edges_walking`
    """
    def to_edge(row):
        # One graph edge per timetable row, carrying everything the search needs.
        attributes = {'duration': row['trip_duration'],
                      'time': float(row['departure_time']),
                      'trip_id': row['trip_id'],
                      'mean': row['mean'],
                      'std': row['std'],
                      'transport': row['train_type']}
        return (row['stop_id'], row['next_stop'], attributes)

    # Only the trips that run on that day ...
    trips_of_day = edges_df.join(day_trips(day_id), on='trip_id')

    # ... and, among them, only the edges inside the time window.
    earliest_departure = arrival_time - 60*MAX_TRIP_DURATION
    in_window = (col('departure_time') > earliest_departure) & (col('arrival_time') <= arrival_time)

    transport_edges = trips_of_day.filter(in_window).rdd.map(to_edge).collect()
    return transport_edges + edges_walking

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
edges = create_edges_for_trip(edges_df, day_id, arrival_hour*60+arrival_minute)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
# Build the directed multigraph: several edges (one per departure) can link the same two stops.
graph = nx.MultiDiGraph()
graph.add_nodes_from(nodes)
graph.add_edges_from(edges)

old_number_of_nodes = graph.number_of_nodes()
# Remove unreachable nodes: run `normal_dijkstra` from node '8503000' and drop every node
# that did not receive a distance.
dists, paths = normal_dijkstra(graph, '8503000')
not_reachable = set(graph.nodes) - set(dists.keys())
_ = graph.remove_nodes_from(list(not_reachable))
print('{} nodes removed'.format(old_number_of_nodes - graph.number_of_nodes()))

# Temporary workaround for an encoding problem with the stop names: decompose accented
# characters (NFKD) and drop everything that is not ASCII.
# NOTE(review): on Python 3 `.encode(...)` turns the names into `bytes` - confirm which
# Python version the Spark kernel runs.
import unicodedata
nodes_data = graph.nodes(data=True)
for n in graph.nodes:
    nodes_data[n]['name'] = unicodedata.normalize('NFKD', nodes_data[n]['name']).encode('ascii','ignore')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

18 nodes removed

## dijkstra:

In [19]:
def dijkstra_reversed(G, first_source, arrival_time, last_target, confidence=None, 
                       confidence_step=0.01, durations_dicts=None):
    """
    Time-dependent Dijkstra search run backwards from the arrival stop.

    The graph is expected to hold reversed edges, so the search starts at `last_target`
    at `arrival_time` and walks back towards `first_source`. All times are handled
    negated (distance = -time), which makes the smallest distance the latest departure.

    @input:
    - G: directed multigraph; each edge carries 'time' (departure time in minutes, -1 for
      a walking edge) and 'duration', and transport edges also carry 'trip_id', 'mean',
      'std' and 'transport'
    - first_source: id of the stop the journey starts from
    - arrival_time: wanted arrival time, in minutes since midnight
    - last_target: id of the stop the journey ends at
    - confidence: if not None, the path found is checked with `validate_path_`; when the
      check fails, the confidence is raised by `confidence_step` and the search restarts
    - confidence_step: increment applied to `confidence` after a failed validation
    - durations_dicts: currently unused
    @output: pandas data frame with one row per edge of the journey, in travel order. It is
      empty when no path exists or when no path could be validated before the confidence
      exceeded 1.
    @raises: ValueError if the graph is not directed, if an edge has no duration, or if a
      shorter distance is found for an already finalised node.
    """
    if not G.is_directed():
        raise ValueError('Input graph is not directed while it should be.')

    # Work on a copy: the search annotates the edge dictionaries it uses.
    G = G.copy()
    departure_time = -arrival_time

    # inverse direction: the search starts from the arrival stop
    first_source, last_target = last_target, first_source

    G_succ = G.succ

    # use heapq with (distance, tie-breaker, label) tuples
    push = heappush
    pop = heappop

    empty_columns = ['from', 'from_id', 'to', 'to_id', 'duration',
                     'departure_time', 'walk', 'no_change', 'mean_std_null', 'mean', 'std']
    path_columns = empty_columns + ['transport']

    while True:
        # Every attempt starts from a clean state, so that a retry with a higher
        # confidence really searches again instead of rebuilding the previous path.
        # stores the edges of the best path found so far to each node
        e_paths = {first_source: []}
        # dictionary of final distances to nodes
        dist = {}
        # best distance seen so far for each node that was reached
        seen = {first_source: departure_time}
        c = count()
        fringe = []

        # push the source as the first node on the heap
        push(fringe, (departure_time, next(c), first_source))

        while fringe:
            # take the node to look at:
            (d, _, source) = pop(fringe)

            # already searched this node so go to another
            if source in dist:
                continue

            # the distance popped from the heap is final for this node
            # (the source starts with distance = departure_time)
            dist[source] = d

            # stop if the source is the last_target.
            if source == last_target:
                break

            # Look at all direct descendents from the source node:
            for target, edges in G_succ[source].items():
                # Because it's a multigraph, need to look at all edges between two nodes:
                for edge_id in edges:
                    edge_data = G.get_edge_data(source, target, edge_id)

                    # Get the duration between two nodes (checked before it is used):
                    duration_cost = edge_data['duration']
                    if duration_cost is None:
                        raise ValueError('Edge without a duration.')

                    dep_time_edge = - edge_data['time']

                    # Check if walking edge:
                    # walking edges have a departure time of -1
                    if dep_time_edge == 1:
                        walking_edge = True
                        current_trip_id = None

                        # a walk can be taken immediately: its (negated) departure
                        # time follows from the distance to the current node
                        dep_time_edge = d + duration_cost
                    else:
                        walking_edge = False
                        current_trip_id = edge_data['trip_id']

                    current_dist = dep_time_edge

                    # skip the edges that arrive after the time at which we
                    # have to be at the current node
                    if dep_time_edge - duration_cost < dist[source]:
                        continue

                    # Check if edge is feasible (also according to confidence), but only
                    # if there is at least one edge on the path to the source node
                    # (e.g. this node is not the original source) and the current edge
                    # is a transport edge
                    if len(e_paths[source]) >= 1 and not walking_edge:
                        last_edge_info = e_paths[source][-1][2]
                        mean = edge_data['mean']
                        std = edge_data['std']
                        # the delay is computed for the current edge because we search in reverse
                        last_delay = compute_delay_uncertainty(mean, std, confidence)

                        # If we make a transport -> walk change
                        if last_edge_info['walk']:
                            # add delay to departure time of walk as we will leave later
                            dep_time_edge -= last_delay
                        else:
                            # If we make a transport -> transport change, we need 2 minutes
                            # plus the delay of the transport to change; otherwise the
                            # edge cannot be taken
                            if current_trip_id != last_edge_info['trip_id'] and dep_time_edge - last_delay - 2 - duration_cost < dist[source]:
                                continue

                    # dist only contains final distances, so finding a smaller one
                    # for a finalised node is an error
                    if target in dist:
                        if current_dist < dist[target]:
                            raise ValueError('Contradictory paths found:','negative weights?')

                    # either the node has never been seen before or the current distance
                    # is smaller than the proposed distance in seen[target]:
                    if target not in seen or current_dist < seen[target]:
                        # update the seen distance
                        seen[target] = current_dist
                        # push it onto the heap so that we will look at its descendants later
                        push(fringe, (current_dist, next(c), target))

                        # update the path till target (annotates the edge of the copied graph):
                        edge_data['walk'] = walking_edge
                        edge_data['departure_time'] = dep_time_edge
                        e_paths[target] = e_paths[source] + [(source, target, edge_data)]

        # If there is no path to the last_target:
        if last_target not in e_paths:
            print('Error: No paths to the source')
            return pd.DataFrame(columns=empty_columns)

        nodes_data = G.nodes(data=True)

        # The search ran backwards: flip the path and every edge to get the travel order
        best_path = [(edge[1], edge[0], edge[2]) for edge in e_paths[last_target][::-1]]
        num_edges = len(best_path)

        # Build all the rows first and create the data frame once
        records = []
        last_edge_info = False
        for source, target, edge_info in best_path:
            no_change = ('trip_id' in edge_info                                   # We're in a transport
                         and last_edge_info and 'trip_id' in last_edge_info       # and last edge also
                         and last_edge_info['trip_id'] == edge_info['trip_id'])   # and same trip_id
            # NOTE(review): `and` binds tighter than `or`, so this is True for every edge
            # without a 'std' key (e.g. walking edges); the parentheses only make the
            # existing behaviour explicit.
            mean_std_null = ('trip_id' in edge_info and 'mean' not in edge_info) or 'std' not in edge_info

            # Fall back to the scheduled duration, without uncertainty, when the
            # statistics are missing or null
            mean = edge_info.get('mean')
            std = edge_info.get('std')
            if mean is None or std is None:
                mean = edge_info['duration']
                std = 0

            records.append({'from': nodes_data[source]['name'],
                            'from_id': source,
                            'to': nodes_data[target]['name'],
                            'to_id': target,
                            'duration': edge_info['duration'],
                            'departure_time': minute_to_string(-edge_info['departure_time']),
                            'walk': edge_info['walk'],
                            'no_change': no_change,
                            'mean_std_null': mean_std_null,
                            'mean': mean,
                            'std': std,
                            'transport': edge_info['transport'] if not edge_info['walk'] else 'Walk'})
            last_edge_info = edge_info
        best_path_df = pd.DataFrame(records, columns=path_columns)

        # Validation:
        if confidence != None and not validate_path_(best_path_df, confidence):
            # increase confidence by a confidence step and start again
            confidence += confidence_step
            print(confidence)
            # A confidence above 1 can never be satisfied: give up instead of looping forever
            if confidence > 1.0 + 1e-9:
                print('Error: No path could be validated')
                return pd.DataFrame(columns=empty_columns)
            continue

        # If path validated, print it
        print('Going from {} ({}) to {} ({}) in {:.2f} minutes, departure at {}, arrival at {}'.format(
            nodes_data[last_target]['name'],
            last_target,
            nodes_data[first_source]['name'],
            first_source,
            arrival_time + best_path[0][2]['departure_time'],
            minute_to_string(-best_path[0][2]['departure_time']),
            minute_to_string(-best_path[num_edges-1][2]['departure_time'] +
                             best_path[num_edges-1][2]['duration'])))
        with pd.option_context('display.max_rows', None,
                               'display.max_columns', None,
                               'display.max_colwidth', 15,
                               'display.expand_frame_repr', False):
            print(best_path_df)
        return best_path_df

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### Validation:

##### Feasible paths:
Create a function that looks through a path to see if it is valid. 
So it looks for:
- missed connections
- transfer time of less than 2 minutes between two transports

In [20]:
"""Returns true if there is time to take all edges, and if 
when chaning from a connection to another you have at least 2 minutes. """

def is_path_valid(path):
    """
    Check that every edge of a path can actually be taken.

    @input: path, a table with one row per edge in travel order and the columns 'from',
            'to', 'from_id', 'departure_time' ('HH:MM'), 'duration' (minutes), 'walk'
            and 'no_change'
    @output: True if no transport edge leaves before we arrive at its stop, and if every
             change from one transport to another leaves at least 2 minutes of transfer
             time; False otherwise (the reason is printed). An empty path is valid.
    """
    num_edges = len(path['from_id'])
    if num_edges == 0:
        return True

    # time at which we arrive at the end of the last edge taken
    time = convertToMinute(path['departure_time'][0]) + path['duration'][0]

    for i in range(1, num_edges):
        departure = convertToMinute(path['departure_time'][i])
        is_walk = path['walk'][i]

        # in case an edge taken actually left before we got there (only for transport edges, not for walks)
        if not is_walk and departure < time:
            print('You miss this connection. Time is {} while this edge leaves at {} from {} to {}'\
                  .format(minute_to_string(time), path['departure_time'][i], path['from'][i], path['to'][i]))
            return False

        # in case of a change of type transport -> transport we need 2 minutes of transfer:
        if not is_walk and not path['no_change'][i] and not path['walk'][i-1]:
            if departure < time + 2:
                print('You do not have time to change to this connection between {} to {} leaving at {}. You arrive at {} and need at least 2 min transfer'\
                      .format(path['from'][i],path['to'][i], path['departure_time'][i], minute_to_string(time)))
                return False

        # The edge is taken: always move the clock to its arrival time. Skipping this
        # after a change of transport would compare the next edge to a stale time.
        time = departure + path['duration'][i]
    return True

# test of is path valid:
#assert(is_path_valid(best_path1))
#assert(is_path_valid(best_path2))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

##### Validate a path:
Then, for a given path, we sample delays for the transfers where we go from transport -> walk or from transport -> transport.

For transport 1 -> transport 2: the delay of transport 1 will be added to its trip duration
For transport -> walk: the delay of transport will be added to the departure time of walk 

After modifying these values, we check whether the path is still feasible. We repeat this operation a certain number of times and report the percentage of feasible paths.

For the moment, delays are sampled from a normal distribution and negative samples are replaced by 0 (**is this a good choice?**).

In [21]:
def validate_path_(path, confidence, num_tries=10):
    """
    Estimate by simulation whether a path is feasible with the wanted confidence.

    For every try, a delay is sampled for each transport edge that is followed by a
    transfer (to another transport or to a walk) and the delayed path is checked with
    `is_path_valid`.

    @input:
    - path: data frame of the path, one row per edge in travel order, with a default
      integer index and the columns 'from_id', 'duration', 'departure_time', 'walk',
      'no_change', 'mean' and 'std'
    - confidence: minimal fraction of the tries that have to be feasible
    - num_tries: number of simulated tries
    @output: True if the fraction of feasible tries is at least `confidence`
    @raises: ValueError if `num_tries` is not positive
    """
    if num_tries <= 0:
        raise ValueError('num_tries must be a positive integer')

    num_edges = len(path['from_id'])
    num_valids = 0

    for _ in range(num_tries):
        path_copy = path.copy()
        # delays are fractional minutes: make sure the column can hold them
        path_copy['duration'] = path_copy['duration'].astype(float)

        # every edge from the second one on can be a transfer
        for i in range(1, num_edges):
            # only for transfers from a transport, either to another transport or to a walk:
            if path['no_change'][i] or path['walk'][i-1]:
                continue

            # sample the delay of the transport taken before the transfer:
            mean = path['mean'][i-1]
            std = path['std'][i-1]
            if std != 0:
                delay = np.random.normal(mean, std)
                if delay < 0:
                    delay = 0
            else:
                delay = 0

            # the delayed transport takes longer to arrive
            path_copy.loc[i-1, 'duration'] += delay

            # transfer from transport to walk: the walk can only start once the
            # delayed transport has arrived
            if path['walk'][i]:
                arrival_of_edge_before = path_copy['duration'][i-1] + convertToMinute(path_copy['departure_time'][i-1])
                path_copy.loc[i, 'departure_time'] = minute_to_string(arrival_of_edge_before)

        if is_path_valid(path_copy):
            num_valids += 1

    perc = num_valids/float(num_tries)
    return perc >= confidence

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Create duration dictionnaries if needed

The code below is disabled: we do not have the permission to **overwrite** an existing file, but the result can be written if another path is given.

durations_dicts = {}
edge_and_data_tuple = zip(graph.edges(keys=True),
              map(lambda x: x[2], graph.edges(data=True)))
edge_and_data_tuple = filter(lambda x: 'mean' in x[1] and 'std' in x[1], edge_and_data_tuple)
for c in [0.25, 0.5, 0.75, 0.9, 0.95, 0.98, 0.99]:
    durations_dicts[c] = {e: {'duration': data['mean'] + compute_delay_uncertainty(data['mean'], 
                                                                                            data['std'], 
                                                                                            c)
                                       if data['mean'] != None and data['std'] != None
                                       else data['duration']
                                      } for e, data in edge_and_data_tuple}
    
""" Conversion to json """
durations_dicts_for_json = {}
for c in durations_dicts.keys():
    durations_dicts_for_json[c] = {str(k): v for k, v in durations_dicts[c].items()}

print('Length of json:', len(json.dumps(durations_dicts_for_json))) -> 30106955

""" Save to hdfs """
sc.parallelize([json.dumps(durations_dicts_for_json)]).coalesce(1).saveAsTextFile('/user/{}/durations_for_confidence_.json'.format(username))

## Run Dijkstra reversed algorithm: 

In [22]:
# Test Tao 1: 
"""
Route 1: HB -> Auszelg
20.TA.26-9-A-j19-1.2.H: 8503000:0:41/42 at 12:07:00 ~ 8503310:0:3 at 12:17:00
Walking: 8503310:0:3 ~ 8590620
168.TA.26-12-A-j19-1.2.H: 8590620 at 12:23:00 ~ 8591049 at 12:29:00
"""
best_path1 = dijkstra_reversed(graph, '8503000', arrival_hour*60+arrival_minute, last_target='8591049', confidence=0.99)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Going from Zurich HB (8503000) to Zurich, Auzelg (8591049) in 25.14 minutes, departure at 12:04, arrival at 12:29
             from         from_id              to           to_id  duration departure_time   walk no_change mean_std_null      mean      std transport
0       Zurich HB         8503000       Zurich HB  8503000:0:4...  2.135259          12:04   True     False          True  2.135259        0      Walk
1       Zurich HB  8503000:0:4...  Zurich Hard...     8503020:0:3  2.000000          12:07  False     False         False  0.336499  1.09469    S-Bahn
2  Zurich Hard...     8503020:0:3  Zurich Oerl...     8503006:0:8  5.000000          12:09  False      True         False  5.000000        0    S-Bahn
3  Zurich Oerl...     8503006:0:8      Glattbrugg     8503310:0:3  2.000000          12:15  False      True         False  2.000000        0    S-Bahn
4      Glattbrugg     8503310:0:3  Glattbrugg,...         8590620  3.063448          12:19   True     False          True  3.063448

In [23]:
# Test 2:
"""
Route 2:
Zurich, Triemli (8503610) to Zurich Altstetten, Bahnhof N (8591057)
"""
# From Zurich, Triemli to Zurich Altstetten, Bahnhof N:
# NOTE(review): this overwrites `best_path1` of the previous test; the disabled asserts
# of the validation section refer to a `best_path2`, which may be the intended name.
best_path1 = dijkstra_reversed(graph, '8503610', arrival_hour*60+arrival_minute, last_target='8591057', confidence=0.98)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Going from Zurich, Triemli (8503610) to Zurich Altstetten, Bahnhof N (8591057) in 22.00 minutes, departure at 12:08, arrival at 12:30
             from  from_id              to    to_id  duration departure_time   walk no_change mean_std_null      mean       std transport
0  Zurich, Tri...  8503610  Zurich, In ...  8591214  2.000000          12:08  False     False         False  1.470458  2.517784       Bus
1  Zurich, In ...  8591214  Zurich, Gol...  8591163  1.000000          12:10  False      True         False  1.338457  1.940935       Bus
2  Zurich, Gol...  8591163  Zurich, Alb...  8591036  1.000000          12:11  False      True         False  1.284314  2.041064       Bus
3  Zurich, Alb...  8591036  Zurich, Alb...  8591037  1.000000          12:12  False      True         False  1.276941  2.026404       Bus
4  Zurich, Alb...  8591037  Zurich, Unt...  8591408  1.000000          12:13  False      True         False  1.053628  1.640792       Bus
5  Zurich, Unt...  8591408  Zurich, Ra

#### Algorithm improvement for the arrival time:

In [24]:
# Values subtracted, one after the other, from the latest possible departure time
# in `depart_time` (presumably minutes, like the other times - TODO confirm).
decrements = [10, 5, 2, 1, 0]
# Stop ids of the start and of the end of the journey
source = '8503000'
target = '8591122'
# Week day (4 = friday in `days_dict`) and wanted arrival time; same values as in the
# "Choose time of arrival" section.
day_id, arrival_hour, arrival_minute = 4, 12, 30

# Arrival time in minutes since midnight
arrival_time = arrival_hour*60+arrival_minute

def depart_time(arrival_time, source, target, decrements):
    """
    Search for a late departure time that still arrives at `target` on time.

    A first search leaves MAX_TRIP_DURATION hours before `arrival_time`. Then, for each
    value of `decrements`, the search is run again with a departure time of
    `arrival_time - duration of the last path found - decrement`, as long as the path
    found still arrives on time.

    @input:
    - arrival_time: wanted arrival time, in minutes since midnight
    - source, target: ids of the departure and arrival stops
    - decrements: margins tried one after the other
    @output: the last departure time tried whose path arrives on time
    @raises: ValueError if even the first path arrives after `arrival_time`
    """
    def evaluate(dep):
        # Run the forward search leaving at `dep`; return the total duration of the
        # path found and how long before `arrival_time` it arrives.
        path = dijkstra_with_time(graph, source, arrival_time,
                                  last_target=target, confidence=0.98, departure_time=dep)
        last = len(path['total_duration']) - 1
        arrival = convertToMinute(path['departure_time'][last]) + path['duration'][last]
        return path['total_duration'][last], arrival_time - arrival

    # start with two hours before:
    dep = arrival_time - MAX_TRIP_DURATION*60
    duration, arrival_diff = evaluate(dep)
    if arrival_diff < 0:
        raise ValueError('Arrives after wanted arrival time')

    best_departure = dep
    for i in decrements:
        dep = arrival_time - duration - i
        duration, arrival_diff = evaluate(dep)

        # leaving at `dep` is too late: keep the previous departure time
        if arrival_diff < 0:
            return best_departure

        best_departure = dep

        print('Start with {} minutes difference'.format(arrival_diff))
        print('Trip takes {} minutes'.format(duration))

    return best_departure


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Visualisation

In [25]:
%%local
import ipywidgets as widgets
import fuzzy_pandas as fpd
import time

def search_station(station):
    """Return the stop names that fuzzily match the typed text `station`.

    The text is wrapped in a one-row DataFrame and fuzzy-merged against the
    'stop_name' column of `stations_name` (Jaro method, threshold 0.8,
    ignoring case, non-alphabetic and non-latin characters and word order).
    Only the matching rows are kept; their stop names are returned as a list.
    """
    query = pd.DataFrame([station], columns=['station'])
    normalisation = dict(ignore_case=True, ignore_nonalpha=True,
                         ignore_nonlatin=True, ignore_order_words=True)
    matched = fpd.fuzzy_merge(query, stations_name,
                              left_on='station', right_on='stop_name',
                              keep='match', threshold=0.8, method='jaro',
                              **normalisation)
    return matched['stop_name'].to_list()

def search_station_departure(sender):
    """Observer: refresh the departure proposals from the departure search box.

    `sender` (the change notification) is not used; the current text of
    `depart_station` is searched and the matches become the options of
    `depart_proposals`.
    """
    depart_proposals.options = search_station(depart_station.value)
    
def search_station_arrival(sender):
    """Observer: refresh the arrival proposals from the arrival search box.

    `sender` (the change notification) is not used; the current text of
    `arrive_station` is searched and the matches become the options of
    `arrive_proposals`.
    """
    arrive_proposals.options = search_station(arrive_station.value)
    
# Sentinel text shown in the "selected station" labels while nothing is chosen;
# find_route_button compares against it to detect a missing selection.
no_station_selected = "None selected"

def select_station_departure(sender):
    """Observer: mirror the chosen departure proposal into its label.

    Only change notifications for the 'label' trait are handled. A new value
    of None (nothing selected) resets the label to `no_station_selected`.

    Args:
        sender: change dict with at least the keys 'name' and 'new'.
    """
    if sender['name'] != 'label':
        return
    chosen = sender['new']
    selected_depart_station.value = no_station_selected if chosen is None else chosen
            
def select_station_arrival(sender):
    """Observer: mirror the chosen arrival proposal into its label.

    Only change notifications for the 'label' trait are handled. A new value
    of None (nothing selected) resets the label to `no_station_selected`.

    Args:
        sender: change dict with at least the keys 'name' and 'new'.
    """
    if sender['name'] != 'label':
        return
    chosen = sender['new']
    selected_arrival_station.value = no_station_selected if chosen is None else chosen
            
def _parse_arrival_time(text):
    """Parse the 'HH:MM' arrival text into an (hour, minute) pair.

    Raises:
        ValueError: with the user-facing message when the text is empty,
            is not two integers separated by ':', or is out of range
            (hour in [8,20], minute in [0,59]).
    """
    if text is None or text == "":
        raise ValueError("No hour selected")

    parts = text.split(':')
    try:
        if len(parts) != 2:
            raise ValueError
        hour = int(parts[0])
        minute = int(parts[1])
    except ValueError:
        raise ValueError("Invalid hour format, use HH:MM") from None

    if hour not in range(8, 21):
        raise ValueError("Invalid hour, valid range: [8,20]")

    if minute not in range(0, 60):
        raise ValueError("Invalid minute, valid range: [0,59]")

    return hour, minute


def find_route_button(button):
    """Click handler of the 'Find route' button.

    Validates the form (both stations selected, a week-day date, a valid
    arrival time), reports the first problem through report_error and stops,
    or else sends the inputs to the Spark session, runs get_result_path
    there, fetches `result` back and displays it with display_path.

    Args:
        button: the clicked widget (unused).
    """
    # Parse arguments
    depart_station_str = selected_depart_station.value
    if depart_station_str == no_station_selected:
        report_error("No departure station selected")
        return

    arrive_station_str = selected_arrival_station.value
    if arrive_station_str == no_station_selected:
        report_error("No arrival station selected")
        return

    date = date_picker.value
    if date is None:
        report_error("No date selected")
        return

    # weekday() is 0 for Monday, so 5 and 6 are Saturday and Sunday.
    if date.weekday() > 4:
        report_error("Date is a weekend day, please select a week day")
        return

    try:
        hour, minute = _parse_arrival_time(hour_picker.value)
    except ValueError as problem:
        report_error(str(problem))
        return

    confidence = confidence_picker.value

    report_error(None)

    # Show progress bar
    results.children = []
    progress_bar.layout = widgets.Layout(display='block')

    # Everything is converted to str because 'send_to_spark' is used with '-t str'.
    spark_inputs = {
        'depart_station_str': depart_station_str,
        'arrive_station_str': arrive_station_str,
        'date_str': date.strftime("%Y-%m-%d"),
        'hour_str': str(hour),
        'minute_str': str(minute),
        'confidence_str': str(confidence),
    }

    # Code executed in the Spark session once the inputs have been sent.
    remote_call = """
    result = get_result_path(depart_station_str, arrive_station_str, date_str, hour_str, minute_str, confidence_str)
    """

    shell = get_ipython()
    try:
        # Send variables to spark: publish them in the local namespace first,
        # then copy each one over by name.
        shell.push(spark_inputs)
        for name in spark_inputs:
            shell.run_cell_magic('send_to_spark', ' -i {0} -t str -n {0}'.format(name), ' ')

        # Run algorithm
        shell.run_cell_magic('spark', '', remote_call)

        # Retrieve results from spark (lands in the local variable `result`)
        shell.run_cell_magic('spark', '-o result', ' ')
    finally:
        # Never leave the "Finding best route..." banner up, even on failure.
        progress_bar.layout = widgets.Layout(display='none')

    # Display path
    display_path(result)
    
def display_path(path):
    """Render every row of `path` as one HTML line in the `results` box.

    Each row must provide 'from', 'to', 'duration', 'departure_time' (an
    object with strftime) and 'walk'. A line shows the departure time as
    HH:MM, both stop names, the duration with one decimal and the word
    "Walking" when the 'walk' flag is truthy.

    Args:
        path: object exposing iterrows() (e.g. a pandas DataFrame).
    """
    line_template = ("<p style='font-size: 20px; padding-bottom:10px;'>"
                     "{when}: {origin} <b>&rarr;</b> {dest}, {minutes:.1f} minutes {mode}</p>")

    def render_leg(leg):
        # Build the widget for a single edge of the route.
        origin = leg['from']
        dest = leg['to']
        minutes = leg['duration']
        when = leg['departure_time'].strftime("%H:%M")
        mode = "Walking " if leg['walk'] else ""
        return widgets.HTML(value=line_template.format(
            when=when, origin=origin, dest=dest, minutes=minutes, mode=mode))

    results.children = [render_leg(leg) for _, leg in path.iterrows()]
    
def report_error(error_message):
    """Show `error_message` in the `error` widget, or clear it.

    Args:
        error_message: text to display in bold red, or None to clear the
            previously shown error.
    """
    if error_message is None:
        error.value = ""
        return
    error.value = "<b style='color:red;'>Error: " + error_message + "</b>"

In [26]:
def _stop_id_for_name(stop_name):
    """Return the id prefix of the first nodes_df row called `stop_name`.

    The first column of the matching row is cut to its first seven
    characters (presumably the station-level stop id -- TODO confirm).

    Raises:
        ValueError: if no row of nodes_df has that stop_name.
    """
    rows = nodes_df.where(col('stop_name') == stop_name).take(1)
    if not rows:
        raise ValueError("Unknown station: {}".format(stop_name))
    return rows[0][0][:7]


def get_result_path(start_str, end_str, date_str, hour_str, minute_str, confidence_str):
    """Compute a route for the widget UI and return it as a Spark DataFrame.

    All arguments are strings because they are sent from the local kernel
    with 'send_to_spark -t str'.

    Args:
        start_str: stop name of the departure station.
        end_str: stop name of the arrival station.
        date_str: travel date formatted as YYYY-MM-DD.
        hour_str: arrival hour.
        minute_str: arrival minute.
        confidence_str: confidence as an integer percentage.

    Returns:
        spark.createDataFrame applied to the path found by dijkstra_reversed.

    Raises:
        ValueError: if one of the station names is not found in nodes_df.
    """
    import unicodedata

    # tm_wday is 0 for Monday.
    day_id = time.strptime(date_str, "%Y-%m-%d").tm_wday

    arrival_hour = int(hour_str)
    arrival_minute = int(minute_str)
    arrival_time = arrival_hour*60+arrival_minute

    confidence = int(confidence_str) / 100.0

    source = _stop_id_for_name(start_str)
    target = _stop_id_for_name(end_str)

    # Build the graph for this day and arrival time.
    edges = create_edges_for_trip(edges_df, day_id, arrival_time)
    graph = nx.MultiDiGraph()
    graph.add_nodes_from(nodes)
    graph.add_edges_from(edges)

    old_number_of_nodes = graph.number_of_nodes()
    # Remove unreachable nodes
    # NOTE(review): reachability is always computed from the hard-coded node
    # '8503000', not from `source` -- confirm this is intended.
    dists, paths = normal_dijkstra(graph, '8503000')
    not_reachable = set(graph.nodes) - set(dists.keys())
    graph.remove_nodes_from(list(not_reachable))
    print('{} nodes removed'.format(old_number_of_nodes - graph.number_of_nodes()))

    # Temp for problem of name's encoding: reduce every node name to ASCII.
    nodes_data = graph.nodes(data=True)
    for n in graph.nodes:
        nodes_data[n]['name'] = unicodedata.normalize('NFKD', nodes_data[n]['name']).encode('ascii','ignore')

    print("Ran on cluster !")
    print("From {} to {} on {} at {}:{} for confidence {}".format(start_str, end_str, date_str, hour_str, minute_str, confidence_str))

    path = dijkstra_reversed(graph, source, arrival_time, last_target=target, confidence=confidence)
    return spark.createDataFrame(path)

# Distinct stop names of nodes_df; exported to the local kernel by the
# following `%%spark -o stations_name` cell and used by search_station.
stations_name = nodes_df.select('stop_name').distinct()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [27]:
%%spark -o stations_name

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
%%local
# Style dict shared by the widgets of this form.
style = {'description_width': 'initial'}

# Search station
# Free-text boxes; every change of the typed text refreshes the matching
# proposals list. `names='value'` keeps the (costly) fuzzy search from also
# running on changes of unrelated traits.
depart_station = widgets.Text(description = 'Search departure station',
                              layout=widgets.Layout(width='40%'),
                              style=style)
depart_station.observe(search_station_departure, names='value')
arrive_station = widgets.Text(description = 'Search arrival station',
                              layout=widgets.Layout(width='40%'),
                              style=style)
arrive_station.observe(search_station_arrival, names='value')


# Proposals: one selection list per search box, filled by the search observers.
def _station_select():
    """Build an empty 'Found stations' selection list."""
    return widgets.Select(description='Found stations',
                          layout=widgets.Layout(width='40%', height='200px'),
                          style=style)

depart_proposals = _station_select()
depart_proposals.observe(select_station_departure)
arrive_proposals = _station_select()
arrive_proposals.observe(select_station_arrival)


# Stations: labels holding the currently selected departure/arrival stop,
# each shown next to a fixed caption.
def _selected_station_row(caption, value_label):
    """Pair a caption label with the label that holds the selected station."""
    return widgets.HBox([widgets.Label(value=caption, style=style), value_label],
                        layout=widgets.Layout(width='40%'))

selected_depart_station = widgets.Label(value=no_station_selected, style=style)
selected_box_depart_station = _selected_station_row("Selected depart station: ",
                                                    selected_depart_station)
selected_arrival_station = widgets.Label(value=no_station_selected, style=style)
selected_box_arrival_station = _selected_station_row("Selected arrival station: ",
                                                     selected_arrival_station)



# Options: travel date, wanted arrival time, confidence and the search button.
date_picker = widgets.DatePicker(description='Pick a Date',
                                 disabled=False,
                                 layout=widgets.Layout(width='20%'))

# Arrival time is typed as text and validated by find_route_button.
hour_picker = widgets.Text(description='Arrival time',
                           placeholder='HH:MM',
                           layout=widgets.Layout(width='20%'),
                           style=style)

# Confidence is an integer percentage between 0 and 99.
confidence_options = dict(value=90, min=0, max=99, step=1,
                          description='Confidence:',
                          disabled=False,
                          continuous_update=False,
                          orientation='horizontal',
                          readout=True,
                          readout_format='d',
                          layout=widgets.Layout(width='25%'),
                          style=style)
confidence_picker = widgets.IntSlider(**confidence_options)

button_options = dict(description='Find route',
                      disabled=False,
                      button_style='',  # 'success', 'info', 'warning', 'danger' or ''
                      tooltip='Find route',
                      icon='check',  # (FontAwesome names without the `fa-` prefix)
                      layout=widgets.Layout(width='15%'))
search_button = widgets.Button(**button_options)
search_button.on_click(find_route_button)


# Error message area, written by report_error.
error = widgets.HTML(value="")

# Fixed-height spacer between the form and the results.
padding = widgets.HTML(value="", layout=widgets.Layout(height='50px'))

# Progress message, hidden until a search is running.
progress_bar = widgets.HTML(value="Finding best route...",
                            layout=widgets.Layout(display='none'))

# Result lines, filled by display_path.
results = widgets.VBox(children=[])

# Assemble the form row by row.
stations = widgets.HBox(children=[depart_station, arrive_station])
proposals = widgets.HBox(children=[depart_proposals, arrive_proposals])
selected_stations = widgets.HBox(children=[selected_box_depart_station,
                                           selected_box_arrival_station])
options = widgets.HBox(children=[date_picker, hour_picker, confidence_picker, search_button])

form_rows = [stations, proposals, selected_stations, options,
             error, padding, progress_bar, results]
layout = widgets.VBox(children=form_rows)

# Last expression of the cell: renders the whole form.
layout

VBox(children=(HBox(children=(Text(value='', description='Search departure station', layout=Layout(width='40%'…