In [1]:
%%configure
{"conf": {
    "spark.app.name": "dslab-group_final"
}}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6532,application_1589299642358_1021,pyspark,busy,Link,Link,
6545,application_1589299642358_1034,pyspark,idle,Link,Link,
6555,application_1589299642358_1044,pyspark,idle,Link,Link,
6560,application_1589299642358_1049,pyspark,idle,Link,Link,
6561,application_1589299642358_1050,pyspark,idle,Link,Link,
6562,application_1589299642358_1051,pyspark,busy,Link,Link,
6563,application_1589299642358_1052,pyspark,idle,Link,Link,
6564,application_1589299642358_1053,pyspark,idle,Link,Link,
6567,application_1589299642358_1056,pyspark,idle,Link,Link,
6568,application_1589299642358_1057,pyspark,idle,Link,Link,


## Imports and helper functions

In [34]:
import networkx as nx
from heapq import heappush, heappop
from itertools import count
from pyspark.sql.functions import col

# Weekday id -> name of the matching column in the `calendar` table
# (0 is monday, 4 is friday; weekend days are not listed).
days_dict = dict(enumerate(['monday', 'tuesday', 'wednesday', 'thursday', 'friday']))


def day_trips(*day_ids):
    """
    day_trips: gives the trip_ids of the services selected for the given days

    input: a variable number of day ids (keys of `days_dict`)
    output: spark dataframe with a single `trip_id` column

    The day names are joined with " and " and handed to `calendar.where`,
    so a service is kept only if the condition holds for every requested day.
    Relies on the module-level `calendar` and `trips` dataframes.
    """
    condition = " and ".join(days_dict[d] for d in day_ids)

    services_on_days = calendar.where(condition).select('service_id')
    trips_on_days = services_on_days.join(trips, on='service_id')
    return trips_on_days.select('trip_id')

def minute_to_string(m):
    """
    Format a number of minutes as a zero-padded 'HH:MM' string.

    Fractional minutes are truncated (not rounded), and the hour part is
    not wrapped at 24.
    """
    whole_hours = m // 60
    leftover_minutes = m - 60 * whole_hours
    return '{:02}:{:02}'.format(int(whole_hours), int(leftover_minutes))

def string_to_minute(s):
    """
    Convert a time string to a number of minutes.

    input: s, a string formatted as 'HH:MM:SS' or 'HH:MM'
    output: hours * 60 + minutes as an int (the seconds part, if any, is ignored)
    raises: ValueError if the string does not have two or three ':'-separated
            fields, or if the hour/minute fields are not integers
    """
    fields = s.split(':')
    if len(fields) not in (2, 3):
        raise ValueError("expected 'HH:MM' or 'HH:MM:SS', got {!r}".format(s))

    return int(fields[0]) * 60 + int(fields[1])

def get_time(graph, source, target, j):
    """Return the 'time' attribute stored on edge (source, target, j) of `graph`."""
    edge_key = (source, target, j)
    return graph.edges[edge_key]['time']

def get_weight(graph, source, target, j):
    """Return the 'duration' attribute stored on edge (source, target, j) of `graph`."""
    edge_key = (source, target, j)
    return graph.edges[edge_key]['duration']

def normal_dijkstra(G, first_source, paths=None, cutoff=None, last_target=None):
    """
    Dijkstra shortest-path search on a multigraph, using the 'duration'
    attribute of each edge (read through `get_weight`) as its cost.

    input:
    - G: graph exposing `is_directed()`, `succ`/`adj` and `edges`
    - first_source: node the search starts from (distance 0)
    - paths: accepted for interface compatibility and ignored; the path
      dictionary is always rebuilt from scratch
    - cutoff: accepted for interface compatibility, currently unused
    - last_target: optional node at which the search stops early
    output: tuple (dist, paths) where dist maps every settled node to its
            final distance and paths maps every reached node to the list of
            nodes leading to it from first_source
    raises: ValueError if a shorter path to an already settled node is found
            (which indicates negative edge costs)
    """
    G_succ = G.succ if G.is_directed() else G.adj
    paths = {first_source: [first_source]}

    dist = {}                  # final distances of settled nodes
    seen = {first_source: 0}   # best tentative distance found so far per node

    # The counter breaks ties in the heap so that nodes are never compared.
    c = count()
    fringe = []                # heap of (distance, tie-breaker, node)
    heappush(fringe, (0, next(c), first_source))

    while fringe:
        d, _, source = heappop(fringe)

        if source in dist:
            continue  # stale heap entry: this node is already settled

        dist[source] = d

        if source == last_target:
            break

        for target, edges in G_succ[source].items():
            # Multigraph: several parallel edges may link the same two nodes.
            for edge_id in edges:
                cost = get_weight(G, source, target, edge_id)
                if cost is None:
                    continue

                current_dist = d + cost

                if target in dist:
                    # dist only holds final values, so improving one is a contradiction
                    if current_dist < dist[target]:
                        raise ValueError('Contradictory paths found:',
                                         'negative weights?')
                elif target not in seen or current_dist < seen[target]:
                    seen[target] = current_dist
                    heappush(fringe, (current_dist, next(c), target))
                    paths[target] = paths[source] + [target]

    return (dist, paths)

def dijkstra_with_time(G, first_source, INPUT_TIME, paths=None, last_target=None):
    """
    Time-dependent Dijkstra: earliest arrival at `last_target` when starting
    from `first_source` at time `INPUT_TIME`.

    Each edge carries a 'time' (departure time, read through `get_time`) and a
    'duration' (read through `get_weight`). An edge whose 'time' is -1 has no
    fixed departure and is taken as soon as its source node is reached. Any
    other edge is usable only if it departs at or after the arrival time at
    its source node.

    input:
    - G: graph exposing `is_directed()`, `succ`/`adj`, `edges` and
      `nodes(data=True)` with a 'name' attribute per node
    - first_source: starting node
    - INPUT_TIME: time at which the search starts, in the same unit as the
      edge 'time' values (it is formatted with `minute_to_string`)
    - paths: accepted for interface compatibility and ignored
    - last_target: destination node
    output: tuple (arrival_time, edges) where edges is a list of
            (source, target, {'departure_time': ..., 'duration': ...});
            (0, []) when the target cannot be reached (or is None)
    raises: ValueError if a shorter path to an already settled node is found
            (which indicates negative durations)
    side effect: prints the itinerary, or an error line when there is none
    """
    G_succ = G.succ if G.is_directed() else G.adj

    e_paths = {first_source: []}        # list of edges leading to each reached node
    dist = {}                           # final arrival time of settled nodes
    seen = {first_source: INPUT_TIME}   # best tentative arrival time per node

    # The counter breaks ties in the heap so that nodes are never compared.
    c = count()
    fringe = []                         # heap of (arrival time, tie-breaker, node)
    heappush(fringe, (INPUT_TIME, next(c), first_source))

    while fringe:
        d, _, source = heappop(fringe)

        if source in dist:
            continue  # stale heap entry: this node is already settled

        dist[source] = d

        if source == last_target:
            break

        for target, edges in G_succ[source].items():
            # Multigraph: several parallel edges may link the same two nodes.
            for edge_id in edges:
                dep_time_edge = get_time(G, source, target, edge_id)
                if dep_time_edge == -1:
                    # no fixed departure: leave as soon as we are at the source
                    dep_time_edge = d

                # TODO: when the line changes, require an extra 2 minutes
                # Skip edges that leave before we reach their source node.
                if not dep_time_edge >= d:
                    continue

                duration_cost = get_weight(G, source, target, edge_id)
                if duration_cost is None:
                    continue

                current_dist = dep_time_edge + duration_cost

                if target in dist:
                    # dist only holds final values, so improving one is a contradiction
                    if current_dist < dist[target]:
                        raise ValueError('Contradictory paths found:',
                                         'negative weights?')
                elif target not in seen or current_dist < seen[target]:
                    seen[target] = current_dist
                    heappush(fringe, (current_dist, next(c), target))
                    e_paths[target] = e_paths[source] + [
                        (source, target, {'departure_time': dep_time_edge,
                                          'duration': duration_cost})]

    if last_target not in e_paths:
        print('Error: No path to the target')
        return (0, [])

    # TODO: validate the itinerary by sampling delays for each edge (e.g. 100
    # gaussian draws), counting missed connections, and restarting with a
    # smaller threshold when the path is not reliable enough.

    arrival_string = minute_to_string(dist[last_target])
    best_path = e_paths[last_target]   # empty when first_source == last_target

    # Node names come from the graph passed in, not from a notebook global.
    nodes_data = G.nodes(data=True)
    print('Going from {} ({}) to {} ({}), arrival at {}'.format(nodes_data[first_source]['name'],
                                                               first_source,
                                                               nodes_data[last_target]['name'],
                                                               last_target, arrival_string))
    for s, t, info in best_path:
        print('\t{} ({}) -> {} ({}), {:.2f}\' departure at {}'.format(nodes_data[s]['name'], s,
                                                                      nodes_data[t]['name'], t,
                                                                      info['duration'],
                                                                      minute_to_string(info['departure_time'])))

    return (dist[last_target], best_path)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
%%local
import os
# Name of the current JupyterHub user, read in the local kernel from the
# JUPYTERHUB_USER environment variable (a KeyError is raised if it is not set).
username = os.environ['JUPYTERHUB_USER']

In [4]:
%%send_to_spark -i username -t str -n username

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'username' as 'username' to Spark kernel

## Choose time of arrival

In [36]:
# Requested day and time as three integers: day_id (a key of days_dict,
# 0 = monday ... 4 = friday), hour and minute. The values below mean friday 12:05.
day_id, hour, minute = 4, 12, 5

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Create graph

In [6]:
# Per-user node and edge tables, read from the user's directory under /user.
edges_df = spark.read.orc("/user/{}/edges.orc".format(username))
nodes_df = spark.read.orc("/user/{}/nodes.orc".format(username))

# Timetable tables used by day_trips() to map a weekday to trip ids.
calendar = (spark.read
            .format('orc')
            .load('/data/sbb/timetables/orc/calendar/000000_0'))
trips = (spark.read
         .format('orc')
         .load('/data/sbb/timetables/orc/trips/000000_0'))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Collect every row of nodes_df on the driver as a (node id, attributes) pair;
# the first column of the row is used as the node id.
nodes = (
    nodes_df.rdd
    .map(lambda row: (row[0], {'name': row['stop_name'],
                               'lat': row['stop_lat'],
                               'lon': row['stop_lon']}))
    .collect()
)

# TODO: filter the nodes so that only reachable ones are kept

# Directed multigraph: several edges may link the same pair of stops.
graph = nx.MultiDiGraph()
graph.add_nodes_from(nodes)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Temporary workaround for an encoding problem: replace u-umlaut (U+00FC)
# with a plain "u" in every node name. Other characters are left untouched.
nodes_data = graph.nodes(data=True)
for n, attrs in nodes_data:
    # attrs is the node's own attribute dict, so the change is made in place
    attrs['name'] = attrs['name'].replace(u'\xfc', 'u')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
%%local
import pandas as pd
# Load the walking-times table from a pickle file in the working directory
# of the local kernel.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
walking_times = pd.read_pickle('pickle_walking_times')

In [18]:
%%send_to_spark -i walking_times -t df -m 20000

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'walking_times' as 'walking_times' to Spark kernel

In [19]:
# Bring the walking-times table to the driver as a pandas DataFrame.
edges_walking = walking_times.toPandas()
# Build the attribute dict of each walking edge: 'time' is set to -1 (the
# value dijkstra_with_time treats as "no fixed departure time") and the
# duration is increased by 2.
# NOTE(review): the +2 is presumably a fixed transfer margin in minutes - confirm.
edges_walking['attrs'] = edges_walking.apply(lambda x: {'time': -1, 'duration': x['duration']+2}, axis=1)
# Keep one [source, target, attrs] row per edge, as a plain Python list so it
# can be concatenated with the timetable edges.
edges_walking = list(edges_walking[['source', 'target', 'attrs']].to_numpy())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
MAX_TRIP_DURATION = 2  # width of the departure window, in hours


def create_edges_for_trip(edges_df, day_id, hour, minute):
    """
    create_edges_for_trip: constructs the edges (and thus trips) that exist in a
    window of MAX_TRIP_DURATION hours before a given input time
    @input:
    - edges_df: df from which we construct the edges
    - day_id: id of week-day (e.g. wednesday is day id 2, see days_dict)
    - hour, minute: time at which we want to arrive somewhere (e.g. 11:30)
    @output: list of (source, target, attributes) edges, made of the selected
             timetable edges followed by the module-level `edges_walking`
    """
    # select only the trips that occur on that day:
    day_edges_df = edges_df.join(day_trips(day_id), on='trip_id')

    # All times are in minutes, so the window in hours is converted with a
    # single factor of 60.
    arrival_minute = hour * 60 + minute
    min_dep_time = arrival_minute - 60 * MAX_TRIP_DURATION

    # keep only edges that leave inside the window and arrive in time:
    window_edges_df = day_edges_df.filter((col('departure_time') > min_dep_time) &
                                          (col('arrival_time') <= arrival_minute))

    edges = window_edges_df.rdd.map(
        lambda r: (r['stop_id'], r['next_stop'], {'duration': r['trip_duration'],
                                                  'time': float(r['departure_time'])})
    ).collect()

    return edges + edges_walking

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
# Build the edge list for the chosen day and time: the filtered timetable
# edges followed by the walking edges.
edges = create_edges_for_trip(edges_df, day_id, hour, minute)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
# Add every edge to the graph; the return value is assigned to `_` to keep it
# out of the cell output.
# NOTE(review): the graph is a MultiDiGraph, so re-running this cell presumably
# adds the same edges a second time as parallel edges - confirm.
_ = graph.add_edges_from(edges)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Run algorithm

In [37]:
# Search a route between stops 8503000 and 8591049, starting the search at
# hour:minute.
# NOTE(review): create_edges_for_trip keeps only timetable edges that arrive
# no later than hour:minute, while the search below only takes edges that
# depart at or after that same time - confirm this is intended.
time, path = dijkstra_with_time(G=graph, first_source='8503000', last_target='8591049', INPUT_TIME=hour*60+minute)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Going from Zurich HB (8503000) to Zurich, Auzelg (8591049), arrival at planned_arrival
	Zurich HB (8503000) -> Zurich, Stampfenbachplatz (8591379), 7.59' departure at 12:05
	Zurich, Stampfenbachplatz (8591379) -> Zurich, Beckenhof (8591071), 11.84' departure at 12:12
	Zurich, Beckenhof (8591071) -> Zurich, Kronenstrasse (8591237), 10.84' departure at 12:24
	Zurich, Kronenstrasse (8591237) -> Zurich, Schaffhauserplatz (8591335), 10.19' departure at 12:35
	Zurich, Schaffhauserplatz (8591335) -> Zurich, Laubiweg (8591246), 10.28' departure at 12:45
	Zurich, Laubiweg (8591246) -> Zurich, Bucheggplatz (8591101), 10.90' departure at 12:55
	Zurich, Bucheggplatz (8591101) -> Zurich, Radiostudio (8591307), 10.44' departure at 13:06
	Zurich, Radiostudio (8591307) -> Zurich, Bad Allenmoos (8591053), 11.55' departure at 13:17
	Zurich, Bad Allenmoos (8591053) -> Zurich, Regensbergbrucke (8591314), 8.89' departure at 13:28
	Zurich, Regensbergbrucke (8591314) -> Zurich Oerlikon (8503006:0:6), 11.99' 