In [1]:
%%configure
{"conf": {
    "spark.app.name": "dslab-group_final"
}}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6849,application_1589299642358_1346,pyspark,idle,Link,Link,
6852,application_1589299642358_1349,pyspark,idle,Link,Link,
6858,application_1589299642358_1352,pyspark,idle,Link,Link,
6861,application_1589299642358_1355,pyspark,idle,Link,Link,
6864,application_1589299642358_1358,pyspark,busy,Link,Link,
6865,application_1589299642358_1359,pyspark,idle,Link,Link,


## Imports and helper functions

In [2]:
import networkx as nx
from heapq import heappush, heappop
from itertools import count
from pyspark.sql.functions import col

MAX_TRIP_DURATION = 2 #duration in hour 

days_dict = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday'}
def day_trips(*day_ids):
    """
    day_trips: gives the trip_ids that operate on certain days
    input: a variable number of day ids
    output:s spark dataframe with trip_ids
    
    """
    days = [days_dict[day_id] for day_id in day_ids]
    where_clause = " and ".join(days)

    day_services = calendar.where(where_clause).select('service_id')
    return day_services.join(trips, on='service_id').select('trip_id')

def minute_to_string(m):
    hour, minute = m // 60, m - 60*(m//60)
    time_string = '{:02}:{:02}'.format(int(hour), int(minute))
    
    return time_string

def string_to_minute(s):
    h, m, _ = s.split(':')
    h,m = int(h), int(m)
    
    return h*60+m

def get_time(graph, source, target, j):
    attr = graph.edges[(source, target, j)]
    return attr['time']

def get_weight(graph, source, target, j):
    attr = graph.edges[(source, target, j)]
    return attr['duration']

def normal_dijkstra(G, first_source, paths=None, cutoff=None, last_target=None):
    
    G_succ = G.succ if G.is_directed() else G.adj
    paths = {first_source: [first_source]}

    push = heappush
    pop = heappop
    dist = {}  # dictionary of final distances
    
    # dictionnary of wthether it's the first time a node is visited
    seen = {first_source: 0}

    c = count()
    fringe = []  # use heapq with (distance,label) tuples
    push(fringe, (0, next(c), first_source))
    
    while fringe:
        #take the node to look at: 
        (d, _, source) = pop(fringe)
        
        # check if node has already been looked at: 
        if source in dist:
            continue  # already searched this node.
        
        # update the distance of the node
        dist[source] = d
        
        #stop if the node we look at is the target obviously
        if source == last_target:
            break
            
        # Look at all direct descendents from the source node: 
        for target, edges in G_succ[source].items():
            # Because it's a multigraph, need to look at all edges between two nodes:
            for edge_id in edges:
                
                # Get the duration between two nodes:
                cost = graph.get_edge_data(source, target, edge_id)['duration']
                
                if cost is None:
                        continue
                
                # Add the weight to the current distance of a node
                current_dist = dist[source] + cost
                
                # if target has already been visited once and has a final distance:
                if target in dist:
                        # if we find a distance smaller than the actual distance in dic
                        # raise error because dic distances contains only final distances
                        if current_dist < dist[target]:
                            raise ValueError('Contradictory paths found:',
                                             'negative weights?')
                # either node node been seen before or the current distance is smaller than the 
                # proposed distance in seen[target]:
                elif target not in seen or current_dist < seen[target]:
                    # update the seen distance
                    seen[target] = current_dist
                    # push it onto the heap so that we will look at its descendants later
                    push(fringe, (current_dist, next(c), target))
                    
                    # update the paths till target:
                    if paths is not None:
                        paths[target] = paths[source] + [target]
    if paths is not None:
        return (dist, paths)
    return dist

def dijkstra_with_time(G, first_source, arrival_time, paths=None, last_target=None):
    departure_time = arrival_time - MAX_TRIP_DURATION*60
    
    G_succ = G.succ if G.is_directed() else G.adj
    
    paths = {first_source: [first_source]}
    e_paths = {first_source: []}

    push = heappush
    pop = heappop
    dist = {}  # dictionary of final distances
    
    # dictionnary of wthether it's the first time a node is visited
    seen = {first_source: departure_time}
    
    
    c = count()
    fringe = []  # use heapq with (distance,label) tuples
    
    #push(fringe, (0, next(c), first_source))
    push(fringe, (departure_time, next(c), first_source))
    
    while fringe:
        #take the node to look at: 
        (d, _, source) = pop(fringe)
        #print('Looking at node: '+source)
        
        # check if node has already been looked at: 
        if source in dist:
            continue  # already searched this node.
        
        # update the distance of the node
        dist[source] = d
        
        #stop if the node we look at is the target obviously
        if source == last_target:
            break
        
        # Look at all direct descendents from the source node: 
        for target, edges in G_succ[source].items():
            # Because it's a multigraph, need to look at all edges between two nodes:
            for edge_id in edges:
                #print(graph.get_edge_data(source, target, key=edge_id))
                dep_time_edge = graph.get_edge_data(source, target, edge_id)['time']
                if dep_time_edge == -1:
                    walking_edge = True
                    dep_time_edge = d
                else:
                    walking_edge = False
                
                if dep_time_edge >= dist[source]:
                    # If change of line and less than 2 min to change -> skip edge
                    if walking_edge:
                        current_trip_id = None
                    else:
                        current_trip_id = graph.get_edge_data(source, target, edge_id)['trip_id']
                    if not walking_edge and len(e_paths[source]) >= 1:
                        last_edge_info = e_paths[source][-1][2]
                        if current_trip_id != last_edge_info['trip_id'] and dep_time_edge < dist[source] + 2:
                            continue
                        
                    # Get the duration between two nodes:
                    duration_cost = graph.get_edge_data(source, target, edge_id)['duration']
                    if duration_cost is None:
                            continue

                    # Add the weight to the current distance of a node
                    current_dist = dep_time_edge + duration_cost #arrival_time - (dep_time_edge + duration_cost)
                    

                    # if target has already been visited once and has a final distance:
                    if target in dist:
                            # if we find a distance smaller than the actual distance in dic
                            # raise error because dic distances contains only final distances
                            if current_dist < dist[target]:
                                raise ValueError('Contradictory paths found:',
                                                 'negative weights?')

                    # either node has been seen before or the current distance is smaller than the 
                    # proposed distance in seen[target]:
                    elif target not in seen or current_dist < seen[target]:
                        # update the seen distance
                        seen[target] = current_dist
                        # push it onto the heap so that we will look at its descendants later
                        push(fringe, (current_dist, next(c), target))

                        # update the paths till target:
                        if paths is not None:
                            #paths[target] = paths[source] + [target]
                            if walking_edge:
                                e_paths[target] = e_paths[source] + [(source, target, 
                                                                      {'departure_time':dep_time_edge, 
                                                                       'duration':duration_cost, 
                                                                       'walk': True,
                                                                       'trip_id': current_trip_id})]
                            else:
                                e_paths[target] = e_paths[source] + [(source, target, 
                                                                      {'departure_time':dep_time_edge,
                                                                       'duration':duration_cost,
                                                                       'walk': False,
                                                                       'trip_id': current_trip_id})]
                            
    if  last_target not in e_paths:
        print('Error: No paths to the source')
        return (0, [])
        #raise ValueError('No paths exist to the source') 
    
    if paths is not None:
        #return (dist, paths, e_paths)
        #return (dist, e_paths)<
        
        #for _ in range(100):
            #Validate path
            #for e in path:
                #sample_gaussian
                #check if miss connection
            #If > 0 connection missed, path missed
        # if 95% must have missed < 5 path
        # if path not validated -> starts with smaller threshold 
        
        arrival_string = minute_to_string(dist[last_target])
        best_path = e_paths[last_target]
        departure_string = minute_to_string(best_path[0][2]['departure_time'])
        
        nodes_data = graph.nodes(data=True)
        print('Going from {} ({}) to {} ({}), arrival at {}'.format(nodes_data[first_source]['name'],
                                                                   first_source,
                                                                   nodes_data[last_target]['name'],
                                                                   last_target, 'planned_arrival'))
        
        last_trip_id = False
        for s, t, info in best_path:
            if info['walk']:
                last_trip_id = False
                print('\t{} ({}) -> {} ({}), {:.2f}\' departure at {} on foot'.format(nodes_data[s]['name'], s,
                                                                          nodes_data[t]['name'], t,
                                                                          info['duration'],
                                                                          minute_to_string(info['departure_time'])))
            else:
                if last_trip_id and last_trip_id == info['trip_id']:
                    print('\t{} ({}) -> {} ({}), {:.2f}\' departure at {} (no change)'.format(nodes_data[s]['name'], s,
                                                                          nodes_data[t]['name'], t,
                                                                          info['duration'],
                                                                          minute_to_string(info['departure_time'])))
                else:
                    last_trip_id = info['trip_id']
                    print('\t{} ({}) -> {} ({}), {:.2f}\' departure at {}'.format(nodes_data[s]['name'], s,
                                                                              nodes_data[t]['name'], t,
                                                                              info['duration'],
                                                                              minute_to_string(info['departure_time'])))
        
        return (dist[last_target], e_paths[last_target])
    return dist

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6866,application_1589299642358_1360,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
%%local
import os
import pandas as pd
username = os.environ['JUPYTERHUB_USER']

In [4]:
%%send_to_spark -i username -t str -n username

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'username' as 'username' to Spark kernel

## Load graph data

In [5]:
trips = spark.read.format('orc').load('/data/sbb/timetables/orc/trips/000000_0')
calendar = spark.read.format('orc').load('/data/sbb/timetables/orc/calendar/000000_0')

nodes_df = spark.read.orc("/user/{}/nodes.orc".format(username))
edges_df = spark.read.orc("/user/{}/edges_with_mean_std.orc".format(username))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
nodes = nodes_df.rdd.map(lambda r: (r[0], {'name': r['stop_name'],
                                              'lat': r['stop_lat'],
                                              'lon': r['stop_lon']})).collect()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
%%local
walking_times = pd.read_pickle('walking_edges.pickle')

In [8]:
%%send_to_spark -i walking_times -t df -m 20000

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'walking_times' as 'walking_times' to Spark kernel

In [9]:
#reverse edges
#edges_walking = (walking_times.withColumnRenamed('source', 'temp')
#                 .withColumnRenamed('target', 'source')
#                 .withColumnRenamed('temp', 'target').toPandas())
edges_walking = walking_times.toPandas()
edges_walking['attrs'] = edges_walking.apply(lambda x: {'time': -1, 'duration': x['walk_duration']}, axis=1)
edges_walking = list(edges_walking[['source', 'target', 'attrs']].to_numpy())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Choose time of arrival

In [10]:
day_id, arrival_hour, arrival_minute = 4, 12, 30

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Create graph

In [13]:
def create_edges_for_trip(edges_df, day_id, arrival_time):
    """
    create_edges_for_trip: constructs edges (and thus trips) that exist in a window of two hours before a given input time
    @input:
    - edges_df: df from which we construct the edges
    - day_id: id of week-day (e.g. wednesday is day id 2, see dictionnary above)
    - hour, minute: time at which we want to arrive somewhere (e.g. 11:30)
    @output: data frame of selected edges
    """
    #select only the trips that occur on that day:
    edges_df= edges_df.join(day_trips(day_id), on='trip_id')
    
    min_dep_time = arrival_time - 60*MAX_TRIP_DURATION
    
    #keep only those in a window of two hours:
    edges_df = edges_df.filter((col('departure_time') > min_dep_time) & 
                                            (col('arrival_time') <= arrival_time))
    
    #reverse edges
    #edges_df = (edges_df.withColumnRenamed('next_stop', 'temp')
    #            .withColumnRenamed('stop_id', 'next_stop')
    #            .withColumnRenamed('temp', 'stop_id'))

    edges = edges_df.rdd.map(lambda r: (r['stop_id'], r['next_stop'], {'duration': r['trip_duration'],
                                                                       'time': float(r['departure_time']),
                                                                       'trip_id': r['trip_id']})).collect()
    
    return edges + edges_walking

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
edges = create_edges_for_trip(edges_df, day_id, arrival_hour*60+arrival_minute)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
graph = nx.MultiDiGraph()
graph.add_nodes_from(nodes)
_ = graph.add_edges_from(edges)

old_number_of_nodes = graph.number_of_nodes()
# Remove unreachable nodes
dists, paths = normal_dijkstra(graph, '8503000')
not_reachable = set(graph.nodes) - set(dists.keys())
_ = graph.remove_nodes_from(list(not_reachable))
print('{} nodes removed'.format(old_number_of_nodes - graph.number_of_nodes()))

# Temp for problem of encoding
nodes_data = graph.nodes(data=True)
for n in graph.nodes:
    nodes_data[n]['name'] = nodes_data[n]['name'].replace(u'\xfc', 'u')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

18 nodes removed

## Run algorithm

In [17]:
time, path = dijkstra_with_time(graph, '8503000', arrival_hour*60+arrival_minute, last_target='8591049')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Going from Zurich HB (8503000) to Zurich, Auzelg (8591049), arrival at planned_arrival
	Zurich HB (8503000) -> Zurich HB (8503000:0:41/42), 2.14' departure at 10:30 on foot
	Zurich HB (8503000:0:41/42) -> Zurich Hardbrucke (8503020:0:3), 2.00' departure at 10:37
	Zurich Hardbrucke (8503020:0:3) -> Zurich Oerlikon (8503006:0:8), 5.00' departure at 10:39 (no change)
	Zurich Oerlikon (8503006:0:8) -> Glattbrugg (8503310:0:3), 2.00' departure at 10:45 (no change)
	Glattbrugg (8503310:0:3) -> Glattbrugg, Bahnhof (8590620), 3.06' departure at 10:47 on foot
	Glattbrugg, Bahnhof (8590620) -> Glattbrugg, Lindberghplatz (8590626), 1.00' departure at 10:53
	Glattbrugg, Lindberghplatz (8590626) -> Glattpark, Glattpark (8591830), 2.00' departure at 10:54 (no change)
	Glattpark, Glattpark (8591830) -> Zurich, Fernsehstudio (8591128), 1.00' departure at 10:56 (no change)
	Zurich, Fernsehstudio (8591128) -> Zurich, Auzelg (8591049), 2.00' departure at 10:57 (no change)