In [1]:
%%configure
{"conf": {
    "spark.app.name": "dslab-group_final"
}}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6966,application_1589299642358_1460,pyspark,idle,Link,Link,
6985,application_1589299642358_1479,pyspark,busy,Link,Link,
6990,application_1589299642358_1484,pyspark,idle,Link,Link,
6993,application_1589299642358_1487,pyspark,busy,Link,Link,
6995,application_1589299642358_1489,pyspark,idle,Link,Link,
6998,application_1589299642358_1492,pyspark,idle,Link,Link,
7000,application_1589299642358_1494,pyspark,idle,Link,Link,
7002,application_1589299642358_1496,pyspark,idle,Link,Link,
7003,application_1589299642358_1497,pyspark,busy,Link,Link,
7004,application_1589299642358_1498,pyspark,idle,Link,Link,


## Imports and helper functions

In [16]:
import networkx as nx
import numpy as np
from scipy import stats
from heapq import heappush, heappop
from itertools import count
from pyspark.sql.functions import col

MAX_TRIP_DURATION = 2 #duration in hour 

days_dict = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday'}
def day_trips(*day_ids):
    """
    day_trips: gives the trip_ids that operate on certain days
    input: a variable number of day ids
    output:s spark dataframe with trip_ids
    
    """
    days = [days_dict[day_id] for day_id in day_ids]
    where_clause = " and ".join(days)

    day_services = calendar.where(where_clause).select('service_id')
    return day_services.join(trips, on='service_id').select('trip_id')

def minute_to_string(m):
    hour, minute = m // 60, m - 60*(m//60)
    time_string = '{:02}:{:02}'.format(int(hour), int(minute))
    
    return time_string

def string_to_minute(s):
    h, m, _ = s.split(':')
    h,m = int(h), int(m)
    
    return h*60+m

def get_time(graph, source, target, j):
    attr = graph.edges[(source, target, j)]
    return attr['time']

def get_weight(graph, source, target, j):
    attr = graph.edges[(source, target, j)]
    return attr['duration']

def normal_dijkstra(G, first_source, paths=None, cutoff=None, last_target=None):
    
    G_succ = G.succ if G.is_directed() else G.adj
    paths = {first_source: [first_source]}

    push = heappush
    pop = heappop
    dist = {}  # dictionary of final distances
    
    # dictionnary of wthether it's the first time a node is visited
    seen = {first_source: 0}

    c = count()
    fringe = []  # use heapq with (distance,label) tuples
    push(fringe, (0, next(c), first_source))
    
    while fringe:
        #take the node to look at: 
        (d, _, source) = pop(fringe)
        
        # check if node has already been looked at: 
        if source in dist:
            continue  # already searched this node.
        
        # update the distance of the node
        dist[source] = d
        
        #stop if the node we look at is the target obviously
        if source == last_target:
            break
            
        # Look at all direct descendents from the source node: 
        for target, edges in G_succ[source].items():
            # Because it's a multigraph, need to look at all edges between two nodes:
            for edge_id in edges:
                
                # Get the duration between two nodes:
                cost = graph.get_edge_data(source, target, edge_id)['duration']
                
                if cost is None:
                        continue
                
                # Add the weight to the current distance of a node
                current_dist = dist[source] + cost
                
                # if target has already been visited once and has a final distance:
                if target in dist:
                        # if we find a distance smaller than the actual distance in dic
                        # raise error because dic distances contains only final distances
                        if current_dist < dist[target]:
                            raise ValueError('Contradictory paths found:',
                                             'negative weights?')
                # either node node been seen before or the current distance is smaller than the 
                # proposed distance in seen[target]:
                elif target not in seen or current_dist < seen[target]:
                    # update the seen distance
                    seen[target] = current_dist
                    # push it onto the heap so that we will look at its descendants later
                    push(fringe, (current_dist, next(c), target))
                    
                    # update the paths till target:
                    if paths is not None:
                        paths[target] = paths[source] + [target]
    if paths is not None:
        return (dist, paths)
    return dist

def validate_path(path, confidence, graph):
    #for _ in range(100):
        #Validate path
        #for e in path:
            #sample_gaussian
            #check if miss connection
        #If > 0 connection missed, path missed
    # if 95% must have missed < 5 path
    # if path not validated -> starts with smaller threshold 
    return True

def compute_delay_uncertainty(last_edge, confidence, null_gauss_count):
    if confidence != None:
        if 'mean' not in last_edge or 'std' not in last_edge:
            null_gauss_count += 1
            return 0, null_gauss_count
        
        mean, std = last_edge['mean'], last_edge['std']
        num_sample = 50
        t_quantile = stats.t(df=num_sample-1).ppf(confidence)
        mean_deviation = t_quantile * std / np.sqrt(num_sample)
        delay = mean_deviation
        print("The delay induced for {}% confidence for edge ~ N({}, {}) is {}"
              .format(int(confidence*100), mean, std, delay))
    else:
        delay = 0
        
    return delay, null_gauss_count

def dijkstra_with_time(G, first_source, arrival_time, last_target=None, confidence=None, paths=None):
    while True:
        departure_time = arrival_time - MAX_TRIP_DURATION*60

        G_succ = G.succ if G.is_directed() else G.adj

        paths = {first_source: [first_source]}
        e_paths = {first_source: []}

        push = heappush
        pop = heappop
        dist = {}  # dictionary of final distances

        # dictionnary of wthether it's the first time a node is visited
        seen = {first_source: departure_time}


        c = count()
        fringe = []  # use heapq with (distance,label) tuples

        #push(fringe, (0, next(c), first_source))
        push(fringe, (departure_time, next(c), first_source))

        null_gauss_count = 0
        while fringe:
            #take the node to look at: 
            (d, _, source) = pop(fringe)
            #print('Looking at node: '+source)

            # check if node has already been looked at: 
            if source in dist:
                continue  # already searched this node

            # update the distance of the node
            dist[source] = d

            #stop if the node we look at is the target obviously
            if source == last_target:
                break

                
            # Look at all direct descendents from the source node: 
            for target, edges in G_succ[source].items():
                # Because it's a multigraph, need to look at all edges between two nodes:
                for edge_id in edges:
                    # Check if walking edge
                    dep_time_edge = graph.get_edge_data(source, target, edge_id)['time']
                    if dep_time_edge == -1:
                        walking_edge = True
                        current_trip_id = None
                        dep_time_edge = d
                    else:
                        walking_edge = False
                        current_trip_id = graph.get_edge_data(source, target, edge_id)['trip_id']
                        
                    
                    if dep_time_edge >= dist[source]:
                        # Check if edge is feasible (also accoring to confidence)
                        if len(e_paths[source]) >= 1 and not e_paths[source][-1][2]['walk']:
                            last_edge_info = e_paths[source][-1][2]
                            last_delay, null_gauss_count = compute_delay_uncertainty(last_edge_info, confidence, null_gauss_count)
                            # If we make a transport-walk change, add delay to walk time
                            if walking_edge:
                                dep_time_edge += last_delay
                            else:
                                # If we make a transport-transport change, check if we have time
                                if current_trip_id != last_edge_info['trip_id'] and dep_time_edge > dist[source] + 2 + last_delay:
                                    continue

                        # Get the duration between two nodes:
                        duration_cost = graph.get_edge_data(source, target, edge_id)['duration']
                        if duration_cost is None:
                                continue

                        # Add the weight to the current distance of a node
                        current_dist = dep_time_edge + duration_cost

                        # if target has already been visited once and has a final distance:
                        if target in dist:
                                # if we find a distance smaller than the actual distance in dic
                                # raise error because dic distances contains only final distances
                                if current_dist < dist[target]:
                                    raise ValueError('Contradictory paths found:',
                                                     'negative weights?')

                        # either node has been seen before or the current distance is smaller than the 
                        # proposed distance in seen[target]:
                        elif target not in seen or current_dist < seen[target]:
                            # update the seen distance
                            seen[target] = current_dist
                            # push it onto the heap so that we will look at its descendants later
                            push(fringe, (current_dist, next(c), target))

                            # update the paths till target:
                            if paths is not None:
                                #paths[target] = paths[source] + [target]
                                if walking_edge:
                                    e_paths[target] = e_paths[source] + [(source, target, 
                                                                          {'departure_time':dep_time_edge, 
                                                                           'duration':duration_cost, 
                                                                           'walk': True})]
                                else:
                                    e_paths[target] = e_paths[source] + [(source, target, 
                                                                          {'departure_time':dep_time_edge,
                                                                           'duration':duration_cost,
                                                                           'walk': False,
                                                                           'trip_id': current_trip_id})]

        # No path exists
        if  last_target not in e_paths:
            print('Error: No paths to the source')
            return (0, [])

        
        # Validation
        if confidence == None or validate_path(e_paths[last_target], confidence, graph):
            break
        else:
            confidence += confidence_step
            
    # Path validated
    if paths is not None:
        arrival_string = minute_to_string(dist[last_target])
        best_path = e_paths[last_target]
        departure_string = minute_to_string(best_path[0][2]['departure_time'])
        
        nodes_data = graph.nodes(data=True)
        print('Going from {} ({}) to {} ({}) in {:.2f} minutes, departure at {}'.format(nodes_data[first_source]['name'],
                                                                                      first_source,
                                                                                      nodes_data[last_target]['name'],
                                                                                      last_target, 
                                                                                      dist[last_target] - departure_time,
                                                                                      minute_to_string(departure_time)))
        
        last_trip_id = False
        for s, t, info in best_path:
            if info['walk']:
                last_trip_id = False
                print('\t{} ({}) -> {} ({}), {:.2f}\' departure at {} on foot'.format(nodes_data[s]['name'], s,
                                                                          nodes_data[t]['name'], t,
                                                                          info['duration'],
                                                                          minute_to_string(info['departure_time'])))
            else:
                if last_trip_id and last_trip_id == info['trip_id']:
                    print('\t{} ({}) -> {} ({}), {:.2f}\' departure at {} (no change)'.format(nodes_data[s]['name'], s,
                                                                          nodes_data[t]['name'], t,
                                                                          info['duration'],
                                                                          minute_to_string(info['departure_time'])))
                else:
                    last_trip_id = info['trip_id']
                    print('\t{} ({}) -> {} ({}), {:.2f}\' departure at {}'.format(nodes_data[s]['name'], s,
                                                                              nodes_data[t]['name'], t,
                                                                              info['duration'],
                                                                              minute_to_string(info['departure_time'])))
        
        return (dist[last_target], e_paths[last_target], null_gauss_count)
    return dist

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
%%local
import os
import pandas as pd
username = os.environ['JUPYTERHUB_USER']

In [4]:
%%send_to_spark -i username -t str -n username

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'username' as 'username' to Spark kernel

## Load graph data

In [5]:
trips = spark.read.format('orc').load('/data/sbb/timetables/orc/trips/000000_0')
calendar = spark.read.format('orc').load('/data/sbb/timetables/orc/calendar/000000_0')

nodes_df = spark.read.orc("/user/{}/nodes.orc".format(username))
edges_df = spark.read.orc("/user/{}/edges_with_mean_and_std_sec.orc".format(username))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
nodes = nodes_df.rdd.map(lambda r: (r[0], {'name': r['stop_name'],
                                              'lat': r['stop_lat'],
                                              'lon': r['stop_lon']})).collect()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
%%local
walking_times = pd.read_pickle('walking_edges.pickle')

In [8]:
%%send_to_spark -i walking_times -t df -m 20000

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'walking_times' as 'walking_times' to Spark kernel

In [9]:
#reverse edges
#edges_walking = (walking_times.withColumnRenamed('source', 'temp')
#                 .withColumnRenamed('target', 'source')
#                 .withColumnRenamed('temp', 'target').toPandas())
edges_walking = walking_times.toPandas()
edges_walking['attrs'] = edges_walking.apply(lambda x: {'time': -1, 'duration': x['walk_duration']}, axis=1)
edges_walking = list(edges_walking[['source', 'target', 'attrs']].to_numpy())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Choose time of arrival

In [10]:
day_id, arrival_hour, arrival_minute = 4, 12, 30

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Create graph

In [11]:
def create_edges_for_trip(edges_df, day_id, arrival_time):
    """
    create_edges_for_trip: constructs edges (and thus trips) that exist in a window of two hours before a given input time
    @input:
    - edges_df: df from which we construct the edges
    - day_id: id of week-day (e.g. wednesday is day id 2, see dictionnary above)
    - hour, minute: time at which we want to arrive somewhere (e.g. 11:30)
    @output: data frame of selected edges
    """
    #select only the trips that occur on that day:
    edges_df= edges_df.join(day_trips(day_id), on='trip_id')
    
    min_dep_time = arrival_time - 60*MAX_TRIP_DURATION
    
    #keep only those in a window of two hours:
    edges_df = edges_df.filter((col('departure_time') > min_dep_time) & 
                                            (col('arrival_time') <= arrival_time))
    
    #reverse edges
    #edges_df = (edges_df.withColumnRenamed('next_stop', 'temp')
    #            .withColumnRenamed('stop_id', 'next_stop')
    #            .withColumnRenamed('temp', 'stop_id'))

    edges = edges_df.rdd.map(lambda r: (r['stop_id'], r['next_stop'], {'duration': r['trip_duration'],
                                                                       'time': float(r['departure_time']),
                                                                       'trip_id': r['trip_id'],
                                                                       'mean': r['mean'],
                                                                       'std': r['std']})).collect()
    print(edges[0])
    
    return edges + edges_walking

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
edges = create_edges_for_trip(edges_df, day_id, arrival_hour*60+arrival_minute)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(u'8573195', u'8583890', {'duration': 2.0, 'std': 2.627700220351667, 'time': 643.0, 'trip_id': u'11.TA.1-217-j19-1.3.H', 'mean': 1.380964467005})

In [13]:
graph = nx.MultiDiGraph()
graph.add_nodes_from(nodes)
graph.add_edges_from(edges)

old_number_of_nodes = graph.number_of_nodes()
# Remove unreachable nodes
dists, paths = normal_dijkstra(graph, '8503000')
not_reachable = set(graph.nodes) - set(dists.keys())
_ = graph.remove_nodes_from(list(not_reachable))
print('{} nodes removed'.format(old_number_of_nodes - graph.number_of_nodes()))

# Temp for problem of encoding
import unicodedata
nodes_data = graph.nodes(data=True)
for n in graph.nodes:
    nodes_data[n]['name'] = unicodedata.normalize('NFKD', nodes_data[n]['name']).encode('ascii','ignore')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

18 nodes removed

In [43]:
# Proportion of null mean or std in non-walking edges
(len(filter(lambda x: x[2]['mean'] == None or x[2]['std'] == None, filter(lambda x: 'mean' in x[2] and 'std' in x[2], graph.edges(data=True))))
 / float(len(filter(lambda x: 'mean' in x[2] and 'std' in x[2], graph.edges(data=True)))))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0.046632828786368166

## Run algorithm

In [14]:
time, path, null_gauss_count = dijkstra_with_time(graph, '8503000', arrival_hour*60+arrival_minute, last_target='8591049')
null_gauss_count

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Going from Zurich HB (8503000) to Zurich, Auzelg (8591049) in 29.00 minutes, departure at 10:30
	Zurich HB (8503000) -> Zurich HB (8503000:0:41/42), 2.14' departure at 10:30 on foot
	Zurich HB (8503000:0:41/42) -> Zurich Hardbrucke (8503020:0:3), 2.00' departure at 10:37
	Zurich Hardbrucke (8503020:0:3) -> Zurich Oerlikon (8503006:0:8), 5.00' departure at 10:39 (no change)
	Zurich Oerlikon (8503006:0:8) -> Zurich Oerlikon, Bahnhof (8580449), 3.05' departure at 10:44 on foot
	Zurich Oerlikon, Bahnhof (8580449) -> Zurich Oerlikon, Bahnhof Ost (8591063), 1.00' departure at 10:48
	Zurich Oerlikon, Bahnhof Ost (8591063) -> Zurich, Leutschenbach (8591256), 2.00' departure at 10:50
	Zurich, Leutschenbach (8591256) -> Zurich, Oerlikerhus (8591294), 0.00' departure at 10:52 (no change)
	Zurich, Oerlikerhus (8591294) -> Glattpark, Glattpark (8591830), 2.00' departure at 10:52 (no change)
	Glattpark, Glattpark (8591830) -> Zurich, Fernsehstudio (8591128), 1.00' departure at 10:56
	Zurich, Fernseh

In [17]:
time, path, null_gauss_count = dijkstra_with_time(graph, '8503000', arrival_hour*60+arrival_minute, last_target='8591049', confidence=0.9)
null_gauss_count

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Going from Zurich HB (8503000) to Zurich, Auzelg (8591049) in 29.00 minutes, departure at 10:30
	Zurich HB (8503000) -> Zurich HB (8503000:0:41/42), 2.14' departure at 10:30 on foot
	Zurich HB (8503000:0:41/42) -> Zurich Hardbrucke (8503020:0:3), 2.00' departure at 10:37
	Zurich Hardbrucke (8503020:0:3) -> Zurich Oerlikon (8503006:0:8), 5.00' departure at 10:39 (no change)
	Zurich Oerlikon (8503006:0:8) -> Zurich Oerlikon, Bahnhof (8580449), 3.05' departure at 10:44 on foot
	Zurich Oerlikon, Bahnhof (8580449) -> Zurich Oerlikon, Bahnhof Ost (8591063), 1.00' departure at 10:48
	Zurich Oerlikon, Bahnhof Ost (8591063) -> Zurich, Leutschenbach (8591256), 2.00' departure at 10:50
	Zurich, Leutschenbach (8591256) -> Zurich, Oerlikerhus (8591294), 0.00' departure at 10:52 (no change)
	Zurich, Oerlikerhus (8591294) -> Glattpark, Glattpark (8591830), 2.00' departure at 10:52 (no change)
	Glattpark, Glattpark (8591830) -> Zurich, Fernsehstudio (8591128), 1.00' departure at 10:56
	Zurich, Fernseh

In [18]:
print(filter(lambda x: 'Triemli' in x[1]['name'], graph.nodes(data=True)))
print(filter(lambda x: 'Altstetten' in x[1]['name'], graph.nodes(data=True)))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[(u'8503054', {'lat': 47.3649987828854, 'lon': 8.49519696319687, 'name': 'Zurich Triemli'}), (u'8503610', {'lat': 47.3681504693771, 'lon': 8.49537662625372, 'name': 'Zurich, Triemli'}), (u'8591401', {'lat': 47.3663860348906, 'lon': 8.4967690149443, 'name': 'Zurich, Triemlispital'})]
[(u'8503001:0:2', {'lat': 47.3917881971539, 'lon': 8.48893570566569, 'name': 'Zurich Altstetten'}), (u'8503001:0:3', {'lat': 47.3915449394745, 'lon': 8.48893570566569, 'name': 'Zurich Altstetten'}), (u'8503001:0:4', {'lat': 47.3916057539996, 'lon': 8.48893570566569, 'name': 'Zurich Altstetten'}), (u'8503001:0:6', {'lat': 47.3916665684545, 'lon': 8.48893570566569, 'name': 'Zurich Altstetten'}), (u'8503001:0:7', {'lat': 47.3917273828393, 'lon': 8.48893570566569, 'name': 'Zurich Altstetten'}), (u'8503001P', {'lat': 47.3914841248792, 'lon': 8.48893570566569, 'name': 'Zurich Altstetten'}), (u'8591057', {'lat': 47.392067942097, 'lon': 8.48990588617267, 'name': 'Zurich Altstetten, Bahnhof N'}), (u'8591056', {'lat'

In [19]:
time, path, null_gauss_count = dijkstra_with_time(graph, '8503610', arrival_hour*60+arrival_minute, last_target='8591057', confidence=0.95)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Going from Zurich, Triemli (8503610) to Zurich Altstetten, Bahnhof N (8591057) in 17.14 minutes, departure at 10:30
	Zurich, Triemli (8503610) -> Zurich, In der Ey (8591214), 1.00' departure at 10:31
	Zurich, In der Ey (8591214) -> Zurich, Goldackerweg (8591163), 1.00' departure at 10:32 (no change)
	Zurich, Goldackerweg (8591163) -> Zurich, Albisrieden (8591036), 2.00' departure at 10:33 (no change)
	Zurich, Albisrieden (8591036) -> Zurich, Albisriederdorfli (8591037), 0.00' departure at 10:35 (no change)
	Zurich, Albisriederdorfli (8591037) -> Zurich, Untermoosstrasse (8591408), 2.00' departure at 10:35 (no change)
	Zurich, Untermoosstrasse (8591408) -> Zurich, Rautistrasse (8591311), 1.00' departure at 10:37 (no change)
	Zurich, Rautistrasse (8591311) -> Zurich, Lindenplatz (8591258), 2.00' departure at 10:38 (no change)
	Zurich, Lindenplatz (8591258) -> Zurich, Bristenstrasse (8591097), 1.00' departure at 10:40 (no change)
	Zurich, Bristenstrasse (8591097) -> Zurich Altstetten, Bah

In [45]:
graph.get_edge_data('8591163', '8591036')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{0: {'duration': 2.0, 'std': 2.04106382892, 'time': 738.0, 'trip_id': u'1227.TA.26-80-j19-1.8.R', 'mean': 1.2843136156550001}, 1: {'duration': 2.0, 'std': 2.04106382892, 'time': 723.0, 'trip_id': u'1236.TA.26-80-j19-1.8.R', 'mean': 1.2843136156550001}, 2: {'duration': 2.0, 'std': 1.5378574563483334, 'time': 708.0, 'trip_id': u'1250.TA.26-80-j19-1.8.R', 'mean': 1.2908743475016666}, 3: {'duration': 2.0, 'std': 1.5378574563483334, 'time': 693.0, 'trip_id': u'1257.TA.26-80-j19-1.8.R', 'mean': 1.2908743475016666}, 4: {'duration': 2.0, 'std': 1.5378574563483334, 'time': 678.0, 'trip_id': u'1272.TA.26-80-j19-1.8.R', 'mean': 1.2908743475016666}, 5: {'duration': 2.0, 'std': 1.5378574563483334, 'time': 663.0, 'trip_id': u'1286.TA.26-80-j19-1.8.R', 'mean': 1.2908743475016666}, 6: {'duration': 2.0, 'std': 1.5585312253966668, 'time': 648.0, 'trip_id': u'1305.TA.26-80-j19-1.8.R', 'mean': 1.2626964433416668}, 7: {'duration': 2.0, 'std': 1.5585312253966668, 'time': 633.0, 'trip_id': u'1317.TA.26-80-j1