In [1]:
%%configure
{"conf": {
    "spark.app.name": "dslab-group_final"
}}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
7565,application_1589299642358_2060,pyspark,idle,Link,Link,
7585,application_1589299642358_2080,pyspark,idle,Link,Link,
7586,application_1589299642358_2081,pyspark,idle,Link,Link,
7587,application_1589299642358_2082,pyspark,idle,Link,Link,
7598,application_1589299642358_2093,pyspark,idle,Link,Link,
7599,application_1589299642358_2094,pyspark,idle,Link,Link,
7602,application_1589299642358_2097,pyspark,idle,Link,Link,
7603,application_1589299642358_2098,pyspark,idle,Link,Link,
7604,application_1589299642358_2099,pyspark,idle,Link,Link,
7606,application_1589299642358_2101,pyspark,idle,Link,Link,


## Imports and helper functions

In [2]:
import pickle
import json
import networkx as nx
import numpy as np
import pandas as pd
from scipy import stats
from heapq import heappush, heappop
from itertools import count
from pyspark.sql.functions import col

MAX_TRIP_DURATION = 2 #duration in hour 

days_dict = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday'}
def day_trips(*day_ids):
    """
    day_trips: gives the trip_ids that operate on certain days
    input: a variable number of day ids
    output:s spark dataframe with trip_ids
    
    """
    days = [days_dict[day_id] for day_id in day_ids]
    where_clause = " and ".join(days)

    day_services = calendar.where(where_clause).select('service_id')
    return day_services.join(trips, on='service_id').select('trip_id')

def minute_to_string(m):
    hour, minute = m // 60, m - 60*(m//60)
    time_string = '{:02}:{:02}'.format(int(hour), int(minute))
    
    return time_string

def string_to_minute(s):
    h, m, _ = s.split(':')
    h,m = int(h), int(m)
    
    return h*60+m

def normal_dijkstra(G, first_source, paths=None, cutoff=None, last_target=None):
    
    G_succ = G.succ if G.is_directed() else G.adj
    paths = {first_source: [first_source]}

    push = heappush
    pop = heappop
    dist = {}  # dictionary of final distances
    
    # dictionnary of wthether it's the first time a node is visited
    seen = {first_source: 0}

    c = count()
    fringe = []  # use heapq with (distance,label) tuples
    push(fringe, (0, next(c), first_source))
    
    while fringe:
        #take the node to look at: 
        (d, _, source) = pop(fringe)
        
        # check if node has already been looked at: 
        if source in dist:
            continue  # already searched this node.
        
        # update the distance of the node
        dist[source] = d
        
        #stop if the node we look at is the target obviously
        if source == last_target:
            break
            
        # Look at all direct descendents from the source node: 
        for target, edges in G_succ[source].items():
            # Because it's a multigraph, need to look at all edges between two nodes:
            for edge_id in edges:
                
                # Get the duration between two nodes:
                cost = graph.get_edge_data(source, target, edge_id)['duration']
                
                if cost is None:
                        continue
                
                # Add the weight to the current distance of a node
                current_dist = dist[source] + cost
                
                # if target has already been visited once and has a final distance:
                if target in dist:
                        # if we find a distance smaller than the actual distance in dic
                        # raise error because dic distances contains only final distances
                        if current_dist < dist[target]:
                            raise ValueError('Contradictory paths found:',
                                             'negative weights?')
                # either node node been seen before or the current distance is smaller than the 
                # proposed distance in seen[target]:
                elif target not in seen or current_dist < seen[target]:
                    # update the seen distance
                    seen[target] = current_dist
                    # push it onto the heap so that we will look at its descendants later
                    push(fringe, (current_dist, next(c), target))
                    
                    # update the paths till target:
                    if paths is not None:
                        paths[target] = paths[source] + [target]
    if paths is not None:
        return (dist, paths)
    return dist

def validate_path(path, confidence, graph):
    #for _ in range(100):
        #Validate path
        #for e in path:
            #sample_gaussian
            #check if miss connection
        #If > 0 connection missed, path missed
    # if 95% must have missed < 5 path
    # if path not validated -> starts with smaller threshold 
    return True

def compute_delay_uncertainty(mean, std, confidence):
    if confidence != None:
        if mean == None or std == None:
            return 0
        
        num_sample = 50 # how many (source, target, train_id, hour) tuple there is
        t_quantile = stats.t(df=num_sample-1).ppf(confidence)
        mean_deviation = t_quantile * std / np.sqrt(num_sample)
        delay = mean_deviation
    else:
        delay = 0
        
    return delay

def dijkstra_with_time(G, first_source, arrival_time, last_target=None, confidence=None, 
                       confidence_step=0.01, durations_dicts=None, paths=None):
    G = G.copy()
    departure_time = arrival_time - MAX_TRIP_DURATION*60
    while True:
        # Update durations according to confidence
        if confidence != None:
            if durations_dicts == None:
                raise ValueError('You must pass durations_dicts for the confidence.')
            # Load dict with modifications
            if confidence not in durations_dicts:
                edge_and_data_tuple = zip(G.edges(keys=True), 
                              map(lambda x: x[2], G.edges(data=True)))
                edge_and_data_tuple = filter(lambda x: 'mean' in x[1] and 'std' in x[1], edge_and_data_tuple)
                durations_dicts[confidence] = {e: {'duration': data['mean'] + compute_delay_uncertainty(data['mean'], 
                                                                                                        data['std'], 
                                                                                                        confidence)
                                                   if data['mean'] != None and data['std'] != None
                                                   else data['duration']
                                                  } for e, data in edge_and_data_tuple}
            
            # Update graph
            nx.set_edge_attributes(G, durations_dicts[confidence])
        
        
        G_succ = G.succ if G.is_directed() else G.adj

        paths = {first_source: [first_source]}
        e_paths = {first_source: []}

        push = heappush
        pop = heappop
        dist = {}  # dictionary of final distances

        # dictionnary of whether it's the first time a node is visited
        seen = {first_source: departure_time}


        c = count()
        fringe = []  # use heapq with (distance,label) tuples

        #push(fringe, (0, next(c), first_source))
        push(fringe, (departure_time, next(c), first_source))

        while fringe:
            #take the node to look at: 
            (d, _, source) = pop(fringe)
            #print('Looking at node: '+source)

            # check if node has already been looked at: 
            if source in dist:
                continue  # already searched this node

            # update the distance of the node
            dist[source] = d

            #stop if the node we look at is the target obviously
            if source == last_target:
                break

                
            # Look at all direct descendents from the source node: 
            for target, edges in G_succ[source].items():
                # Because it's a multigraph, need to look at all edges between two nodes:
                for edge_id in edges:
                    # Check if walking edge
                    dep_time_edge = G.get_edge_data(source, target, edge_id)['time']
                    if dep_time_edge == -1:
                        walking_edge = True
                        current_trip_id = None
                        dep_time_edge = d
                    else:
                        walking_edge = False
                        current_trip_id = G.get_edge_data(source, target, edge_id)['trip_id']
                        
                    
                    if dep_time_edge >= dist[source]:
                        # Check if edge is feasible (also accoring to confidence)
                        if len(e_paths[source]) >= 1 and not e_paths[source][-1][2]['walk']:
                            last_edge_source, last_edge_target, last_edge_info = e_paths[source][-1]
                            last_delay = compute_delay_uncertainty(last_edge_info['mean'], 
                                                                   last_edge_info['std'], 
                                                                   confidence)
                            # If we make a transport-walk change, add delay to walk time
                            if walking_edge:
                                dep_time_edge += last_delay
                            else:
                                # If we make a transport-transport change, check if we have time
                                if current_trip_id != last_edge_info['trip_id']\
                                and dep_time_edge < dist[source] + 2 + last_delay:
                                    continue

                        # Get the duration between two nodes:
                        duration_cost = G.get_edge_data(source, target, edge_id)['duration']
                        if duration_cost is None:
                                continue

                        # Add the weight to the current distance of a node
                        current_dist = dep_time_edge + duration_cost

                        # if target has already been visited once and has a final distance:
                        if target in dist:
                                # if we find a distance smaller than the actual distance in dic
                                # raise error because dic distances contains only final distances
                                if current_dist < dist[target]:
                                    raise ValueError('Contradictory paths found:',
                                                     'negative weights?')

                        # either node has been seen before or the current distance is smaller than the 
                        # proposed distance in seen[target]:
                        elif target not in seen or current_dist < seen[target]:
                            # update the seen distance
                            seen[target] = current_dist
                            # push it onto the heap so that we will look at its descendants later
                            push(fringe, (current_dist, next(c), target))

                            # update the paths till target:
                            if paths is not None:
                                edge_dict = G.get_edge_data(source, target, edge_id)
                                
                                edge_dict['walk'] = walking_edge
                                edge_dict['departure_time'] = dep_time_edge
                                
                                e_paths[target] = e_paths[source] + [(source, target, edge_dict)]


        # No path exists
        if  last_target not in e_paths:
            print('Error: No paths to the source')
            return pd.DataFrame(columns=['from', 'from_id', 'to', 'to_id', 'duration', 'total_duration',
                                         'departure_time', 'walk', 'no_change', 'mean_std_null'])

        
        # Validation
        if confidence == None or validate_path(e_paths[last_target], confidence, G):
            break
        else:
            confidence += confidence_step
            
    # Path validated
    if paths is not None:
        nodes_data = G.nodes(data=True)
        arrival_string = minute_to_string(dist[last_target])
        best_path = e_paths[last_target]
        departure_string = minute_to_string(best_path[0][2]['departure_time'])
        print('Going from {} ({}) to {} ({}) in {:.2f} minutes, departure at {}'.format(nodes_data[first_source]['name'],
                                                                                      first_source,
                                                                                      nodes_data[last_target]['name'],
                                                                                      last_target, 
                                                                                      dist[last_target] - departure_time,
                                                                                      minute_to_string(departure_time)))
        
        # Construct best path's data structure
        best_path_df = pd.DataFrame(columns=['from', 'from_id', 'to', 'to_id', 'duration', 'total_duration',
                                          'departure_time', 'walk', 'no_change', 'mean_std_null'])
        last_edge_info = False
        for source, target, edge_info in best_path:
            no_change = ('trip_id' in edge_info                                   # We're in a transport
                         and last_edge_info and 'trip_id' in last_edge_info       # and last edge also
                         and last_edge_info['trip_id'] == edge_info['trip_id'])   # and same trip_id
            mean_std_null = 'trip_id' in edge_info and 'mean' not in edge_info or 'std' not in edge_info
            
            current_path_dict = {'from': nodes_data[source]['name'],
                                 'from_id': source, 
                                 'to': nodes_data[target]['name'], 
                                 'to_id': target, 
                                 'duration': edge_info['duration'], 
                                 'total_duration': dist[target] - departure_time,
                                 'departure_time': minute_to_string(edge_info['departure_time']), 
                                 'walk':edge_info['walk'], 
                                 'no_change': no_change, 
                                 'mean_std_null': mean_std_null}
            best_path_df = best_path_df.append(current_path_dict, ignore_index=True)
            last_edge_info = edge_info
        
        with pd.option_context('display.max_rows', None, 
                               'display.max_columns', None, 
                               'display.max_colwidth', 15,
                               'display.expand_frame_repr', False):
            print(best_path_df)
        return best_path_df
    raise ValueError('Should not be here')
    return dist

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
7620,application_1589299642358_2115,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
%%local
import os
import pandas as pd
username = os.environ['JUPYTERHUB_USER']

In [4]:
%%send_to_spark -i username -t str -n username

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'username' as 'username' to Spark kernel

## Load graph data

In [5]:
trips = spark.read.format('orc').load('/data/sbb/timetables/orc/trips/000000_0')
calendar = spark.read.format('orc').load('/data/sbb/timetables/orc/calendar/000000_0')

nodes_df = spark.read.orc("/user/{}/nodes.orc".format(username))
edges_df = spark.read.orc("/user/{}/edges_with_mean_and_std_sec.orc".format(username))

#TODO: load durations_dicts

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
nodes = nodes_df.rdd.map(lambda r: (r[0], {'name': r['stop_name'],
                                              'lat': r['stop_lat'],
                                              'lon': r['stop_lon']})).collect()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
%%local
walking_times = pd.read_pickle('walking_edges.pickle')

In [8]:
%send_to_spark -i walking_times -t df -m 20000

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'walking_times' as 'walking_times' to Spark kernel

In [9]:
#reverse edges
#edges_walking = (walking_times.withColumnRenamed('source', 'temp')
#                 .withColumnRenamed('target', 'source')
#                 .withColumnRenamed('temp', 'target').toPandas())
edges_walking = walking_times.toPandas()
edges_walking['attrs'] = edges_walking.apply(lambda x: {'time': -1, 'duration': x['walk_duration']}, axis=1)
edges_walking = list(edges_walking[['source', 'target', 'attrs']].to_numpy())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Save duration dictionnaries if needed

In [10]:
# Conversion to json
durations_dicts_for_json = {}
for c in durations_dicts.keys():
    durations_dicts_for_json[c] = {str(k): v for k, v in durations_dicts[c].items()}

print('Length of json:', len(json.dumps(durations_dicts_for_json)))

# Save to hdfs
# TODO

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
name 'durations_dicts' is not defined
Traceback (most recent call last):
NameError: name 'durations_dicts' is not defined



## Choose time of arrival

In [11]:
day_id, arrival_hour, arrival_minute = 4, 12, 30

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Create graph

In [12]:
def create_edges_for_trip(edges_df, day_id, arrival_time):
    """
    create_edges_for_trip: constructs edges (and thus trips) that exist in a window of two hours before a given input time
    @input:
    - edges_df: df from which we construct the edges
    - day_id: id of week-day (e.g. wednesday is day id 2, see dictionnary above)
    - hour, minute: time at which we want to arrive somewhere (e.g. 11:30)
    @output: data frame of selected edges
    """
    #select only the trips that occur on that day:
    edges_df= edges_df.join(day_trips(day_id), on='trip_id')
    
    min_dep_time = arrival_time - 60*MAX_TRIP_DURATION
    
    #keep only those in a window of two hours:
    edges_df = edges_df.filter((col('departure_time') > min_dep_time) & 
                                            (col('arrival_time') <= arrival_time))
    
    #reverse edges
    #edges_df = (edges_df.withColumnRenamed('next_stop', 'temp')
    #            .withColumnRenamed('stop_id', 'next_stop')
    #            .withColumnRenamed('temp', 'stop_id'))

    edges = edges_df.rdd.map(lambda r: (r['stop_id'], r['next_stop'], {'duration': r['trip_duration'],
                                                                       'time': float(r['departure_time']),
                                                                       'trip_id': r['trip_id'],
                                                                       'mean': r['mean'],
                                                                       'std': r['std']})).collect()
    
    return edges + edges_walking

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
edges = create_edges_for_trip(edges_df, day_id, arrival_hour*60+arrival_minute)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
graph = nx.MultiDiGraph()
graph.add_nodes_from(nodes)
graph.add_edges_from(edges)

old_number_of_nodes = graph.number_of_nodes()
# Remove unreachable nodes
dists, paths = normal_dijkstra(graph, '8503000')
not_reachable = set(graph.nodes) - set(dists.keys())
_ = graph.remove_nodes_from(list(not_reachable))
print('{} nodes removed'.format(old_number_of_nodes - graph.number_of_nodes()))

# Temp for problem of name's encoding
import unicodedata
nodes_data = graph.nodes(data=True)
for n in graph.nodes:
    nodes_data[n]['name'] = unicodedata.normalize('NFKD', nodes_data[n]['name']).encode('ascii','ignore')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

18 nodes removed

## Temp: creation of the dict with 0.95 and 0.98

In [15]:
durations_dicts = {}
edge_and_data_tuple = zip(graph.edges(keys=True), 
              map(lambda x: x[2], graph.edges(data=True)))
edge_and_data_tuple = filter(lambda x: 'mean' in x[1] and 'std' in x[1], edge_and_data_tuple)
durations_dicts[0.98] = {e: {'duration': data['mean'] + compute_delay_uncertainty(data['mean'], 
                                                                                        data['std'], 
                                                                                        0.98)
                                   if data['mean'] != None and data['std'] != None
                                   else data['duration']
                                  } for e, data in edge_and_data_tuple}
durations_dicts[0.95] = {e: {'duration': data['mean'] + compute_delay_uncertainty(data['mean'], 
                                                                                        data['std'], 
                                                                                        0.95)
                                   if data['mean'] != None and data['std'] != None
                                   else data['duration']
                                  } for e, data in edge_and_data_tuple}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Run algorithm

In [16]:
# Tao's example (except for the departure time)
print('Without minimum confidence ->')
best_path1 = dijkstra_with_time(graph, '8503000', arrival_hour*60+arrival_minute, last_target='8591049')
print('\nWith minimum confidence ->')
best_path2 = dijkstra_with_time(graph, '8503000', arrival_hour*60+arrival_minute, last_target='8591049', confidence=0.98, durations_dicts=durations_dicts)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Without minimum confidence ->
Going from Zurich HB (8503000) to Zurich, Auzelg (8591049) in 29.00 minutes, departure at 10:30
             from         from_id              to           to_id  duration  total_duration departure_time   walk no_change mean_std_null
0       Zurich HB         8503000       Zurich HB  8503000:0:4...  2.135259        2.135259          10:30   True     False          True
1       Zurich HB  8503000:0:4...  Zurich Hard...     8503020:0:3  2.000000        9.000000          10:37  False     False         False
2  Zurich Hard...     8503020:0:3  Zurich Oerl...     8503006:0:8  5.000000       14.000000          10:39  False      True         False
3  Zurich Oerl...     8503006:0:8      Glattbrugg     8503310:0:3  2.000000       17.000000          10:45  False      True         False
4      Glattbrugg     8503310:0:3  Glattbrugg,...         8590620  3.063448       20.063448          10:47   True     False          True
5  Glattbrugg,...         8590620  Glattbrugg,

In [18]:
# From Triemli to Altstetten
print('Without minimum confidence ->')
best_path1 = dijkstra_with_time(graph, '8503610', arrival_hour*60+arrival_minute, last_target='8591057')
print('\nWith minimum confidence ->')
best_path2 = dijkstra_with_time(graph, '8503610', arrival_hour*60+arrival_minute, last_target='8591057', confidence=0.95, durations_dicts=durations_dicts)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Without minimum confidence ->
Going from Zurich, Triemli (8503610) to Zurich Altstetten, Bahnhof N (8591057) in 17.14 minutes, departure at 10:30
             from  from_id              to    to_id  duration  total_duration departure_time   walk no_change mean_std_null
0  Zurich, Tri...  8503610  Zurich, In ...  8591214  1.000000        2.000000          10:31  False     False         False
1  Zurich, In ...  8591214  Zurich, Gol...  8591163  1.000000        3.000000          10:32  False      True         False
2  Zurich, Gol...  8591163  Zurich, Alb...  8591036  2.000000        5.000000          10:33  False      True         False
3  Zurich, Alb...  8591036  Zurich, Alb...  8591037  0.000000        5.000000          10:35  False      True         False
4  Zurich, Alb...  8591037  Zurich, Unt...  8591408  2.000000        7.000000          10:35  False      True         False
5  Zurich, Unt...  8591408  Zurich, Rau...  8591311  1.000000        8.000000          10:37  False      True 

## Cells to keep

In [40]:
# Weird attributes?
print(graph.get_edge_data('8503000:0:41/42', '8503020:0:3', 0))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{'duration': 2.0, 'std': 1.05697167217, 'time': 682.0, 'trip_id': u'234.TA.26-15-j19-1.41.H', 'mean': 0.27319172912666667}

In [29]:
# Proportion of null mean or std in non-walking edges
(len(filter(lambda x: x[2]['mean'] == None or x[2]['std'] == None, filter(lambda x: 'mean' in x[2] and 'std' in x[2], graph.edges(data=True))))
 / float(len(filter(lambda x: 'mean' in x[2] and 'std' in x[2], graph.edges(data=True)))))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0.046632828786368166