In [1]:
%%configure
{"pyFiles": ["/user/gottraux/dijkstra_algorithms.py"],
 "conf": {
    "spark.app.name": "dslab-group_final"
}}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
7611,application_1589299642358_2106,pyspark,idle,Link,Link,
7632,application_1589299642358_2126,pyspark,idle,Link,Link,
7633,application_1589299642358_2127,pyspark,busy,Link,Link,
7644,application_1589299642358_2140,pyspark,idle,Link,Link,
7670,application_1589299642358_2166,pyspark,busy,Link,Link,
7674,application_1589299642358_2170,pyspark,idle,Link,Link,
7676,application_1589299642358_2172,pyspark,busy,Link,Link,
7677,application_1589299642358_2173,pyspark,dead,Link,Link,
7681,application_1589299642358_2177,pyspark,idle,Link,Link,
7683,application_1589299642358_2179,pyspark,idle,Link,Link,


## Imports and helper functions

In [2]:
import pickle
import json
import networkx as nx
import pandas as pd
from pyspark.sql.functions import col

"""
To load (or reload) into hdfs:
hdfs dfs -rm /user/${JUPYTERHUB_USER}/dijkstra_algorithms.py 2>/dev/null
hdfs dfs -copyFromLocal notebooks/dijkstra_algorithms.py /user/${JUPYTERHUB_USER}/
"""
from dijkstra_algorithms import *

MAX_TRIP_DURATION = 2  # width of the search window, in hours

days_dict = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday'}
def day_trips(*day_ids):
    """
    day_trips: gives the trip_ids whose service satisfies the filter
    "<day> and <day> and ..." on the calendar table

    input: one or more day ids (keys of days_dict, 0 = monday ... 4 = friday)
    output: spark dataframe with a single trip_id column

    Reads the notebook-level `calendar` and `trips` dataframes, which must be
    loaded before the first call.

    raises:
    - ValueError if no day id is given (the filter expression would be empty)
    - KeyError if a day id is not a key of days_dict
    """
    if not day_ids:
        # An empty where-clause would only fail later, inside Spark's SQL parser
        raise ValueError('day_trips() needs at least one day id')

    days = [days_dict[day_id] for day_id in day_ids]
    # The day columns are and-ed together: every requested day must hold
    where_clause = " and ".join(days)

    day_services = calendar.where(where_clause).select('service_id')
    return day_services.join(trips, on='service_id').select('trip_id')

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
7703,application_1589299642358_2199,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
%%local
import os
import pandas as pd
# JupyterHub user name, read from the environment of the local kernel;
# raises KeyError if JUPYTERHUB_USER is not set
username = os.environ['JUPYTERHUB_USER']

In [4]:
%%send_to_spark -i username -t str -n username

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'username' as 'username' to Spark kernel

## Load graph data

In [5]:
# Timetable tables; day_trips() reads both of them as globals
trips = spark.read.format('orc').load('/data/sbb/timetables/orc/trips/000000_0')
calendar = spark.read.format('orc').load('/data/sbb/timetables/orc/calendar/000000_0')

# Graph inputs stored in the current user's HDFS directory
nodes_df = spark.read.orc("/user/{}/nodes.orc".format(username))
edges_df = spark.read.orc("/user/{}/edges_with_mean_and_std_sec.orc".format(username))

# The path is read as text and only its first line is parsed as JSON.
# NOTE(review): JSON object keys are always strings, so the outer keys here are
# e.g. "0.98" rather than the float 0.98 - confirm dijkstra_with_time expects that.
durations_dicts = json.loads(sc.textFile('/user/{}/durations_for_confidence_.json'.format(username)).collect()[0])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Collect the nodes to the driver as (key, attributes) pairs, later passed to
# graph.add_nodes_from.
# NOTE(review): the key is r[0], the first column of nodes_df - presumably the
# stop id; confirm against the schema of nodes.orc.
nodes = nodes_df.rdd.map(lambda r: (r[0], {'name': r['stop_name'],
                                              'lat': r['stop_lat'],
                                              'lon': r['stop_lon']})).collect()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
%%local
# Walking edges, loaded from a pickle file relative to the notebook's working directory.
# NOTE(review): unpickling executes arbitrary code - only load a file produced by this project.
walking_times = pd.read_pickle('walking_edges.pickle')

In [9]:
%send_to_spark -i walking_times -t df -m 20000

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'walking_times' as 'walking_times' to Spark kernel

In [10]:
#reverse edges
#edges_walking = (walking_times.withColumnRenamed('source', 'temp')
#                 .withColumnRenamed('target', 'source')
#                 .withColumnRenamed('temp', 'target').toPandas())
# Turn the walking edges into (source, target, attrs) tuples, the same shape as
# the timetable edges built in create_edges_for_trip, so that both lists can be
# concatenated and handed to graph.add_edges_from.
# Every walking edge gets 'time': -1 instead of a departure time.
walking_pdf = walking_times.toPandas()
# list(<generator>) rather than a list comprehension: on Python 2 a
# comprehension would leak its loop variables into the notebook namespace.
edges_walking = list(
    (source, target, {'time': -1, 'duration': duration})
    for source, target, duration in zip(walking_pdf['source'].tolist(),
                                        walking_pdf['target'].tolist(),
                                        walking_pdf['walk_duration'].tolist())
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Create duration dictionaries if needed

The code below is kept as text instead of an executable cell: we do not have permission to **change** an existing file, so re-running it only works if the output is written to another path.

durations_dicts = {}
edge_and_data_tuple = zip(graph.edges(keys=True),
              map(lambda x: x[2], graph.edges(data=True)))
edge_and_data_tuple = filter(lambda x: 'mean' in x[1] and 'std' in x[1], edge_and_data_tuple)
for c in [0.25, 0.5, 0.75, 0.9, 0.95, 0.98, 0.99]:
    durations_dicts[c] = {e: {'duration': data['mean'] + compute_delay_uncertainty(data['mean'], 
                                                                                            data['std'], 
                                                                                            c)
                                       if data['mean'] != None and data['std'] != None
                                       else data['duration']
                                      } for e, data in edge_and_data_tuple}
    
""" Conversion to json """
durations_dicts_for_json = {}
for c in durations_dicts.keys():
    durations_dicts_for_json[c] = {str(k): v for k, v in durations_dicts[c].items()}

print('Length of json:', len(json.dumps(durations_dicts_for_json))) -> 30106955

""" Save to hdfs """
sc.parallelize([json.dumps(durations_dicts_for_json)]).coalesce(1).saveAsTextFile('/user/{}/durations_for_confidence_.json'.format(username))

## Choose time of arrival

In [12]:
# Query parameters: day_id is a key of days_dict (4 -> 'friday'),
# arrival_hour / arrival_minute give the time of day (here 12:30)
day_id, arrival_hour, arrival_minute = 4, 12, 30

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Create graph

In [13]:
def create_edges_for_trip(edges_df, day_id, arrival_time):
    """
    create_edges_for_trip: builds the edge list of the graph for one day and one arrival time
    @input:
    - edges_df: spark dataframe of timetable edges (columns trip_id, stop_id, next_stop,
                departure_time, arrival_time, trip_duration, mean, std)
    - day_id: id of the week-day (key of days_dict, e.g. wednesday is 2)
    - arrival_time: latest allowed arrival, in the unit of the departure_time / arrival_time
                    columns (the notebook passes hour*60 + minute)
    @output: python list of (stop_id, next_stop, attributes) tuples for the selected
             timetable edges, followed by all the walking edges (edges_walking)
    """
    # Lower bound of the window: MAX_TRIP_DURATION hours before the arrival time
    earliest_departure = arrival_time - 60*MAX_TRIP_DURATION

    def to_edge(row):
        # One timetable row -> one (source, target, attributes) edge
        attributes = {'duration': row['trip_duration'],
                      'time': float(row['departure_time']),
                      'trip_id': row['trip_id'],
                      'mean': row['mean'],
                      'std': row['std']}
        return (row['stop_id'], row['next_stop'], attributes)

    # Departure strictly after the lower bound, arrival no later than requested
    inside_window = ((col('departure_time') > earliest_departure) &
                     (col('arrival_time') <= arrival_time))

    # Restrict to the trips operating on that day, then to the time window
    selected_df = edges_df.join(day_trips(day_id), on='trip_id').filter(inside_window)

    timetable_edges = selected_df.rdd.map(to_edge).collect()
    return timetable_edges + edges_walking

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# Build the edge list for the chosen day; the arrival time is passed as hour*60 + minute
edges = create_edges_for_trip(edges_df, day_id, arrival_hour*60+arrival_minute)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# Directed multigraph: two stops can be linked by several parallel edges
# (one per timetable connection, plus possibly a walking edge)
graph = nx.MultiDiGraph()
graph.add_nodes_from(nodes)
graph.add_edges_from(edges)

old_number_of_nodes = graph.number_of_nodes()
# Remove unreachable nodes: every node missing from the distances returned by
# normal_dijkstra for '8503000' is dropped from the graph.
# NOTE(review): normal_dijkstra comes from dijkstra_algorithms.py; whether the
# distances are measured from or towards '8503000' cannot be told from here.
dists, paths = normal_dijkstra(graph, '8503000')
not_reachable = set(graph.nodes) - set(dists.keys())
_ = graph.remove_nodes_from(list(not_reachable))
print('{} nodes removed'.format(old_number_of_nodes - graph.number_of_nodes()))

# Temporary workaround for the encoding problem of the stop names: NFKD
# decomposes the characters, then every non-ASCII code point is dropped
# by the 'ignore' error handler.
# NOTE(review): encode() returns a byte string; on Python 3 the names would become bytes.
import unicodedata
nodes_data = graph.nodes(data=True)
for n in graph.nodes:
    nodes_data[n]['name'] = unicodedata.normalize('NFKD', nodes_data[n]['name']).encode('ascii','ignore')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

18 nodes removed

## Run algorithm

In [16]:
# Tao's example (except for the departure time): '8503000' and '8591049'
# (printed as Zurich HB and Zurich, Auzelg in the cell output).
# The same query is run twice, the second time with a confidence level and the
# precomputed durations_dicts.
print('Without minimum confidence ->')
best_path1 = dijkstra_with_time(graph, '8503000', arrival_hour*60+arrival_minute, last_target='8591049')
print('\nWith minimum confidence ->')
best_path2 = dijkstra_with_time(graph, '8503000', arrival_hour*60+arrival_minute, last_target='8591049', confidence=0.98, durations_dicts=durations_dicts)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Without minimum confidence ->
Going from Zurich HB (8503000) to Zurich, Auzelg (8591049) in 29.00 minutes, departure at 10:30
             from         from_id              to           to_id  duration  total_duration departure_time   walk no_change mean_std_null
0       Zurich HB         8503000       Zurich HB  8503000:0:4...  2.135259        2.135259          10:30   True     False          True
1       Zurich HB  8503000:0:4...  Zurich Hard...     8503020:0:3  2.000000        9.000000          10:37  False     False         False
2  Zurich Hard...     8503020:0:3  Zurich Oerl...     8503006:0:8  5.000000       14.000000          10:39  False      True         False
3  Zurich Oerl...     8503006:0:8      Glattbrugg     8503310:0:3  2.000000       17.000000          10:45  False      True         False
4      Glattbrugg     8503310:0:3  Glattbrugg,...         8590620  3.063448       20.063448          10:47   True     False          True
5  Glattbrugg,...         8590620  Glattbrugg,

In [18]:
# From Triemli ('8503610') to Altstetten ('8591057'), first without and then
# with a confidence level (0.95) and the precomputed durations_dicts
print('Without minimum confidence ->')
best_path1 = dijkstra_with_time(graph, '8503610', arrival_hour*60+arrival_minute, last_target='8591057')
print('\nWith minimum confidence ->')
best_path2 = dijkstra_with_time(graph, '8503610', arrival_hour*60+arrival_minute, last_target='8591057', confidence=0.95, durations_dicts=durations_dicts)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Without minimum confidence ->
Going from Zurich, Triemli (8503610) to Zurich Altstetten, Bahnhof N (8591057) in 17.14 minutes, departure at 10:30
             from  from_id              to    to_id  duration  total_duration departure_time   walk no_change mean_std_null
0  Zurich, Tri...  8503610  Zurich, In ...  8591214  1.000000        2.000000          10:31  False     False         False
1  Zurich, In ...  8591214  Zurich, Gol...  8591163  1.000000        3.000000          10:32  False      True         False
2  Zurich, Gol...  8591163  Zurich, Alb...  8591036  2.000000        5.000000          10:33  False      True         False
3  Zurich, Alb...  8591036  Zurich, Alb...  8591037  0.000000        5.000000          10:35  False      True         False
4  Zurich, Alb...  8591037  Zurich, Unt...  8591408  2.000000        7.000000          10:35  False      True         False
5  Zurich, Unt...  8591408  Zurich, Rau...  8591311  1.000000        8.000000          10:37  False      True 

## Cells to keep

In [40]:
# Weird attributes? Print the attribute dict of one timetable edge
# (parallel edge with key 0 between the two given nodes) to inspect it by hand
print(graph.get_edge_data('8503000:0:41/42', '8503020:0:3', 0))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{'duration': 2.0, 'std': 1.05697167217, 'time': 682.0, 'trip_id': u'234.TA.26-15-j19-1.41.H', 'mean': 0.27319172912666667}

In [29]:
# Proportion of null mean or std in non-walking edges
# (non-walking edges are the ones carrying both a 'mean' and a 'std' attribute).
# The edges are traversed once, and list(<generator>) is used so that the code
# runs on Python 2 and 3 alike without leaking loop variables into the notebook.
timed_edge_attrs = list(e[2] for e in graph.edges(data=True)
                        if 'mean' in e[2] and 'std' in e[2])
null_stat_count = sum(1 for attrs in timed_edge_attrs
                      if attrs['mean'] is None or attrs['std'] is None)
# NaN instead of a ZeroDivisionError when the graph has no timed edge at all
null_stat_count / float(len(timed_edge_attrs)) if timed_edge_attrs else float('nan')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0.046632828786368166