# Run:
Only notebook to run ? 

In [1]:
%%configure
{"pyFiles": ["/user/gottraux/dijkstra_algorithms.py"],
 "conf": {
    "spark.app.name": "dslab-group_final"
}}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
8869,application_1589299642358_3401,pyspark,idle,Link,Link,
8874,application_1589299642358_3406,pyspark,idle,Link,Link,
8875,application_1589299642358_3407,pyspark,idle,Link,Link,
8877,application_1589299642358_3409,pyspark,idle,Link,Link,
8880,application_1589299642358_3412,pyspark,idle,Link,Link,
8883,application_1589299642358_3415,pyspark,idle,Link,Link,
8884,application_1589299642358_3416,pyspark,busy,Link,Link,
8885,application_1589299642358_3417,pyspark,busy,Link,Link,
8886,application_1589299642358_3418,pyspark,busy,Link,Link,
8888,application_1589299642358_3420,pyspark,busy,Link,Link,


### Imports and helper functions:

In [2]:
import pickle
import json
import networkx as nx
import pandas as pd
from pyspark.sql.functions import col

"""
To load (or reload) into hdfs:
hdfs dfs -rm /user/${JUPYTERHUB_USER}/dijkstra_algorithms.py 2>/dev/null
hdfs dfs -copyFromLocal notebooks/dijkstra_algorithms.py /user/${JUPYTERHUB_USER}/
"""
from dijkstra_algorithms import *

MAX_TRIP_DURATION = 2 #duration in hour 

days_dict = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday'}
def day_trips(*day_ids):
    """
    day_trips: gives the trip_ids that operate on certain days
    input: a variable number of day ids
    output:s spark dataframe with trip_ids
    
    """
    days = [days_dict[day_id] for day_id in day_ids]
    where_clause = " and ".join(days)

    day_services = calendar.where(where_clause).select('service_id')
    return day_services.join(trips, on='service_id').select('trip_id')

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
8906,application_1589299642358_3438,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
%%local
import os
import pandas as pd
username = os.environ['JUPYTERHUB_USER']
#username = 'gottraux'

In [4]:
%%send_to_spark -i username -t str -n username

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'username' as 'username' to Spark kernel

## Load graph data

In [5]:
trips = spark.read.format('orc').load('/data/sbb/timetables/orc/trips/000000_0')
calendar = spark.read.format('orc').load('/data/sbb/timetables/orc/calendar/000000_0')
stop_times = spark.read.format('orc').load('/data/sbb/timetables/orc/stop_times/000000_0')
stops = spark.read.format('orc').load('/data/sbb/timetables/orc/stops/000000_0')

nodes_df = spark.read.orc("/user/{}/nodes.orc".format(username))
edges_df = spark.read.orc("/user/{}/edges_with_mean_and_std_sec.orc".format(username))

#durations_dicts = json.loads(sc.textFile('/user/{}/durations_for_confidence_.json'.format(username)).collect()[0])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
edges_df.show(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-------+----------+------------+--------------+---------+-------------+----+----+
|             trip_id|stop_id|train_type|arrival_time|departure_time|next_stop|trip_duration|mean| std|
+--------------------+-------+----------+------------+--------------+---------+-------------+----+----+
|30.TA.30-24-Y-j19...|8503111|       Bus|         496|           496|  8503102|          4.0|null|null|
|34.TA.30-24-Y-j19...|8503111|       Bus|         539|           539|  8503102|          4.0|null|null|
|31.TA.30-31-Y-j19...|8503111|       Bus|         532|           532|  8503103|          5.0|null|null|
+--------------------+-------+----------+------------+--------------+---------+-------------+----+----+
only showing top 3 rows

In [7]:
edges_df = edges_df.withColumnRenamed('stop_id', 'temp').withColumnRenamed('next_stop', 'stop_id').withColumnRenamed('temp', 'next_stop')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
edges_df.show(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------+----------+------------+--------------+-------+-------------+----+----+
|             trip_id|next_stop|train_type|arrival_time|departure_time|stop_id|trip_duration|mean| std|
+--------------------+---------+----------+------------+--------------+-------+-------------+----+----+
|30.TA.30-24-Y-j19...|  8503111|       Bus|         496|           496|8503102|          4.0|null|null|
|34.TA.30-24-Y-j19...|  8503111|       Bus|         539|           539|8503102|          4.0|null|null|
|31.TA.30-31-Y-j19...|  8503111|       Bus|         532|           532|8503103|          5.0|null|null|
+--------------------+---------+----------+------------+--------------+-------+-------------+----+----+
only showing top 3 rows

In [9]:
nodes = nodes_df.rdd.map(lambda r: (r[0], {'name': r['stop_name'],
                                              'lat': r['stop_lat'],
                                              'lon': r['stop_lon']})).collect()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
%%local
walking_times = pd.read_pickle('walking_edges.pickle')

In [11]:
%send_to_spark -i walking_times -t df -m 20000

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'walking_times' as 'walking_times' to Spark kernel

In [12]:
# reverse edges:
walking_times = (walking_times.withColumnRenamed('source', 'temp').withColumnRenamed('target', 'source').withColumnRenamed('temp', 'target'))

edges_walking = walking_times.toPandas()
edges_walking['attrs'] = edges_walking.apply(lambda x: {'time': -1, 'duration': x['walk_duration']}, axis=1)
edges_walking = list(edges_walking[['source', 'target', 'attrs']].to_numpy())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
walking_times.show(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------+---------------+-------------+
|         target|         source|walk_duration|
+---------------+---------------+-------------+
|8503000:0:41/42|8503000:0:43/44|          7.0|
|8503000:0:41/42|   8503000:0:14|          7.0|
|8503000:0:41/42|   8503000:0:16|          7.0|
+---------------+---------------+-------------+
only showing top 3 rows

Remove dics from dijkstra time for the moment and make it return the mean and std as well because we need it to validate. 

## Choose time of arrival

In [14]:
day_id, arrival_hour, arrival_minute = 4, 12, 30

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Create graph

In [15]:
def create_edges_for_trip(edges_df, day_id, arrival_time):
    """
    create_edges_for_trip: constructs edges (and thus trips) that exist in a window of two hours before a given input time
    @input:
    - edges_df: df from which we construct the edges
    - day_id: id of week-day (e.g. wednesday is day id 2, see dictionnary above)
    - hour, minute: time at which we want to arrive somewhere (e.g. 11:30)
    @output: data frame of selected edges
    """
    #select only the trips that occur on that day:
    edges_df= edges_df.join(day_trips(day_id), on='trip_id')
    
    min_dep_time = arrival_time - 60*MAX_TRIP_DURATION
    
    #keep only those in a window of two hours:
    edges_df = edges_df.filter((col('departure_time') > min_dep_time) & 
                                            (col('arrival_time') <= arrival_time))
    
    edges = edges_df.rdd.map(lambda r: (r['stop_id'], r['next_stop'], {'duration': r['trip_duration'],
                                                                       'time': float(r['departure_time']),
                                                                       'trip_id': r['trip_id'],
                                                                       'mean': r['mean'],
                                                                       'std': r['std']})).collect()
    
    return edges + edges_walking

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
edges = create_edges_for_trip(edges_df, day_id, arrival_hour*60+arrival_minute)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
graph = nx.MultiDiGraph()
graph.add_nodes_from(nodes)
graph.add_edges_from(edges)

old_number_of_nodes = graph.number_of_nodes()
# Remove unreachable nodes
dists, paths = normal_dijkstra(graph, '8503000')
not_reachable = set(graph.nodes) - set(dists.keys())
_ = graph.remove_nodes_from(list(not_reachable))
print('{} nodes removed'.format(old_number_of_nodes - graph.number_of_nodes()))

# Temp for problem of name's encoding
import unicodedata
nodes_data = graph.nodes(data=True)
for n in graph.nodes:
    nodes_data[n]['name'] = unicodedata.normalize('NFKD', nodes_data[n]['name']).encode('ascii','ignore')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

18 nodes removed

In [18]:
# Check if reversed, should be non empty:
graph.get_edge_data('8572602','8502553')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{0: {'duration': 2.0, 'std': None, 'time': 643.0, 'trip_id': u'3.TA.1-231-j19-1.1.H', 'mean': None}, 1: {'duration': 2.0, 'std': None, 'time': 703.0, 'trip_id': u'5.TA.1-231-j19-1.1.H', 'mean': None}, 2: {'duration': 2.0, 'std': None, 'time': 673.0, 'trip_id': u'4.TA.1-231-j19-1.1.H', 'mean': None}, 3: {'duration': 2.0, 'std': None, 'time': 733.0, 'trip_id': u'68.TA.1-231-j19-1.9.H', 'mean': None}, 4: {'duration': 2.0, 'std': None, 'time': 721.0, 'trip_id': u'40.TA.1-231-j19-1.3.H', 'mean': None}}

## dijkstra:

In [19]:
def dijkstra_with_time(G, first_source, arrival_time, last_target, confidence=None, 
                       confidence_step=0.01, durations_dicts=None, paths=None, departure_time = None):
    G = G.copy()
    #departure_time = arrival_time - MAX_TRIP_DURATION*60
    departure_time = departure_time
    while True:
        """
        # Update durations according to confidence
        if confidence != None:
            if durations_dicts == None:
                raise ValueError('You must pass durations_dicts for the confidence.')
            # Load dict with modifications
            if confidence not in durations_dicts:
                edge_and_data_tuple = zip(G.edges(keys=True), 
                              map(lambda x: x[2], G.edges(data=True)))
                edge_and_data_tuple = filter(lambda x: 'mean' in x[1] and 'std' in x[1], edge_and_data_tuple)
                durations_dicts[confidence] = {e: {'duration': data['mean'] + compute_delay_uncertainty(data['mean'], 
                                                                                                        data['std'], 
                                                                                                        confidence)
                                                   if data['mean'] != None and data['std'] != None
                                                   else data['duration']
                                                  } for e, data in edge_and_data_tuple}
            
            # Update graph
            nx.set_edge_attributes(G, durations_dicts[confidence])
        """
        
        if not G.is_directed():
            raise ValueError('Input graph is not directed while it should be.')

        G_succ = G.succ 
        
        # paths stores the nodes in dijkstra's shortest path
        paths = {first_source: [first_source]}
        
        # stores the edges in dijkstra's shortest path
        e_paths = {first_source: []}
        
        # dictionary of final distances to nodes
        dist = {}  
        
        # dictionnary of whether it's the first time a node is visited
        seen = {first_source: departure_time}

        # use heapq with (distance,label) tuples
        push = heappush
        pop = heappop
        c = count()
        fringe = []  
        
        # push the source as the first node on the heap
        push(fringe, (departure_time, next(c), first_source))

        # while heap not empty
        while fringe:
            
            # take the node to look at: 
            (d, _, source) = pop(fringe)

            # check if node has already been looked at and has a final shortest distance: 
            if source in dist:
                continue  # already searched this node so go to another

            # take the distance to the node from the heap 
            # source starts with distance = departure_time
            dist[source] = d

            #stop if the source is the last_target. 
            if source == last_target:
                break

            # Look at all direct descendents from the source node: 
            for target, edges in G_succ[source].items():
                # Because it's a multigraph, need to look at all edges between two nodes:
                for edge_id in edges:
                    
                    # Check if walking edge: 
                    # walking edges have a departure time of -1
                    dep_time_edge = G.get_edge_data(source, target, edge_id)['time']
                    
                    if dep_time_edge == -1:
                        walking_edge = True
                        current_trip_id = None
                        # set the departure time to the distance to that node as we can leave immediatly
                        dep_time_edge = d
                    else:
                        walking_edge = False
                        current_trip_id = G.get_edge_data(source, target, edge_id)['trip_id']
                        
                    # take only edges that have a departure time bigger 
                    # than the time it takes to get to the node
                    if dep_time_edge < dist[source]:
                       # move on to next edge if it's earlier 
                        continue
                        
                    # Check if edge is feasible (also accoring to confidence)
                    # Check if last edge taken was not a walking edge
                    # Check if there is at least a path of length 1 to the source node 
                    # (e.g. that this node is not the original source)
                    if len(e_paths[source]) >= 1 and not e_paths[source][-1][2]['walk']:
                        last_edge_source, last_edge_target, last_edge_info = e_paths[source][-1]
                        last_delay = compute_delay_uncertainty(last_edge_info['mean'], 
                                                                   last_edge_info['std'], 
                                                                   confidence)
                        # If we make a transport-> walk change
                        if walking_edge:
                            # add delay to departure time of walk as we will leave later
                            dep_time_edge += last_delay
                        else:
                            # If we make a transport->transport change, check if we have time to change
                            # To change we need that the next connection leaves >= 2 min + delay of transport
                            # If not we cannot take that edge
                            if current_trip_id != last_edge_info['trip_id']\
                            and dep_time_edge < dist[source] + 2 + last_delay:
                                continue

                    # Get the duration between two nodes:
                    duration_cost = G.get_edge_data(source, target, edge_id)['duration']
                    
                    if duration_cost is None:
                            raise ValueError('Edge without a duration.')

                    # Add the weight to the current distance to a node
                    current_dist = dep_time_edge + duration_cost

                    # if target has already been visited once and has a final distance:
                    if target in dist:
                            # if we find a distance smaller than the actual distance in dic
                            # raise error because dic distances contains only final distances
                            if current_dist < dist[target]:
                                raise ValueError('Contradictory paths found:',
                                                     'negative weights?')

                    # either node has been seen before or the current distance is smaller than the 
                    # proposed distance in seen[target]:
                    elif target not in seen or current_dist < seen[target]:
                        # update the seen distance
                        seen[target] = current_dist
                        # push it onto the heap so that we will look at its descendants later
                        push(fringe, (current_dist, next(c), target))

                        # update the paths till target:
                        if paths is not None:
                            edge_dict = G.get_edge_data(source, target, edge_id)
                            
                            edge_dict['walk'] = walking_edge
                            edge_dict['departure_time'] = dep_time_edge
                            
                            e_paths[target] = e_paths[source] + [(source, target, edge_dict)]


        # If there is no path to the last_target:
        if  last_target not in e_paths:
            print('Error: No paths to the source')
            return pd.DataFrame(columns=['from', 'from_id', 'to', 'to_id', 'duration', 'total_duration',
                                         'departure_time', 'walk', 'no_change', 'mean_std_null','mean','std'])

        
        # Validation: 
        if confidence == None or validate_path(e_paths[last_target], confidence, G):
            break
        else:
            # else increase confidence by a confidence step and start again: 
            confidence += confidence_step
            
    # Path validated
    if paths is not None:
        nodes_data = G.nodes(data=True)
        arrival_string = minute_to_string(dist[last_target])
        best_path = e_paths[last_target]
        departure_string = minute_to_string(best_path[0][2]['departure_time'])
        print('Going from {} ({}) to {} ({}) in {:.2f} minutes, departure at {}'.format(nodes_data[first_source]['name'],
                                                                                      first_source,
                                                                                      nodes_data[last_target]['name'],
                                                                                      last_target, 
                                                                                      dist[last_target] - departure_time,
                                                                                      minute_to_string(departure_time)))
        
        # Construct best path's data structure
        best_path_df = pd.DataFrame(columns=['from', 'from_id', 'to', 'to_id', 'duration', 'total_duration',
                                          'departure_time', 'walk', 'no_change', 'mean_std_null', 'mean','std'])
        last_edge_info = False
        for source, target, edge_info in best_path:
            no_change = ('trip_id' in edge_info                                   # We're in a transport
                         and last_edge_info and 'trip_id' in last_edge_info       # and last edge also
                         and last_edge_info['trip_id'] == edge_info['trip_id'])   # and same trip_id
            mean_std_null = 'trip_id' in edge_info and 'mean' not in edge_info or 'std' not in edge_info
            
            if not mean_std_null:
                mean = edge_info['mean']
                std = edge_info['std']
                if  edge_info['mean'] == None or  edge_info['std'] == None: 
                    mean = edge_info['duration']
                    std = 0
            if 'mean' not in edge_info or 'std' not in edge_info:
                mean = edge_info['duration']
                std = 0
                
            
            current_path_dict = {'from': nodes_data[source]['name'],
                                 'from_id': source, 
                                 'to': nodes_data[target]['name'], 
                                 'to_id': target, 
                                 'duration': edge_info['duration'], 
                                 'total_duration': dist[target] - departure_time,
                                 'departure_time': minute_to_string(edge_info['departure_time']), 
                                 'walk':edge_info['walk'], 
                                 'no_change': no_change, 
                                 'mean_std_null': mean_std_null,
                                'mean':mean,
                                'std':std}
            best_path_df = best_path_df.append(current_path_dict, ignore_index=True)
            last_edge_info = edge_info
        
        with pd.option_context('display.max_rows', None, 
                               'display.max_columns', None, 
                               'display.max_colwidth', 15,
                               'display.expand_frame_repr', False):
            print(best_path_df)
        return best_path_df
    raise ValueError('Should not be here')
    return dist

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [51]:
def dijkstra_reversed(G, first_source, arrival_time, last_target, confidence=None, 
                       confidence_step=0.01, durations_dicts=None):
    G = G.copy()
    departure_time = -arrival_time

    # inverse direction 
    temp = first_source
    first_source = last_target
    last_target = temp
        
    if not G.is_directed():
        raise ValueError('Input graph is not directed while it should be.')

    G_succ = G.succ 
        
    # paths stores the nodes in dijkstra's shortest path
    paths = {first_source: [first_source]}
    # stores the edges in dijkstra's shortest path
    e_paths = {first_source: []}
        
    # dictionary of final distances to nodes
    dist = {}  
        
    # dictionnary of whether it's the first time a node is visited
    seen = {first_source: departure_time}

    # use heapq with (distance,label) tuples
    push = heappush
    pop = heappop
    c = count()
    fringe = []  
        
    # push the source as the first node on the heap
    push(fringe, (departure_time, next(c), first_source))
    # while heap not empty
    while True:
        while fringe:
            # take the node to look at: 
            (d, _, source) = pop(fringe)

             # check if node has already been looked at and has a final shortest distance: 
            if source in dist:
                  continue  # already searched this node so go to another

            # take the distance to the node from the heap 
            # source starts with distance = departure_time
            dist[source] = d

            #stop if the source is the last_target. 
            if source == last_target:
                break

            # Look at all direct descendents from the source node: 
            for target, edges in G_succ[source].items():
                # Because it's a multigraph, need to look at all edges between two nodes:
                for edge_id in edges:  
                    # Check if walking edge: 
                    # walking edges have a departure time of -1
                    dep_time_edge = - G.get_edge_data(source, target, edge_id)['time']

                    # Get the duration between two nodes:
                    duration_cost = G.get_edge_data(source, target, edge_id)['duration']

                    if dep_time_edge == 1:
                        walking_edge = True
                        current_trip_id = None
                        # set the departure time to the 
                        # distance to that node as we can leave immediatly
                        dep_time_edge = d + duration_cost                  

                    else:
                        walking_edge = False
                        current_trip_id = G.get_edge_data(source, target, edge_id)['trip_id']


                    if duration_cost is None:
                            raise ValueError('Edge without a duration.')

                    # Add the weight to the current distance to a node

                    current_dist = dep_time_edge           
                    # take only edges that have a departure time bigger 
                    # than the time it takes to get to the node

                    if dep_time_edge - duration_cost < dist[source]:
                            # move on to next edge if it's earlier 
                        continue

                    # Check if edge is feasible (also accoring to confidence)
                    # Check if last edge taken was not a walking edge
                    # Check if there is at least a path of length 1 to the source node 
                    # (e.g. that this node is not the original source)
                    if len(e_paths[source]) >= 1 and not walking_edge:
                        last_edge_source, last_edge_target, last_edge_info = e_paths[source][-1]
                        mean = G.get_edge_data(source, target, edge_id)['mean']
                        std = G.get_edge_data(source, target, edge_id)['std']
                        # now compute delay for current edge because in reverse
                        last_delay = compute_delay_uncertainty(mean, std, confidence)

                        # If we make a transport-> walk change
                        if last_edge_info['walk']:
                            # add delay to departure time of walk as we will leave later
                            dep_time_edge -= last_delay
                        else:
                            # If we make a transport->transport change, check if we have time to change
                            # To change we need that the next connection leaves >= 2 min + delay of transport
                            # If not we cannot take that edge
                            #if current_trip_id != last_edge_info['trip_id']\
                            #and dep_time_edge < dist[source] + 2 + last_delay:
                            #   continue
                            if current_trip_id != last_edge_info['trip_id'] and dep_time_edge - last_delay - 2 - duration_cost < dist[source]:
                                continue                

                    # if target has already been visited once and has a final distance:
                    if target in dist:
                            # if we find a distance smaller than the actual distance in dic
                            # raise error because dic distances contains only final distances
                            if current_dist < dist[target]:
                                raise ValueError('Contradictory paths found:','negative weights?')

                    # either node has been seen before or the current distance is smaller than the 
                    # proposed distance in seen[target]:
                    if target not in seen or current_dist < seen[target]:

                        # update the seen distance
                        seen[target] = current_dist
                        # push it onto the heap so that we will look at its descendants later
                        push(fringe, (current_dist, next(c), target))

                        # update the paths till target:
                        if paths is not None:

                            edge_dict = G.get_edge_data(source, target, edge_id)

                            edge_dict['walk'] = walking_edge
                            edge_dict['departure_time'] = dep_time_edge
                            e_paths[target] = e_paths[source] + [(source, target, edge_dict)]    
        
        # If there is no path to the last_target:
        if  last_target not in e_paths:
            print('Error: No paths to the source')
            return pd.DataFrame(columns=['from', 'from_id', 'to', 'to_id', 'duration', 'total_duration',
                                             'departure_time', 'walk', 'no_change', 'mean_std_null','mean','std'])


        # Validation: 
        if confidence == None or validate_path(e_paths[last_target], confidence, G):
            break
        else:
            # else increase confidence by a confidence step and start again: 
            confidence += confidence_step
        # Path validated
    if paths is not None:
        nodes_data = G.nodes(data=True)
        arrival_string = minute_to_string(dist[last_target])
        
        #reverse path:
        best_path = []
        for edge in e_paths[last_target][::-1]: 
            edge = (edge[1], edge[0], edge[2])
            best_path.append(edge)           
        
        departure_string = minute_to_string(-best_path[0][2]['departure_time'])
        print('Going from {} ({}) to {} ({}) in {:.2f} minutes, departure at {}'.format(nodes_data[first_source]['name'],
                                                                                      first_source,
                                                                                      nodes_data[last_target]['name'],
                                                                                      last_target, 
                                                                                      arrival_time + best_path[0][2]['departure_time'] ,
                                                                                      minute_to_string(-best_path[0][2]['departure_time'])))
        
        # Construct best path's data structure
        best_path_df = pd.DataFrame(columns=['from', 'from_id', 'to', 'to_id', 'duration', 'total_duration',
                                          'departure_time', 'walk', 'no_change', 'mean_std_null', 'mean','std'])
        last_edge_info = False
        for source, target, edge_info in best_path:
            no_change = ('trip_id' in edge_info                                   # We're in a transport
                         and last_edge_info and 'trip_id' in last_edge_info       # and last edge also
                         and last_edge_info['trip_id'] == edge_info['trip_id'])   # and same trip_id
            mean_std_null = 'trip_id' in edge_info and 'mean' not in edge_info or 'std' not in edge_info
            
            if not mean_std_null:
                mean = edge_info['mean']
                std = edge_info['std']
                if  edge_info['mean'] == None or  edge_info['std'] == None: 
                    mean = edge_info['duration']
                    std = 0
            if 'mean' not in edge_info or 'std' not in edge_info:
                mean = edge_info['duration']
                std = 0
                
            
            current_path_dict = {'from': nodes_data[source]['name'],
                                 'from_id': source, 
                                 'to': nodes_data[target]['name'], 
                                 'to_id': target, 
                                 'duration': edge_info['duration'], 
                                 'total_duration': dist[target],
                                 'departure_time': minute_to_string(-edge_info['departure_time']), 
                                 'walk':edge_info['walk'], 
                                 'no_change': no_change, 
                                 'mean_std_null': mean_std_null,
                                'mean':mean,
                                'std':std}
            best_path_df = best_path_df.append(current_path_dict, ignore_index=True)
            last_edge_info = edge_info
        
        #with pd.option_context('display.max_rows', None, 
        #                       'display.max_columns', None, 
        #                       'display.max_colwidth', 15,
        #                       'display.expand_frame_repr', False):
        #        print(best_path_df)
        return best_path_df
    raise ValueError('Should not be here')
    return e_paths[last_target], dist[last_target]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [52]:
source = '8503006:0:8'
target = '8503006:0:2'

day_id, arrival_hour, arrival_minute = 4, 12, 30
print(arrival_hour*60+arrival_minute)
path = dijkstra_reversed(graph, source, arrival_hour*60+arrival_minute, 
                               last_target=target, confidence=0.98)

with pd.option_context('display.max_rows', None, 
                               'display.max_columns', None, 
                               'display.max_colwidth', 15,
                               'display.expand_frame_repr', False):
    print(path)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

750
Going from Zurich Oerlikon (8503006:0:2) to Zurich Oerlikon (8503006:0:8) in 3.00 minutes, departure at 12:27
             from      from_id              to        to_id  duration total_duration departure_time  walk no_change mean_std_null  mean std
0  Zurich Oerl...  8503006:0:8  Zurich Oerl...  8503006:0:2       3.0           -750          12:27  True     False          True   3.0   0

In [42]:
source = '8503000'
target = '8591049'

day_id, arrival_hour, arrival_minute = 4, 12, 30
print(arrival_hour*60+arrival_minute)
path = dijkstra_reversed(graph, source, arrival_hour*60+arrival_minute, 
                               last_target=target, confidence=0.98)

with pd.option_context('display.max_rows', None, 
                               'display.max_columns', None, 
                               'display.max_colwidth', 15,
                               'display.expand_frame_repr', False):
    print(path)    

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

750
Going from Zurich, Auzelg (8591049) to Zurich HB (8503000) in 25.14 minutes, departure at 12:04
             from         from_id              to           to_id  duration  total_duration departure_time   walk no_change mean_std_null      mean      std
0       Zurich HB         8503000       Zurich HB  8503000:0:4...  2.135259       23.000000          12:04   True     False          True  2.135259        0
1       Zurich HB  8503000:0:4...  Zurich Hard...     8503020:0:3  2.000000       21.000000          12:07  False     False         False  0.336499  1.09469
2  Zurich Hard...     8503020:0:3  Zurich Oerl...     8503006:0:8  5.000000       15.000000          12:09  False      True         False  5.000000        0
3  Zurich Oerl...     8503006:0:8      Glattbrugg     8503310:0:3  2.000000       10.063448          12:15  False      True         False  2.000000        0
4      Glattbrugg     8503310:0:3  Glattbrugg,...         8590620  3.063448        7.000000          12:19   True  

In [106]:
source = '8582462'
target = '8572602'

day_id, arrival_hour, arrival_minute = 4, 12, 30
print('Arrival time {}'.format(arrival_hour*60+arrival_minute))

path = dijkstra_reversed(mini_graph, source, arrival_hour*60+arrival_minute, 
                               last_target=target, confidence=0.98)
print(path)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Arrival time 750
-750
('8582462', '8572602')
('8572602', '8582462')
Source was last target
([('8572602', '8502553', {'std': 0, 'walk': False, 'time': 583.0, 'duration': 2.0, 'trip_id': '1.TA.1-231-j19-1.1.H', 'departure_time': -583.0, 'mean': 0}), ('8502553', '8582462', {'std': 0, 'walk': False, 'time': 578.0, 'duration': 2.0, 'trip_id': '1.TA.1-231-j19-1.2.H', 'departure_time': -578.0, 'mean': 0})], -578.0)

In [22]:
from geopy.distance import distance as geo_distance
import networkx as nx
from geopy.distance import distance as geo_distance
from pyspark.sql import Row
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType
from networkx.algorithms.shortest_paths.weighted import dijkstra_path

from heapq import heappush, heappop
from itertools import count

def zurich_distance(x, y):
    """zurich_distance: returns the distance of a station to Zurich HB
    @input: (lat,lon) of a station
    @output: distance in km to Zurich HB
    """
    return geo_distance(47.3781762039461, 8.54019357578468, (x,y)).km

# Filter stop_times to be only in 08:00-19:59:
stop_times = stop_times.where((col('departure_time') >= '08:00:00') 
                              & (col('departure_time') <= '19:59:59'))

# filter stops:
stops_distance = stops.rdd.map(lambda x: (x['stop_id'], zurich_distance(x['stop_lat'], x['stop_lon'])))
stops_distance = spark.createDataFrame(stops_distance.map(lambda r: Row(stop_id=r[0], 
                                                                        zurich_distance=r[1])))

stops_distance = stops_distance.filter(col('zurich_distance') <= 15)

# add distance to HB to stops info and keep only in radius of 15km
stops_zurich = stops_distance.join(stops, on='stop_id')

# keep only stop times in radius of 15km of Zurich
stop_times_zurich = stop_times.join(stops_distance.select('stop_id'), on='stop_id')

## create a simple graph: 
trip_id = '1.TA.1-231-j19-1.1.H'

#select four stops : 
stops_minig = ['8582462','8572600','8572601','8502553']

stop_times_info = stop_times_zurich.where((col('stop_id').isin(stops_minig))&(col('trip_id') == trip_id))

stops_info = stops_zurich.where(col('stop_id').isin(stops_minig))


mini_graph = nx.MultiDiGraph()

mini_nodes = stops_info.rdd.map(lambda r: (r[0], {'name': r['stop_name'],
                                              'lat': r['stop_lat'],
                                              'lon': r['stop_lon']})).collect()
mini_graph.add_nodes_from(mini_nodes)


# add artificial edge: 
mini_graph.add_edges_from([('8572600',  '8582462', {'duration': 2.0, 'time': 578.0, 'trip_id':  '1.TA.1-231-j19-1.1.H', 'mean':0, 'std':0}), 
                          ('8572600',  '8582462', {'duration': 1.0, 'time': 578.0, 'trip_id':  '1.TA.1-231-j19-1.1.H', 'mean':0, 'std':0}),
                          ( '8572601',  '8572600',{'duration': 0.0, 'time': 579.0, 'trip_id':  '1.TA.1-231-j19-1.1.H', 'mean':0, 'std':0}),
                          ( '8502553',  '8572601',{'duration': 4.0, 'time': 579.0, 'trip_id':  '1.TA.1-231-j19-1.1.H', 'mean':0, 'std':0}),
                          ( '8572602',  '8502553',{'duration': 2.0, 'time': 583.0, 'trip_id':  '1.TA.1-231-j19-1.1.H', 'mean':0, 'std':0}),])

mini_graph.add_edges_from([('8502553',  '8582462', {'duration': 2.0, 'time': 578.0, 'trip_id':  '1.TA.1-231-j19-1.2.H', 'mean':0, 'std':0})])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[0]

In [35]:
day_id, arrival_hour, arrival_minute = 4, 12, 30
print('Arrival time {}'.format(arrival_hour*60+arrival_minute))

path = dijkstra_reversed(graph, '8503000', arrival_hour*60+arrival_minute, last_target='8591049',
                                confidence = 0.98)
print(path)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Arrival time 750
Going from Zurich, Auzelg (8591049) to Zurich HB (8503000) in 25.14 minutes, departure at 12:04
                         from          from_id  ...      mean      std
0                   Zurich HB          8503000  ...  2.135259        0
1                   Zurich HB  8503000:0:41/42  ...  0.336499  1.09469
2           Zurich Hardbrucke      8503020:0:3  ...  5.000000        0
3             Zurich Oerlikon      8503006:0:8  ...  2.000000        0
4                  Glattbrugg      8503310:0:3  ...  3.063448        0
5         Glattbrugg, Bahnhof          8590620  ...  0.639793  1.40383
6  Glattbrugg, Lindberghplatz          8590626  ...  0.829374  1.32294
7        Glattpark, Glattpark          8591830  ...  0.891561   1.4187
8       Zurich, Fernsehstudio          8591128  ...  0.956600  1.35904

[9 rows x 12 columns]

In [74]:
best_path1 = dijkstra_with_time(graph, '8503000', arrival_hour*60+arrival_minute, last_target='8591049',
                                confidence = 0.98, departure_time = 717.0)
with pd.option_context('display.max_rows', None, 
                               'display.max_columns', None, 
                               'display.max_colwidth', 15,
                               'display.expand_frame_repr', False):
    print(best_path1)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Going from Zurich HB (8503000) to Zurich, Auzelg (8591049) in 47.47 minutes, departure at 11:57
             from       from_id              to         to_id   duration  total_duration departure_time   walk no_change mean_std_null       mean      std
0       Zurich HB       8503000       Zurich HB  8503000:0:31   4.164130        4.164130          11:57   True     False          True   4.164130        0
1       Zurich HB  8503000:0:31  Zurich Oerl...   8503006:0:3   5.000000       10.000000          12:02  False     False         False   5.000000        0
2  Zurich Oerl...   8503006:0:3  Zurich Oerl...       8591063   5.930877       15.930877          12:07   True     False          True   5.930877        0
3  Zurich Oerl...       8591063  Zurich, Leu...       8591256   2.000000       18.000000          12:13  False     False         False   1.405997  2.92055
4  Zurich, Leu...       8591256  Zurich, Hag...       8591172   1.000000       22.000000          12:18  False     False         

In [26]:
best_path2 = dijkstra_with_time(graph, '8503610', arrival_hour*60+arrival_minute, last_target='8591057', departure_time = 717.0)
with pd.option_context('display.max_rows', None, 
                               'display.max_columns', None, 
                               'display.max_colwidth', 15,
                               'display.expand_frame_repr', False):
    print(best_path2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Going from Zurich, Triemli (8503610) to Zurich Altstetten, Bahnhof N (8591057) in 52.36 minutes, departure at 11:57
             from  from_id              to    to_id   duration  total_duration departure_time   walk no_change mean_std_null       mean       std
0  Zurich, Tri...  8503610  Zurich, Sch...  8580912   1.000000        3.000000          11:59  False     False         False   0.788371  1.243114
1  Zurich, Sch...  8580912  Zurich, Im Gut  8591208   1.000000        6.000000          12:02  False     False         False   1.400469  1.407420
2  Zurich, Im Gut  8591208  Zurich, Hub...  8591203   1.000000       12.000000          12:08  False     False         False   1.388227  1.704782
3  Zurich, Hub...  8591203  Zurich, Alb...  8591035   2.000000       19.000000          12:14  False     False         False   1.843465  1.595908
4  Zurich, Alb...  8591035  Zurich, Flu...  8591134   1.000000       25.000000          12:21  False     False         False   1.769857  1.581397
5  Zuric

#### Arrival time:

In [27]:
def convertToMinute(s):
    h, m = s.split(':')
    h,m = int(h), int(m)
    
    return h*60+m

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
decrements = [10, 5, 2, 1, 0]
source = '8503000'
target = '8591122'
day_id, arrival_hour, arrival_minute = 4, 12, 30

arrival_time = arrival_hour*60+arrival_minute


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [29]:
decrements = [10, 5, 2, 1, 0]
source = '8503000'
target = '8591122'
day_id, arrival_hour, arrival_minute = 4, 12, 30

arrival_time = arrival_hour*60+arrival_minute

def depart_time(arrival_time, source, target, decrements):
    #start with two hours before:
    dep = arrival_time - MAX_TRIP_DURATION*60
    path = dijkstra_with_time(graph, source, arrival_hour*60+arrival_minute, 
                               last_target=target, confidence=0.98, departure_time = dep)
    num_edges = len(path['total_duration'])
    duration = path['total_duration'][num_edges-1]

    first_arrival = convertToMinute(path['departure_time'][num_edges-1]) + path['duration'][num_edges-1]
    arrival_diff = arrival_time - first_arrival
    
    departures = [dep]
    
    if arrival_diff < 0:
        raise ErrorValue('Arrives after wanted arrival time')
    for i in decrements:
        dep = arrival_time - duration - i
        path = dijkstra_with_time(graph, source, arrival_hour*60+arrival_minute, 
                               last_target=target, confidence=0.98, departure_time = dep)
        num_edges = len(path['total_duration'])
        duration = path['total_duration'][num_edges-1]

        first_arrival = convertToMinute(path['departure_time'][num_edges-1]) + path['duration'][num_edges-1]
        arrival_diff = arrival_time - first_arrival
        
        if arrival_diff < 0:
            return departures[0]
        
        departures.insert(0, dep)

        print('Start with {} minutes difference'.format(arrival_diff))
        print('Trip takes {} minutes'.format(duration))

        
    return departures[0]

dep = depart_time(arrival_time, source, target, decrements)
dep

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Going from Zurich HB (8503000) to Zurich, ETH Honggerberg (8591122) in 27.00 minutes, departure at 10:30
             from  from_id              to    to_id   duration  total_duration departure_time   walk no_change mean_std_null      mean      std
0       Zurich HB  8503000  Zurich, Hal...  8591174   9.300864        9.300864          10:30   True     False          True  9.300864        0
1  Zurich, Hal...  8591174  Zurich, ETH...  8591122  13.000000       27.000000          10:44  False     False         False  1.210631  1.28324
Going from Zurich HB (8503000) to Zurich, ETH Honggerberg (8591122) in 34.00 minutes, departure at 11:53
             from  from_id              to    to_id   duration  total_duration departure_time   walk no_change mean_std_null      mean      std
0       Zurich HB  8503000  Zurich, Hal...  8591174   9.300864        9.300864          11:53   True     False          True  9.300864        0
1  Zurich, Hal...  8591174  Zurich, ETH...  8591122  13.000000       3

In [30]:
dijkstra_with_time(graph, source, arrival_hour*60+arrival_minute, 
                               last_target=target, confidence=0.98, departure_time = 717.0)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Going from Zurich HB (8503000) to Zurich, ETH Honggerberg (8591122) in 30.00 minutes, departure at 11:57
             from  from_id              to    to_id   duration  total_duration departure_time   walk no_change mean_std_null      mean      std
0       Zurich HB  8503000  Zurich, Hal...  8591174   9.300864        9.300864          11:57   True     False          True  9.300864        0
1  Zurich, Hal...  8591174  Zurich, ETH...  8591122  13.000000       30.000000          12:14  False     False         False  1.182748  1.58472
                from  from_id  ...      mean      std
0          Zurich HB  8503000  ...  9.300864        0
1  Zurich, Haldenegg  8591174  ...  1.182748  1.58472

[2 rows x 12 columns]

#### Validation:

##### Feasible paths:
Create a function that looks through a path to see if it is valid. 
So it looks for:
- missed connections
- transfer time of less than 2 minutes between two transports

In [31]:
"""Returns true if there is time to take all edges, and if 
when chaning from a connection to another you have at least 2 minutes. """

def is_path_valid(path):
    last_target = path['from_id'][len(path['from_id'])-1]
    time = convertToMinute(path['departure_time'][0]) + path['duration'][0]
    
    for i in range(1, len(path['from_id'])):
        #in case an edge taken actually left before we got there (only for transport edges, not for walks)
        if not path['walk'][i] and convertToMinute(path['departure_time'][i]) < time:
            print('You miss this connection. Time is {} while this edge leaves at {} from {} to {}'\
                  .format(minute_to_string(time), path['departure_time'][i], path['from'][i], path['to'][i]))
            return False
        
        #in case of change type transport -> trasnport need 2 minutes transfer:
        if not path['no_change'][i] and not path['walk'][i]:
            if not path['walk'][i-1]:
                if convertToMinute(path['departure_time'][i]) < time + 2:
                    print('You do not have time to change to this connection between {} to {} leaving at {}. You arrive at {} and need at least 2 min transfer'\
                          .format(path['from'][i],path['to'][i], path['departure_time'][i], minute_to_string(time)))
                    return False
        
        else: 
            time = convertToMinute(path['departure_time'][i]) + path['duration'][i]
    return True

# test of is path valid:
#assert(is_path_valid(best_path1))
#assert(is_path_valid(best_path2))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

##### Validate a path:
Then for a given path, we sample felays for transfers where we go from a transport -> walk or transport -> transport. 

For transport 1 -> transport 2: the delay of transport 1 will be added to its trip duration
For transport -> walk: the delay of transport will be added to the departure time of walk 

After modifying these values, we check whether the path is still feasible. We repeat this operation a ceertain number of times and report the percentage of feasible paths. 

For the moment, delays are sampled from an absolute normal distribution (**?good?**). 

In [32]:
def validate_path_(path, confidence):
    num_tries = 10
    num_valids = 0
    
    for i in range(num_tries):
        path_copy = path.copy()
        for i in range(len(path['from_id'])):
            #only for transfers etiher to other trains or to walking: 
            if i > 1 and not path['no_change'][i]:
                mean = path['mean'][i-1]
                std = path['std'][i-1]
                #sample a delay:
                #delay = compute_delay_uncertainty(mean, std, confidence)
                
                # calcluate delay for connection of before:
                if std != 0:
                    
                    delay = np.random.normal(mean, std)
                    if delay <0:
                        delay = 0
                else: delay = 0
                
                # if its between two transports we just add it to trip duration:
                if not path['walk'][i] and not path['walk'][i-1]:
                    print('Delay of {} for {} to {}'.format(delay, path_copy['from'][i-1], path_copy['to'][i-1]))
                    path_copy['duration'][i-1] += delay
                
                # transfer from trans to walk:
                if not path['walk'][i-1] and path['walk'][i]:
                    # if a train to a walk is delayed, the walk needs to leave later:
                    #need to leave at the time it takes for the delayed connection to arrive, 
                    # so if delayed need to start walking later: 
                    
                    if delay !=0:
                        print('Delay of {} for {} to {}, need to start walking later from {}'\
                                  .format(delay, path_copy['from'][i-1], path_copy['to'][i-1], path_copy['to'][i-1]))
                    
                    #add duration to transp: 
                    path_copy['duration'][i-1] += delay
                    
                    #delay the start of walk:
                    arrival_of_edge_before = path_copy['duration'][i-1]+convertToMinute(path_copy['departure_time'][i-1])
                    
                    # need to start later:
                    new_dep_time = minute_to_string(arrival_of_edge_before)
                    path_copy['departure_time'][i] = new_dep_time
        
        if is_path_valid(path_copy):
            num_valids += 1
    return num_valids/float(num_tries)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [33]:
# Test pour voir si on peut rater une connection: 
test = dijkstra_with_time(graph, '8503000', arrival_hour*60+arrival_minute, last_target='8591122', confidence=0.98, durations_dicts=durations_dicts)
test['mean'][10] = 6
test['std'][10] = 4
validate_path_(test, 0.95)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
name 'durations_dicts' is not defined
Traceback (most recent call last):
NameError: name 'durations_dicts' is not defined



## Create duration dictionnaries if needed

Code commented, don't have the permission to **change** a file, can write if put another path

durations_dicts = {}
edge_and_data_tuple = zip(graph.edges(keys=True),
              map(lambda x: x[2], graph.edges(data=True)))
edge_and_data_tuple = filter(lambda x: 'mean' in x[1] and 'std' in x[1], edge_and_data_tuple)
for c in [0.25, 0.5, 0.75, 0.9, 0.95, 0.98, 0.99]:
    durations_dicts[c] = {e: {'duration': data['mean'] + compute_delay_uncertainty(data['mean'], 
                                                                                            data['std'], 
                                                                                            c)
                                       if data['mean'] != None and data['std'] != None
                                       else data['duration']
                                      } for e, data in edge_and_data_tuple}
    
""" Conversion to json """
durations_dicts_for_json = {}
for c in durations_dicts.keys():
    durations_dicts_for_json[c] = {str(k): v for k, v in durations_dicts[c].items()}

print('Length of json:', len(json.dumps(durations_dicts_for_json))) -> 30106955

""" Save to hdfs """
sc.parallelize([json.dumps(durations_dicts_for_json)]).coalesce(1).saveAsTextFile('/user/{}/durations_for_confidence_.json'.format(username))

## Run algorithm

In [34]:
# Tao's example (except for the departure time)
print('Without minimum confidence ->')
#best_path1 = dijkstra_with_time(graph, '8503000', arrival_hour*60+arrival_minute, last_target='8591049',confidence = 0.98, durations_dicts=durations_dicts)
print('\nWith minimum confidence ->')
#best_path2 = dijkstra_with_time(graph, '8503000', arrival_hour*60+arrival_minute, last_target='8591049', confidence=0.98, durations_dicts=durations_dicts)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Without minimum confidence ->

With minimum confidence ->

In [35]:
# From Triemli to Altstetten
print('Without minimum confidence ->')
#best_path1 = dijkstra_with_time(graph, '8503610', arrival_hour*60+arrival_minute, last_target='8591057')
print('\nWith minimum confidence ->')
#best_path2 = dijkstra_with_time(graph, '8503610', arrival_hour*60+arrival_minute, last_target='8591057', confidence=0.95, durations_dicts=durations_dicts)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Without minimum confidence ->

With minimum confidence ->

## Cells to keep

In [36]:
# Weird attributes?
print(graph.get_edge_data('8503000:0:41/42', '8503020:0:3', 0))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

None

In [37]:
# Proportion of null mean or std in non-walking edges
(len(filter(lambda x: x[2]['mean'] == None or x[2]['std'] == None, filter(lambda x: 'mean' in x[2] and 'std' in x[2], graph.edges(data=True))))
 / float(len(filter(lambda x: 'mean' in x[2] and 'std' in x[2], graph.edges(data=True)))))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0.046632828786368166