# Create walking times:

This notebook computes the time it takes to walk between stations that are less than 500 m apart.

In [2]:
%%configure
{"conf": {
    "spark.app.name": "dslab-group_final"
}}

A session has already been started. If you intend to recreate the session with new configurations, please include the -f argument.


In [17]:
# Load the stops table from its ORC file.
stops_orc_path = '/data/sbb/timetables/orc/stops/000000_0'
stops = spark.read.orc(stops_orc_path)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Imports:

In [41]:
from geopy.distance import distance as geo_distance
from pyspark.sql import Row
from pyspark.sql.functions import col
from pyspark.sql.functions import udf

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

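First compute each stop's distance to Zurich HB and keep only the stops within 15 km of it. Then calculate the distance between all pairs of the remaining stops in order to select those at most 500 m apart (as the crow flies).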

In [21]:
def zurich_distance(x, y):
    """Distance between a station and Zurich HB.

    @input: x, y - latitude and longitude of the station
    @output: distance to Zurich HB in km
    """
    station_pos = (x, y)
    hb_pos = (47.3781762039461, 8.54019357578468)
    separation = geo_distance(hb_pos, station_pos)
    return separation.km

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [34]:
# Build one Row per stop holding its id and its distance to Zurich HB.
stop_rows = stops.rdd.map(
    lambda stop: Row(stop_id=stop['stop_id'],
                     zurich_distance=zurich_distance(stop['stop_lat'], stop['stop_lon']))
)

# Keep only the stops whose distance to Zurich HB is at most 15 (km, as
# returned by zurich_distance).
stops_distance = spark.createDataFrame(stop_rows).where(col('zurich_distance') <= 15)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [37]:
# Coordinates of the stops that survived the Zurich HB distance filter.
kept_stops = stops.join(stops_distance, 'stop_id')

# Left-hand side of each pair: columns suffixed with _1.
first_stop = kept_stops.select(col('stop_id').alias('stop_id_1'),
                               col('stop_lat').alias('stop_lat_1'),
                               col('stop_lon').alias('stop_lon_1'))

# Right-hand side of each pair: the same stops, columns suffixed with _2.
second_stop = first_stop.select(col('stop_id_1').alias('stop_id_2'),
                                col('stop_lat_1').alias('stop_lat_2'),
                                col('stop_lon_1').alias('stop_lon_2'))

# Every ordered pair of distinct stops (both (a, b) and (b, a) are kept).
stops_pos = first_stop.crossJoin(second_stop).filter(col('stop_id_1') != col('stop_id_2'))
stops_pos.show(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+----------------+---------------+-----------+----------------+----------------+
|stop_id_1|      stop_lat_1|     stop_lon_1|  stop_id_2|      stop_lat_2|      stop_lon_2|
+---------+----------------+---------------+-----------+----------------+----------------+
|  8500926|47.4236270123012|8.4031825286317|    8502186|47.3934058321612|8.39894248049007|
|  8500926|47.4236270123012|8.4031825286317|8502186:0:1|47.3934666445388|8.39894248049007|
|  8500926|47.4236270123012|8.4031825286317|8502186:0:2|47.3935274568464|8.39894248049007|
+---------+----------------+---------------+-----------+----------------+----------------+
only showing top 3 rows

In [42]:
@udf("float")
def compute_distance(x1, y1, x2, y2):
    """Distance in metres between two stops, as computed by geopy.

    @input: x1, y1 - latitude and longitude of the first stop
            x2, y2 - latitude and longitude of the second stop
    @output: distance in metres, or None (SQL NULL) when any coordinate
             is missing
    """
    # Spark hands SQL NULLs to Python UDFs as None; geopy would raise on
    # them and fail the whole job, so propagate the NULL instead.
    if x1 is None or y1 is None or x2 is None or y2 is None:
        return None
    return geo_distance((x1, y1), (x2, y2)).m

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [43]:
# Column expression: distance between the two stops of each pair.
pair_distance = compute_distance(col('stop_lat_1'), col('stop_lon_1'),
                                 col('stop_lat_2'), col('stop_lon_2'))

# cache() returns the same DataFrame, marked for reuse by the later cells.
stops_pos_dist = stops_pos.withColumn('distance', pair_distance).cache()
stops_pos_dist.show(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+----------------+---------------+-----------+----------------+----------------+---------+
|stop_id_1|      stop_lat_1|     stop_lon_1|  stop_id_2|      stop_lat_2|      stop_lon_2| distance|
+---------+----------------+---------------+-----------+----------------+----------------+---------+
|  8500926|47.4236270123012|8.4031825286317|    8502186|47.3934058321612|8.39894248049007|3375.1602|
|  8500926|47.4236270123012|8.4031825286317|8502186:0:1|47.3934666445388|8.39894248049007|3368.4294|
|  8500926|47.4236270123012|8.4031825286317|8502186:0:2|47.3935274568464|8.39894248049007|3361.6992|
+---------+----------------+---------------+-----------+----------------+----------------+---------+
only showing top 3 rows

In [44]:
# Number of ordered stop pairs; as a full action over stops_pos_dist it
# also evaluates every row of the frame.
stops_pos_dist.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

3532520

In [45]:
# Largest distance (same unit as the 'distance' column) still treated as walkable.
max_walk_distance = 500
# Divisor turning distance into duration; presumably a walking speed in
# metres per minute, which would make 'duration' minutes - TODO confirm.
walk_speed = 50

# Keep only the stop pairs that are close enough to walk between.
close_pairs = stops_pos_dist.where(col('distance') <= max_walk_distance)

# One edge per ordered pair: source stop, target stop, distance and duration.
walking_edges = close_pairs.select(col('stop_id_1').alias('source'),
                                   col('stop_id_2').alias('target'),
                                   col('distance'))\
                           .withColumn('duration', col('distance') / walk_speed)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [46]:
# Preview the first three walking edges (source, target, distance, duration).
walking_edges.show(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-----------+---------+-------------------+
| source|     target| distance|           duration|
+-------+-----------+---------+-------------------+
|8500926|    8590616|122.61607| 2.4523214721679687|
|8500926|    8590737| 300.6712|  6.013424072265625|
|8502186|8502186:0:1|6.7610297|0.13522059440612794|
+-------+-----------+---------+-------------------+
only showing top 3 rows

## Save to local:

In [80]:
%%spark -o walking_edges -n 1000000

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [81]:
%%local
# Write walking_edges to the pickle file 'walking_times.pickle'.
walking_edges.to_pickle('walking_times.pickle')