# Walking Estimation

In [1]:
import getpass
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("yarn") \
    .appName('sbb-{0}'.format(getpass.getuser())) \
    .config('spark.executor.memory', '8g') \
    .config('spark.executor.instances', '5') \
    .config('spark.port.maxRetries', '100') \
    .getOrCreate()
spark

In [2]:
import numpy as np
import pandas as pd

## Step 1: Get all stations
First, we need to retain all the stations of interest (10km radius aruound Zürich)

In [3]:
def distance_squared(n1,e1, n2, e2):
    '''Calculates the euclidean distance between two points'''
    eucl_dist2 = ((n1-n2)*(n1-n2)+ (e1-e2)*(e1-e2))
    return eucl_dist2

In [4]:
coords_zurich = (683144.0, 248040.0) # X, Y  (E,N)
didok_path = '/user/rychener/stops.txt'

In [5]:
didok = spark.read.csv(didok_path, sep=';', header=True, inferSchema=True).select('Dst-Bezeichnung-offiziell','KOORDE','KOORDN')\
    .withColumnRenamed('Dst-Bezeichnung-offiziell','station_name')

In [6]:
didok = didok.withColumn('dist2', distance_squared(coords_zurich[1], coords_zurich[0], didok.KOORDN, didok.KOORDE))

In [7]:
didok_10km = didok.filter(didok.dist2<=10000**2).persist()

In [8]:
stations = spark.read.csv('/datasets/sbb/2017/10/2017-10-17istdaten.csv.bz2', header=True, sep=";")\
                        .select('HALTESTELLEN_NAME').distinct()
didok_10km = didok_10km.join(stations, stations.HALTESTELLEN_NAME==didok_10km.station_name)

In [9]:
didok_10km.count()

839

In [10]:
didok_10km.show()

+--------------------+---------+---------+--------------------+--------------------+
|        station_name|   KOORDE|   KOORDN|               dist2|   HALTESTELLEN_NAME|
+--------------------+---------+---------+--------------------+--------------------+
|Erlenbach ZH, Im ...|687568.16|240079.18|  8.29478467780004E7|Erlenbach ZH, Im ...|
|    Zürich, Klosbach| 685009.1|247186.18|   4207606.602399925|    Zürich, Klosbach|
| Zürich, Schauenberg|680618.06|252258.14|2.4173077943199836E7| Zürich, Schauenberg|
|     Zürich, Althoos|681919.07|251665.15|1.4642166027400084E7|     Zürich, Althoos|
|Zürich, Schmiede ...|681619.11|247152.16|  3113549.3777000364|Zürich, Schmiede ...|
|Rüschlikon, Weids...|684264.15|240023.14| 6.552478028209983E7|Rüschlikon, Weids...|
|  Urdorf, Oberurdorf|674363.05|248012.14| 7.710585908209918E7|  Urdorf, Oberurdorf|
|Zürich, Stockerst...|682812.11|246896.16|  1418520.9177000013|Zürich, Stockerst...|
|Zürich, Tunnelstr...|682627.11|246720.16|  2009152.8977000052|Zü

## Step 2: Calculate the distances between stations
Now, we calculate the distance between each station.
This is done by joining them, then calculating the distances.

In [11]:
columns = ['station_name', 'KOORDE', 'KOORDN']
df1 = didok_10km.select('station_name', 'KOORDE', 'KOORDN')
df2 = didok_10km.select('station_name', 'KOORDE', 'KOORDN')
for c in columns:
    df1 = df1.withColumnRenamed(c, c+'_1')
    df2 = df2.withColumnRenamed(c, c+'_2')

In [12]:
df1.show(2)
df2.show(2)

+--------------------+---------+---------+
|      station_name_1| KOORDE_1| KOORDN_1|
+--------------------+---------+---------+
|Erlenbach ZH, Im ...|687568.16|240079.18|
|    Zürich, Klosbach| 685009.1|247186.18|
+--------------------+---------+---------+
only showing top 2 rows

+--------------------+---------+---------+
|      station_name_2| KOORDE_2| KOORDN_2|
+--------------------+---------+---------+
|Erlenbach ZH, Im ...|687568.16|240079.18|
|    Zürich, Klosbach| 685009.1|247186.18|
+--------------------+---------+---------+
only showing top 2 rows



In [13]:
full_df = df1.crossJoin(df2)

In [14]:
distance_df = full_df.withColumn('distance', distance_squared(full_df.KOORDN_1, full_df.KOORDE_1, \
                                                              full_df.KOORDN_2, full_df.KOORDE_2))

In [15]:
distance_df.show(5)

+--------------------+---------+---------+--------------------+---------+---------+--------------------+
|      station_name_1| KOORDE_1| KOORDN_1|      station_name_2| KOORDE_2| KOORDN_2|            distance|
+--------------------+---------+---------+--------------------+---------+---------+--------------------+
|Zürich, Stockerst...|682812.11|246896.16|Zürich, Stockerst...|682812.11|246896.16|                 0.0|
|Zürich, Stockerst...|682812.11|246896.16|Zürich, Tunnelstr...|682627.11|246720.16|             65201.0|
|Zürich, Stockerst...|682812.11|246896.16|Zürich, Schmiede ...|681619.11|247152.16|           1488785.0|
|Zürich, Stockerst...|682812.11|246896.16| Zürich, Oerlikerhus|684183.07|252462.16|3.2859887321599897E7|
|Zürich, Stockerst...|682812.11|246896.16|Zürich Flughafen,...|685474.03|256362.17|  9.66911634065004E7|
+--------------------+---------+---------+--------------------+---------+---------+--------------------+
only showing top 5 rows



## Step 3: Build a numpy adjacency matrix for walking times
We assume $3.6km/h = 1m/s$ to be a reasonable walking speed.

In [16]:
stations = [r[0] for r in didok_10km.select('station_name').collect()]

In [17]:
station_from_index = dict(zip(range(len(stations)), stations))
index_from_station = dict(zip(stations,range(len(stations))))

In [18]:
df_pd = distance_df.toPandas()

In [19]:
adj = np.full((len(stations),len(stations)), -1)

In [20]:
for i, row in df_pd.iterrows():
    _i = index_from_station[row['station_name_1']]
    _j = index_from_station[row['station_name_2']]
    adj[_i,_j] = float(row['distance'])
    adj[_j,_i] = float(row['distance'])

In [21]:
adj

array([[        0,  57058237, 196630956, ...,  14634156, 100843493,
         23327944],
       [ 57058237,         0,  45006010, ...,  49241843,  15389723,
         44623201],
       [196630956,  45006010,         0, ..., 153197786,  53483096,
        134241520],
       ...,
       [ 14634156,  49241843, 153197786, ...,         0, 111716747,
          1245681],
       [100843493,  15389723,  53483096, ..., 111716747,         0,
        108502849],
       [ 23327944,  44623201, 134241520, ...,   1245681, 108502849,
                0]])

## Step 4: Saving the precomputed distances

In [22]:
# saving the numpy array of distances
np.save('./data/walking_distances', adj)

In [23]:
# saving the dictionaries:
import pickle

with open('./data/station_from_index.pkl', 'wb') as handle:
    pickle.dump(station_from_index, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('./data/index_from_station.pkl', 'wb') as handle:
    pickle.dump(index_from_station, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Step 5: Usage in the Algorithm

In [24]:
# the necessary imports
import numpy as np

In [25]:
# first, load all the necessary data
import pickle
with open('./data/station_from_index.pkl', 'rb') as handle:
    station_from_index = pickle.load(handle)
with open('./data/index_from_station.pkl', 'rb') as handle:
    index_from_station = pickle.load(handle)
walking_distances = np.load('./data/walking_distances.npy')
walking_distances = np.sqrt(walking_distances)

# definition of the maximum walking time
MAX_WALKING_TIME = 15*60

In [26]:
def get_walking_stations(current_station, current_time):
    # get the distances for all stations
    walking_times = walking_distances[index_from_station[current_station],:]
    # get stations in walking distance
    close_stations = np.argwhere(walking_times<MAX_WALKING_TIME)
    # remove station itself
    close_stations = [i for i in close_stations.flatten().tolist()  \
                     if i!=index_from_station[current_station]]
    # get the names of the stations
    station_names = [station_from_index[i] for i in close_stations]
    # get the estimated arrival times for all stations
    arrival_times = (walking_times[close_stations].flatten() + current_time).tolist()
    # get the other information necessary
    line_type = [None]*len(station_names)
    departure_time = [current_time]*len(station_names)
    
    
    # return the tuples we want
    walking_stops_tuples = list(zip(line_type, departure_time, \
                                    station_names, arrival_times))
    return walking_stops_tuples

In [27]:
%%timeit
r = get_walking_stations('Zürich HB', 0)

39.1 µs ± 4.08 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [28]:
get_walking_stations('Zürich HB', 0)

[(None, 0, 'Zürich, Rathaus', 743.4097120699998),
 (None, 0, 'Zürich, Sihlquai/HB', 272.1157841801905),
 (None, 0, 'Zürich, Sihlstrasse', 781.4473750675729),
 (None, 0, 'Zürich, ETH/Universitätsspital', 677.36769925942),
 (None, 0, 'Zürich, Bahnhofquai/HB', 135.40679451194464),
 (None, 0, 'Zürich, Beckenhof', 657.8571273460523),
 (None, 0, 'Zürich, Ottikerstrasse', 798.3908817114584),
 (None, 0, 'Zürich, Winkelriedstrasse', 823.99939320366),
 (None, 0, 'Zürich, Bahnhofstrasse/HB', 179.9222054111165),
 (None, 0, 'Zürich, Löwenplatz', 328.6928657576857),
 (None, 0, 'Zürich, Sonneggstrasse', 582.5838995372254),
 (None, 0, 'Zürich, Kanonengasse', 747.2369102232572),
 (None, 0, 'Zürich, Stampfenbachplatz', 280.6830953228213),
 (None, 0, 'Zürich Selnau', 848.0678038930614),
 (None, 0, 'Zürich, Rudolf-Brun-Brücke', 482.8716185488644),
 (None, 0, 'Zürich, Haldenegg', 365.4216742340279),
 (None, 0, 'Zürich, Vogelsangstrasse', 898.0929795962109),
 (None, 0, 'Zürich, Haldenbach', 662.028700284209