# Station Extractor
This notebook extracts the names of all stations in the Zurich area and saves them to `./data/stations`. The saved list is then used for autocompletion in the IO.

## Imports

In [2]:
import pandas as pd
import numpy as np
import math

In [3]:
import getpass
import pyspark
from pyspark.sql import SparkSession

# Start (or reuse, via getOrCreate) a Spark session that runs on YARN. The
# application name embeds the current user's login name.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName('sbb-{0}'.format(getpass.getuser()))
    .config('spark.executor.memory', '8g')
    .config('spark.executor.instances', '5')
    .config('spark.port.maxRetries', '100')
    .getOrCreate()
)
# Keep the session as the cell's last expression so the notebook displays it.
spark

## Build the distance filter and apply it
There is nothing special about this part; it is the same filtering we do everywhere:
- Get the operating points from didok
- Calculate their (squared) distance from Zurich Main Station
- Only keep the ones close enough (within 10 km)
- Join with istdaten to keep only the operating points that appear there as stops

In [4]:
def distance_squared(n1, e1, n2, e2):
    '''Return the squared Euclidean distance between (n1, e1) and (n2, e2).

    No square root is taken, so the result must be compared against a
    squared threshold. Only subtraction, multiplication and addition are
    applied to the arguments, so any operands supporting those operators can
    be passed (this notebook mixes plain floats with Spark columns).

    Args:
        n1, e1: the two coordinates of the first point.
        n2, e2: the two coordinates of the second point.

    Returns:
        (n1 - n2)^2 + (e1 - e2)^2, in whatever type the operands produce.
    '''
    delta_n = n1 - n2
    delta_e = e1 - e2
    return delta_n * delta_n + delta_e * delta_e

In [5]:
# Reference point for the distance filter, stored as (X, Y) = (E, N).
coords_zurich = (
    683144.0,  # E component, paired with the KOORDE column
    248040.0,  # N component, paired with the KOORDN column
)
# Location of the didok stops file that is read with Spark.
didok_path = '/user/rychener/stops.txt'

In [6]:
# Load the didok stops file (semicolon-separated, with a header row and
# inferred column types), keep only the official name and the two coordinate
# columns, and expose the name as `station_name`.
didok = (
    spark.read
    .csv(didok_path, sep=';', header=True, inferSchema=True)
    .select('Dst-Bezeichnung-offiziell', 'KOORDE', 'KOORDN')
    .withColumnRenamed('Dst-Bezeichnung-offiziell', 'station_name')
)

In [7]:
# Add a `dist2` column holding each stop's squared distance to the reference
# point. coords_zurich is ordered (E, N) while distance_squared takes the N
# component before the E component, hence the index swap.
didok = didok.withColumn(
    'dist2',
    distance_squared(
        n1=coords_zurich[1],
        e1=coords_zurich[0],
        n2=didok['KOORDN'],
        e2=didok['KOORDE'],
    ),
)

In [9]:
# Keep the stops whose squared distance is at most 10000^2, i.e. within
# 10000 coordinate units of the reference point (10 km if the coordinates are
# in metres, as the variable name suggests), and cache the result.
didok_10km = didok.where(didok['dist2'] <= 10000 ** 2).persist()
# Only the names are needed to filter the timetable data afterwards.
stops_filter = didok_10km.select('station_name')

In [10]:
# Read every bz2-compressed CSV matched by the glob; the files are
# semicolon-separated with a header row, and no schema inference is requested.
swiss_data = spark.read.csv(
    '/datasets/sbb/*/*/*.csv.bz2',
    sep=";",
    header=True,
)

In [11]:
# Inner-join on the stop name so that only rows belonging to the retained
# stops remain. The joined frame is cached because the following cells query
# it more than once.
name_matches = swiss_data['HALTESTELLEN_NAME'] == stops_filter['station_name']
istdaten = swiss_data.join(stops_filter, on=name_matches, how='inner').persist()

In [13]:
# Sanity check: print two of the distinct stop names that survived the join.
(
    istdaten
    .select('HALTESTELLEN_NAME')
    .distinct()
    .show(2)
)

+--------------------+
|   HALTESTELLEN_NAME|
+--------------------+
|Zürich, Stockerst...|
|Zürich, Tunnelstr...|
+--------------------+
only showing top 2 rows



In [14]:
# Bring the distinct stop names back to the driver as a plain Python list.
distinct_names = istdaten.select('HALTESTELLEN_NAME').distinct()
stations = [record['HALTESTELLEN_NAME'] for record in distinct_names.collect()]

In [19]:
import os
import pickle

# Persist the collected station names as a pickle file.
stations_file = './data/stations'
# Create the target directory first: on a checkout where ./data does not yet
# exist, opening the file for writing would otherwise fail.
os.makedirs(os.path.dirname(stations_file), exist_ok=True)
with open(stations_file, 'wb') as f:
    pickle.dump(stations, f)