In [1]:
import pyspark
import os

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse) the local Spark session used by every cell below.
# The S3A credentials come from the environment. They are checked up front so a
# missing variable fails here with a clear message instead of reaching Spark as
# None and surfacing later as an opaque S3 authentication error.
# NOTE(review): the AWS CLI/SDK convention is AWS_ACCESS_KEY_ID; this notebook
# reads AWS_ACCESS_KEY - confirm that is the variable actually exported.
missing_credentials = [
    name for name in ('AWS_ACCESS_KEY', 'AWS_SECRET_ACCESS_KEY')
    if not os.environ.get(name)
]
if missing_credentials:
    raise EnvironmentError(
        'Missing required environment variable(s): ' + ', '.join(missing_credentials)
    )

spark = (
    SparkSession.builder
    .master('local[*]')  # run locally on all available cores
    .appName('journey-and-stations-data-transformer')
    .config("spark.hadoop.fs.s3a.access.key", os.environ['AWS_ACCESS_KEY'])
    .config("spark.hadoop.fs.s3a.secret.key", os.environ['AWS_SECRET_ACCESS_KEY'])
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/27 11:52:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load every raw cycling-journey CSV from S3 into one DataFrame.
# The first line of each file is treated as the header and column types are inferred.
df_journey = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("s3a://hrc-de-data/raw/cycling-journey/*/*")
)

22/02/27 11:53:02 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [5]:
# preview the first two rows to sanity-check the parsed values and column names
df_journey.take(2)

[Stage 4:>                                                          (0 + 1) / 1]                                                                                

[Row(Rental Id=109096951, Duration=540, Bike Id=13318, End Date='15/06/2021 20:19', EndStation Id=661, EndStation Name='All Saints Church, Portobello', Start Date='15/06/2021 20:10', StartStation Id=105, StartStation Name='Westbourne Grove, Bayswater'),
 Row(Rental Id=108982015, Duration=780, Bike Id=18991, End Date='13/06/2021 13:03', EndStation Id=312, EndStation Name="Grove End Road, St. John's Wood", Start Date='13/06/2021 12:50', StartStation Id=106, StartStation Name='Woodstock Street, Mayfair')]

In [6]:
# inspect the column names and the types Spark assigned to them
df_journey.printSchema()

root
 |-- Rental Id: integer (nullable = true)
 |-- Duration: integer (nullable = true)
 |-- Bike Id: integer (nullable = true)
 |-- End Date: string (nullable = true)
 |-- EndStation Id: integer (nullable = true)
 |-- EndStation Name: string (nullable = true)
 |-- Start Date: string (nullable = true)
 |-- StartStation Id: integer (nullable = true)
 |-- StartStation Name: string (nullable = true)



In [20]:
from pyspark.sql import functions as F, types as T

In [8]:
# Normalise the raw CSV headers to snake_case names.
# withColumnRenamed is a no-op for a column that is absent, so the loop is safe to re-run.
journey_column_renames = (
    ('Rental Id', 'rental_id'),
    ('Bike Id', 'bike_id'),
    ('Start Date', 'start_date'),
    ('End Date', 'end_date'),
    ('StartStation Id', 'start_station'),
    ('EndStation Id', 'end_station'),
)
for raw_name, clean_name in journey_column_renames:
    df_journey = df_journey.withColumnRenamed(raw_name, clean_name)

In [9]:
# The Duration column is not carried into the processed journey data, so remove it.
unneeded_journey_columns = ('Duration',)
df_journey = df_journey.drop(*unneeded_journey_columns)

In [10]:
# Add the weather_date column: the calendar day on which the journey started
# (presumably the key used to match journeys to daily weather - confirm downstream).
# start_date is a string such as '15/06/2021 20:10'. Calling to_date() without a
# format expects ISO 'yyyy-MM-dd' input and would yield NULL for every row, so the
# date token before the first space is parsed explicitly as dd/MM/yyyy. Taking only
# the date token also keeps this working if a file carries seconds in the time part.
df_journey = df_journey.withColumn(
    'weather_date',
    F.to_date(F.substring_index(df_journey.start_date, ' ', 1), 'dd/MM/yyyy'),
)

In [11]:
# confirm the renamed columns, the dropped Duration column and the new weather_date column
df_journey.printSchema()

root
 |-- rental_id: integer (nullable = true)
 |-- bike_id: integer (nullable = true)
 |-- end_date: string (nullable = true)
 |-- end_station: integer (nullable = true)
 |-- EndStation Name: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- start_station: integer (nullable = true)
 |-- StartStation Name: string (nullable = true)
 |-- weather_date: date (nullable = true)



In [12]:
# Some stations appear in journeys but are missing from the original stations data.
# Load the already-processed stations so those gaps can be identified and filled.
df_processed_stations = (
    spark.read
    .format('parquet')
    .load('s3a://hrc-de-data/processed/cycling-extras/stations/')
)

                                                                                

In [13]:
# peek at the last two station rows to see the columns and value shapes
df_processed_stations.tail(2)

                                                                                

[Row(station_id=838, station_name='Fore Street Avenue, Guildhall', longitude=-0.0914017, latitude=51.518093, easting=532524.0, northing=181634.0),
 Row(station_id=839, station_name='Sea Containers, South Bank', longitude=-0.1068403, latitude=51.507974, easting=531482.0, northing=180481.0)]

In [14]:
# Register the journey and stations DataFrames as temporary views so they can be queried with SQL.
for view_name, view_frame in (
    ('journey', df_journey),
    ('stations', df_processed_stations),
):
    view_frame.createOrReplaceTempView(view_name)

In [15]:
# Collect every (station_id, station_name) pair seen as a journey start or end
# station that is not yet present in the processed stations data.
# - NOT EXISTS replaces NOT IN: with NOT IN, a single NULL station_id in
#   `stations` makes the predicate unknown for every row and the query returns
#   nothing. Journey rows with a NULL station id are still excluded, as before.
# - UNION already removes duplicate rows across both branches, so no explicit
#   DISTINCT is needed. The old `distinct(col)` applied to the whole row anyway.
additional_stations = spark.sql('''
select j.start_station as station_id, j.`StartStation Name` as station_name
from journey j
where j.start_station is not null
  and not exists (select 1 from stations s where s.station_id = j.start_station)
union
select j.end_station as station_id, j.`EndStation Name` as station_name
from journey j
where j.end_station is not null
  and not exists (select 1 from stations s where s.station_id = j.end_station)
''')
additional_stations.show()



+----------+--------------------+
|station_id|        station_name|
+----------+--------------------+
|       840|George Row, Bermo...|
|       391|Clifford Street, ...|
|       842|Temple Gardens, T...|
|       844|Canada Water Stat...|
|       845|Bermondsey Statio...|
|       841|Tower Wharf, Berm...|
+----------+--------------------+



                                                                                

In [16]:
# The extra stations only have an id and a name. Give them zero-valued placeholder
# coordinate columns so their layout lines up with df_processed_stations when merged.
for placeholder_column in ('longitude', 'latitude', 'easting', 'northing'):
    additional_stations = additional_stations.withColumn(placeholder_column, F.lit(0))

In [21]:
# Convert the placeholder coordinate columns to double.
for coordinate_column in ('longitude', 'latitude', 'easting', 'northing'):
    additional_stations = additional_stations.withColumn(
        coordinate_column,
        F.col(coordinate_column).cast(T.DoubleType()),
    )

In [22]:
# check the column names and types of the additional stations before they are written
additional_stations.printSchema()

root
 |-- station_id: integer (nullable = true)
 |-- station_name: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- easting: double (nullable = true)
 |-- northing: double (nullable = true)



In [23]:
# Make sure the additional stations contain one row per station.
# DataFrames are immutable: dropDuplicates() returns a new DataFrame, so the result
# must be assigned back or the call has no effect.
# De-duplicating on station_id (rather than on whole rows) also collapses a station
# that shows up under slightly different names; which name survives is arbitrary.
additional_stations = additional_stations.dropDuplicates(['station_id'])

DataFrame[station_id: int, station_name: string, longitude: double, latitude: double, easting: double, northing: double]

In [None]:
# Persist the additional stations as parquet in S3, appending to the existing stations data.
(
    additional_stations.write
    .mode('append')
    .parquet('s3a://hrc-de-data/processed/cycling-extras/stations/')
)

In [None]:
# The station names are no longer needed on the journey rows; only the station ids are kept.
station_name_columns = ('StartStation Name', 'EndStation Name')
df_journey = df_journey.drop(*station_name_columns)

In [None]:
# Persist the transformed journey data as parquet in S3.
# NOTE(review): the mode is 'append' and the source read covers every raw file, so
# re-running the notebook looks like it would write the same journeys again - confirm
# whether that is intended.
(
    df_journey.write
    .mode('append')
    .parquet('s3a://hrc-de-data/processed/cycling-extras/journey/')
)