In [1]:
import pyspark
import os
from pyspark.sql import SparkSession
# Build (or reuse) a local Spark session that can talk to S3 through the s3a
# connector. The AWS credentials are taken from the environment rather than
# being written into the notebook.
_required_env = ('AWS_ACCESS_KEY', 'AWS_SECRET_ACCESS_KEY')
_missing_env = [name for name in _required_env if not os.environ.get(name)]
if _missing_env:
    # os.environ.get() would hand None to .config() and the problem would only
    # show up later as an S3 authentication error, so stop here with a clear
    # message instead.
    raise EnvironmentError(
        'Missing required environment variable(s): ' + ', '.join(_missing_env)
    )

spark = (
    SparkSession.builder
    .master('local[*]')  # run locally, using every available core
    .appName('journey-and-stations-data-transformer')
    .config("spark.hadoop.fs.s3a.access.key", os.environ['AWS_ACCESS_KEY'])
    .config("spark.hadoop.fs.s3a.secret.key", os.environ['AWS_SECRET_ACCESS_KEY'])
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/02 01:25:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the raw journey CSV from S3. The first row is used as the header and
# Spark infers each column's type from the data.
df_j = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('s3a://hrc-de-data/raw/cycling-extras/journey.csv')
)

22/03/02 01:25:14 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [3]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [6]:
# Pattern of the date strings in the raw file, e.g. "03/01/2021 14:47".
JOURNEY_TIMESTAMP_FORMAT = 'dd/MM/yyyy HH:mm'

# rename columns to snake_case
df_j = (
    df_j
    .withColumnRenamed('Rental Id', 'rental_id')
    .withColumnRenamed('Bike Id', 'bike_id')
    .withColumnRenamed('Start Date', 'start_date')
    .withColumnRenamed('End Date', 'end_date')
    .withColumnRenamed('StartStation Id', 'start_station')
    .withColumnRenamed('EndStation Id', 'end_station')
)

# convert data types: replace the start/end date columns with parsed timestamps
df_j = (
    df_j
    .withColumn('start_date', to_timestamp(col('start_date'), JOURNEY_TIMESTAMP_FORMAT))
    .withColumn('end_date', to_timestamp(col('end_date'), JOURNEY_TIMESTAMP_FORMAT))
)

# add weather_date column: the calendar date on which the journey started.
# start_date is already a timestamp at this point, so no string pattern is
# passed to to_date.
df_j = df_j.withColumn('weather_date', to_date(col('start_date')))

In [7]:
# Preview the first five rows, then print each column's name and type.
df_j.show(n=5)
df_j.printSchema()

[Stage 2:>                                                          (0 + 1) / 1]

+---------+--------+-------+-------------------+-----------+--------------------+-------------------+-------------+--------------------+------------+
|rental_id|Duration|bike_id|           end_date|end_station|     EndStation Name|         start_date|start_station|   StartStation Name|weather_date|
+---------+--------+-------+-------------------+-----------+--------------------+-------------------+-------------+--------------------+------------+
|104820582|    1620|     22|2021-01-03 15:14:00|         11|Brunswick Square,...|2021-01-03 14:47:00|          542|Salmon Lane, Lime...|  2021-01-03|
|104816169|    1740|  10755|2021-01-03 13:26:00|        542|Salmon Lane, Lime...|2021-01-03 12:57:00|          546|New Fetter Lane, ...|  2021-01-03|
|104757113|    1620|  18908|2020-12-30 15:00:00|        239|Warren Street Sta...|2020-12-30 14:33:00|          779|Houndsditch, Aldgate|  2020-12-30|
|104749458|     780|  18499|2020-12-30 09:21:00|        766|Ram Street, Wands...|2020-12-30 09:08:00

                                                                                

In [8]:
# Remove the station-name and duration columns, which this notebook treats as
# unnecessary for the processed journey data.
unused_columns = ['StartStation Name', 'EndStation Name', 'Duration']
df_j = df_j.drop(*unused_columns)

In [9]:
# Preview the first ten rows after the unused columns were dropped.
df_j.show(n=10)

[Stage 3:>                                                          (0 + 1) / 1]

+---------+-------+-------------------+-----------+-------------------+-------------+------------+
|rental_id|bike_id|           end_date|end_station|         start_date|start_station|weather_date|
+---------+-------+-------------------+-----------+-------------------+-------------+------------+
|104820582|     22|2021-01-03 15:14:00|         11|2021-01-03 14:47:00|          542|  2021-01-03|
|104816169|  10755|2021-01-03 13:26:00|        542|2021-01-03 12:57:00|          546|  2021-01-03|
|104757113|  18908|2020-12-30 15:00:00|        239|2020-12-30 14:33:00|          779|  2020-12-30|
|104749458|  18499|2020-12-30 09:21:00|        766|2020-12-30 09:08:00|          653|  2020-12-30|
|104788389|  15668|2021-01-01 16:29:00|        655|2021-01-01 14:59:00|          655|  2021-01-01|
|104792584|   6695|2021-01-01 21:02:00|        682|2021-01-01 20:58:00|          655|  2021-01-01|
|104777428|   6695|2020-12-31 18:49:00|        655|2020-12-31 17:10:00|          655|  2020-12-31|
|104791339

                                                                                

In [10]:
# Persist the transformed journey data to S3 as parquet.
# NOTE: append mode adds to whatever is already under this prefix, so running
# this cell again writes the same rows a second time.
df_j.write.mode('append').parquet('s3a://hrc-de-data/processed/test/journey/')

                                                                                

In [16]:
# Print the schema of just the start_station column to check its data type.
df_j.select(['start_station']).printSchema()


root
 |-- start_station: integer (nullable = true)

