## Local experimentation for Project 3
This notebook is used to experiment with the code for Project 3. The code is then copied to the main notebook for the project.

In [2]:
import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import (
    col,
    date_add,
    date_format,
    datediff,
    floor,
    lit,
    months_between,
    unix_timestamp,
)

# Build the Spark session used by every cell below; getOrCreate() hands back
# an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("Project 3")
    .getOrCreate()
)



In [3]:
# The CSV exports carry no header row, so the column names are supplied here,
# following the schema given in the project description.
trip_columns = ["trip_id", "rideable_type", "start_at", "ended_at", "start_station_id", "end_station_id", "rider_id"]
station_columns = ["station_id", "name", "latitude", "longitude"]
rider_columns = ["rider_id", "first", "last", "address", "birthday", "account_start_date", "account_end_date", "is_member"]
payment_columns = ["payment_id", "date", "amount", "rider_id"]

# Each file is read without a header and its positional columns are renamed
# in one step. No schema is passed, so every column is loaded as text.
df_trips = spark.read.csv('data/trips.csv', header=False).toDF(*trip_columns)
df_stations = spark.read.csv('data/stations.csv', header=False).toDF(*station_columns)
df_rider = spark.read.csv('data/riders.csv', header=False).toDF(*rider_columns)
df_payments = spark.read.csv('data/payments.csv', header=False).toDF(*payment_columns)


In [4]:
# Preview the first five rider rows to check the assigned column names.
df_rider.show(n=5)

+--------+--------+---------+--------------------+----------+------------------+----------------+---------+
|rider_id|   first|     last|             address|  birthday|account_start_date|account_end_date|is_member|
+--------+--------+---------+--------------------+----------+------------------+----------------+---------+
|    1000|   Diana|    Clark| 1200 Alyssa Squares|1989-02-13|        2019-04-23|            null|     True|
|    1001|Jennifer|    Smith|     397 Diana Ferry|1976-08-10|        2019-11-01|      2020-09-01|     True|
|    1002|   Karen|    Smith|644 Brittany Row ...|1998-08-10|        2022-02-04|            null|     True|
|    1003|   Bryan|  Roberts|996 Dickerson Tur...|1999-03-29|        2019-08-26|            null|    False|
|    1004|   Jesse|Middleton|7009 Nathan Expre...|1969-04-11|        2019-09-14|            null|     True|
+--------+--------+---------+--------------------+----------+------------------+----------------+---------+


In [5]:
# Preview the first five trip rows to check the assigned column names.
df_trips.show(n=5)

+----------------+-------------+-------------------+-------------------+----------------+--------------+--------+
|         trip_id|rideable_type|           start_at|           ended_at|start_station_id|end_station_id|rider_id|
+----------------+-------------+-------------------+-------------------+----------------+--------------+--------+
|89E7AA6C29227EFF| classic_bike|2021-02-12 16:14:56|2021-02-12 16:21:43|             525|           660|   71934|
|0FEFDE2603568365| classic_bike|2021-02-14 17:52:38|2021-02-14 18:12:09|             525|         16806|   47854|
|E6159D746B2DBB91|electric_bike|2021-02-09 19:10:18|2021-02-09 19:19:10|    KA1503000012|  TA1305000029|   70870|
|B32D3199F1C2E75B| classic_bike|2021-02-02 17:49:41|2021-02-02 17:54:06|             637|  TA1305000034|   58974|
|83E463F23575F4BF|electric_bike|2021-02-23 15:07:23|2021-02-23 15:22:37|           13216|  TA1309000055|   39608|
+----------------+-------------+-------------------+-------------------+----------------

In [6]:
# Preview the first five station rows to check the assigned column names.
df_stations.show(n=5)

+------------+--------------------+-----------------+------------------+
|  station_id|                name|         latitude|         longitude|
+------------+--------------------+-----------------+------------------+
|         525|Glenwood Ave & To...|        42.012701|-87.66605799999999|
|KA1503000012|  Clark St & Lake St|41.88579466666667|-87.63110066666668|
|         637|Wood St & Chicago...|        41.895634|        -87.672069|
|       13216|  State St & 33rd St|       41.8347335|       -87.6258275|
|       18003|Fairbanks St & Su...|41.89580766666667|-87.62025316666669|
+------------+--------------------+-----------------+------------------+


In [7]:
# Preview the first five payment rows to check the assigned column names.
df_payments.show(n=5)

+----------+----------+------+--------+
|payment_id|      date|amount|rider_id|
+----------+----------+------+--------+
|         1|2019-05-01|   9.0|    1000|
|         2|2019-06-01|   9.0|    1000|
|         3|2019-07-01|   9.0|    1000|
|         4|2019-08-01|   9.0|    1000|
|         5|2019-09-01|   9.0|    1000|
+----------+----------+------+--------+


In [8]:
# Dim rider
# Rider attributes plus the rider's age, in completed years, on the day the
# account was opened.
#
# The age is derived from whole calendar months between the two dates rather
# than from a day count divided by 365: the day-count form ignores leap days
# and therefore reports a rider as one year older in the days just before a
# birthday.
dim_rider = (
    df_rider
    .withColumn(
        "age_at_account_start",
        floor(months_between("account_start_date", "birthday") / 12),
    )
    .select("rider_id", "address", "first", "last", "birthday", "is_member", "age_at_account_start")
)
dim_rider.show(5)



+--------+--------------------+--------+---------+----------+---------+--------------------+
|rider_id|             address|   first|     last|  birthday|is_member|age_at_account_start|
+--------+--------------------+--------+---------+----------+---------+--------------------+
|    1000| 1200 Alyssa Squares|   Diana|    Clark|1989-02-13|     True|                  30|
|    1001|     397 Diana Ferry|Jennifer|    Smith|1976-08-10|     True|                  43|
|    1002|644 Brittany Row ...|   Karen|    Smith|1998-08-10|     True|                  23|
|    1003|996 Dickerson Tur...|   Bryan|  Roberts|1999-03-29|    False|                  20|
|    1004|7009 Nathan Expre...|   Jesse|Middleton|1969-04-11|     True|                  50|
+--------+--------------------+--------+---------+----------+---------+--------------------+


In [13]:
# dim date

# Calendar span covered by the date dimension (both ends inclusive).
DIM_DATE_START = "2012-01-01"
DIM_DATE_END = "2023-12-31"

# Number of calendar days in the span, derived from the two bounds so that
# leap years are accounted for without a hand-maintained correction term.
dim_date_day_count = (
    datetime.date.fromisoformat(DIM_DATE_END) - datetime.date.fromisoformat(DIM_DATE_START)
).days + 1

# One row per calendar day: day offsets 0..N-1 added to the start date.
date_range_df = (
    spark.range(dim_date_day_count)
    .select(date_add(lit(DIM_DATE_START), col("id").cast("int")).alias("date"))
)

# Derive the dimension attributes; the raw date itself is not kept, only its
# integer yyyyMMdd key.
dim_date = date_range_df.select(
    date_format("date", "yyyyMMdd").cast("int").alias("date_key"),
    date_format("date", "E").alias("week_day"),
    date_format("date", "M").cast("int").alias("month"),
    date_format("date", "q").cast("int").alias("quarter"),
)

dim_date.show(5)

+--------+--------+-----+-------+
|date_key|week_day|month|quarter|
+--------+--------+-----+-------+
|20120101|     Sun|    1|      1|
|20120102|     Mon|    1|      1|
|20120103|     Tue|    1|      1|
|20120104|     Wed|    1|      1|
|20120105|     Thu|    1|      1|
+--------+--------+-----+-------+


In [14]:
# fact payments
# One row per payment: the paying rider, the payment date under its fact-table
# name, an integer yyyyMMdd key derived from that date, and the amount.
fact_payments = df_payments.select(
    "rider_id",
    col("date").alias("payment_date"),
    date_format("date", "yyyyMMdd").cast("int").alias("date_key"),
    col("amount").alias("payment_amount"),
)

fact_payments.show(5)

+--------+------------+--------+--------------+
|rider_id|payment_date|date_key|payment_amount|
+--------+------------+--------+--------------+
|    1000|  2019-05-01|20190501|           9.0|
|    1000|  2019-06-01|20190601|           9.0|
|    1000|  2019-07-01|20190701|           9.0|
|    1000|  2019-08-01|20190801|           9.0|
|    1000|  2019-09-01|20190901|           9.0|
+--------+------------+--------+--------------+


In [15]:
# fact trip
# Join each trip to its rider so the rider's birthday is available for the
# age-at-trip calculation. The inner join drops trips with no matching rider.
trips_with_rider = df_trips.join(df_rider, "rider_id", "inner")

# Trip length in seconds, from the epoch-second form of both timestamps.
trip_seconds = unix_timestamp("ended_at") - unix_timestamp("start_at")

# Rider age in completed years on the trip's start date. Whole calendar months
# are used instead of a day count divided by 365, which ignores leap days and
# reports a rider as one year older in the days just before a birthday. The
# start timestamp is reduced to its date so time of day plays no part.
rider_age_years = floor(
    months_between(col("start_at").cast("date"), col("birthday")) / 12
)

fact_trip = trips_with_rider.select(
    "trip_id",
    "start_station_id",
    "end_station_id",
    "rider_id",
    # Whole minutes; the fractional part is dropped by the int cast.
    (trip_seconds / 60).cast("int").alias("duration_in_minutes"),
    rider_age_years.alias("rider_age"),
    col("start_at").alias("started_at"),
    "ended_at",
    date_format("start_at", "yyyyMMdd").cast("int").alias("date_key"),
    date_format("start_at", "H").cast("int").alias("starting_hour"),
)

fact_trip.show(5)

+----------------+----------------+--------------+--------+-------------------+---------+-------------------+-------------------+--------+-------------+
|         trip_id|start_station_id|end_station_id|rider_id|duration_in_minutes|rider_age|         started_at|           ended_at|date_key|starting_hour|
+----------------+----------------+--------------+--------+-------------------+---------+-------------------+-------------------+--------+-------------+
|89E7AA6C29227EFF|             525|           660|   71934|                  6|       37|2021-02-12 16:14:56|2021-02-12 16:21:43|20210212|           16|
|0FEFDE2603568365|             525|         16806|   47854|                 19|       38|2021-02-14 17:52:38|2021-02-14 18:12:09|20210214|           17|
|E6159D746B2DBB91|    KA1503000012|  TA1305000029|   70870|                  8|       33|2021-02-09 19:10:18|2021-02-09 19:19:10|20210209|           19|
|B32D3199F1C2E75B|             637|  TA1305000034|   58974|                  4|   

In [16]:
# Earliest and latest payment date, computed in a single aggregation pass
# instead of one job per bound.
# NOTE(review): "date" is loaded as text (the CSV is read without a schema),
# so min/max compare strings; that matches chronological order only while
# every value is in yyyy-MM-dd form — confirm against the full file.
min_date, max_date = df_payments.agg(F.min("date"), F.max("date")).first()
print(min_date, max_date)

2013-02-01 2022-02-01


In [17]:
# First and last day generated for the date dimension, computed in a single
# aggregation pass. Note that this reuses (and overwrites) the min_date /
# max_date names.
min_date, max_date = date_range_df.agg(F.min("date"), F.max("date")).first()
print(min_date, max_date)

2012-01-01 2023-12-31
