In [0]:
#Create Date Dimension
import pyspark.sql.functions as F

# Fetch both bounds in a single pass over silver_payment (one Spark job
# instead of two separate min/max scans).
date_bounds = spark.sql(
    "select date(min(date)) as min_date, date(max(date)) as max_date from silver_payment"
).collect()[0]
min_date = date_bounds["min_date"]
max_date = date_bounds["max_date"]

# min()/max() over an empty (or all-NULL) column come back as None; fail with
# a clear message instead of a TypeError on `max_date - min_date` below.
if min_date is None or max_date is None:
    raise ValueError("silver_payment contains no non-null dates; cannot build date_dim")

# NOTE(review): the calendar range is derived from silver_payment only, while
# the trip fact query joins date_dim on trip start/end dates with inner joins;
# trips dated outside this range would be dropped there -- confirm the payment
# range always covers the trip range.

# One row per calendar day in [min_date, max_date]. `id` (from spark.range) is
# the day offset from min_date and is used as the date key by the fact tables.
date_dim = (
    spark.range(0, (max_date - min_date).days + 1)
    .withColumn("date", F.date_add(F.lit(min_date), F.col("id").cast("integer")))
    .withColumn("day_of_week", F.dayofweek("date").cast("integer"))
    .withColumn("day_of_month", F.dayofmonth("date").cast("integer"))
    .withColumn("day_of_year", F.dayofyear("date").cast("integer"))
    .withColumn("week_of_year", F.weekofyear("date").cast("integer"))
    .withColumn("month", F.month("date").cast("integer"))
    .withColumn("month_name", F.monthname("date"))
    .withColumn("quarter", F.quarter("date").cast("integer"))
    .withColumn("year", F.year("date").cast("integer"))
)

In [0]:
# Persist the calendar DataFrame as the managed Delta table `date_dim`,
# replacing whatever the table held before.
(
    date_dim.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("date_dim")
)

  

In [0]:
#Create Payment Fact table
# CREATE OR REPLACE (rather than IF NOT EXISTS): date_dim is overwritten on
# every run and date_dim.id is a day offset from the current minimum date, so
# a fact table kept from an earlier run could hold date_id values that no
# longer point at the right calendar row.
# The join key is wrapped in date(...) -- the same normalisation the date_dim
# range query applies to silver_payment.date -- so the match is on the calendar
# day even if the column carries a time part.
spark.sql('''
    CREATE OR REPLACE TABLE payment_fact USING DELTA AS
    SELECT DISTINCT
        p.id  AS payment_id,
        dd.id AS date_id,
        p.amount,
        p.rider_id
    FROM silver_payment p
    JOIN date_dim dd
      ON dd.date = date(p.date)
''')

In [0]:
# Sanity check: print two arbitrary rows of the payment fact table.
payment_fact_preview = spark.sql("Select * from payment_fact limit 2")
payment_fact_preview.show()

+----------+-------+------+--------+
|payment_id|date_id|amount|rider_id|
+----------+-------+------+--------+
|       264|   2038|   9.0|    1011|
|       516|   2922| 17.99|    1021|
+----------+-------+------+--------+



In [0]:
##Create Dim_Rider
# Rider dimension: the de-duplicated rows of silver_rider stored as a Delta
# table. IF NOT EXISTS makes this a no-op when dim_rider is already present.
dim_rider_ctas = '''CREATE TABLE IF NOT EXISTS dim_rider USING DELTA AS 
          SELECT DISTINCT * FROM silver_rider'''
spark.sql(dim_rider_ctas)

In [0]:
##Create Dim Station
# Station dimension: the de-duplicated rows of silver_station stored as a Delta
# table. IF NOT EXISTS makes this a no-op when dim_station is already present.
dim_station_ctas = '''CREATE TABLE IF NOT EXISTS dim_station USING DELTA AS 
          SELECT DISTINCT  *  FROM silver_station'''
spark.sql(dim_station_ctas)

In [0]:
#Create Dim Ridetype table
# Ride-type dimension: one row per distinct rideable_type found in silver_trip.
# ride_type_id is a surrogate key numbered 1..N in rideable_type sort order.
# IF NOT EXISTS makes this a no-op when dim_ride_type is already present.
ride_type_ctas = '''
           CREATE TABLE IF NOT EXISTS dim_ride_type
USING DELTA
AS
SELECT
  ROW_NUMBER() OVER (ORDER BY rideable_type) AS ride_type_id,
  rideable_type
FROM
  silver_trip
GROUP BY
  rideable_type
  '''
spark.sql(ride_type_ctas)

In [0]:
##Create Trip Fact Table
# CREATE OR REPLACE (rather than IF NOT EXISTS): date_dim is overwritten on
# every run and date_dim.id is a day offset from the current minimum date, so
# a fact table kept from an earlier run could hold start_date_id/end_date_id
# values that no longer point at the right calendar row.
# All joins are inner joins: a trip is kept only when its rider, both stations,
# its ride type and both calendar days exist in the respective dimensions.
spark.sql('''
    CREATE OR REPLACE TABLE trip_fact USING DELTA AS
    SELECT
        t.trip_id,
        t.rider_id,
        t.start_station_id,
        t.end_station_id,
        rt.ride_type_id,
        sdd.id AS start_date_id,
        edd.id AS end_date_id,
        t.started_at,
        t.ended_at,
        -- trip length in minutes: elapsed seconds / 60, rounded to the nearest whole minute
        round(datediff(second, t.started_at, t.ended_at) / 60) AS trip_duration,
        -- rider age in completed years at the moment the trip started
        floor(months_between(t.started_at, r.birthday) / 12) AS rider_age,
        r.is_member
    FROM silver_trip t
    JOIN dim_rider     r   ON t.rider_id       = r.rider_id
    JOIN dim_station   s   ON s.station_id     = t.start_station_id
    JOIN dim_station   se  ON se.station_id    = t.end_station_id
    JOIN dim_ride_type rt  ON rt.rideable_type = t.rideable_type
    JOIN date_dim      sdd ON sdd.date         = date(t.started_at)
    JOIN date_dim      edd ON edd.date         = date(t.ended_at)
''')

In [0]:
# Sanity check: print two arbitrary rows of the trip fact table.
trip_fact_preview = spark.sql("select * from trip_fact limit 2")
trip_fact_preview.show()

+----------------+--------+----------------+--------------+------------+-------------+-----------+-------------------+-------------------+-------------+---------+---------+
|         trip_id|rider_id|start_station_id|end_station_id|ride_type_id|start_date_id|end_date_id|         started_at|           ended_at|trip_duration|rider_age|is_member|
+----------------+--------+----------------+--------------+------------+-------------+-----------+-------------------+-------------------+-------------+---------+---------+
|222BB8E5059252D7|   34062|    KA1503000064|         13021|           1|         3054|       3054|2021-06-13 09:48:47|2021-06-13 10:07:23|         19.0|       30|     true|
|1826E16CB5486018|    5342|    TA1306000010|         13021|           1|         3062|       3062|2021-06-21 22:59:13|2021-06-21 23:04:29|          5.0|       26|     true|
+----------------+--------+----------------+--------------+------------+-------------+-----------+-------------------+-----------------