In [0]:
#DIM DATE DAY TABLE

from pyspark.sql.functions import (
    col, date_format, year, quarter, month, dayofmonth, dayofweek, 
    when, lit, expr
)
from pyspark.sql.types import StructType, StructField, StringType, DateType, BooleanType

# First and last calendar day covered by the day-level date dimension
start_date = "2015-01-01"
end_date = "2030-12-31"

# One row per calendar day: generate the day-by-day sequence and explode it
# into rows within a single query.
dates_df = spark.sql(
    f"SELECT explode(sequence(to_date('{start_date}'), to_date('{end_date}'), interval 1 day)) AS Full_Date"
)

# Derive every calendar attribute directly from Full_Date in a single projection,
# already in the final column order of the dimension.
full_date = col("Full_Date")
weekday_number = dayofweek(full_date)
month_number = month(full_date)

dim_date_df = dates_df.select(
    # Integer surrogate of the date in yyyyMMdd form
    date_format(full_date, "yyyyMMdd").cast("bigint").alias("Date_Key"),
    full_date,
    year(full_date).alias("Year"),
    quarter(full_date).alias("Quarter"),
    month_number.alias("Month"),
    date_format(full_date, "MMMM").alias("Month_Name"),
    dayofmonth(full_date).alias("Day_of_Month"),
    weekday_number.alias("Day_of_Week"),
    date_format(full_date, "EEEE").alias("Day_Name"),
    # Day_of_Week values 1 and 7 are flagged as the weekend
    when(weekday_number.isin([1, 7]), lit(True)).otherwise(lit(False)).alias("Is_Weekend"),
    # Seasons by month: Dec-Feb, Mar-May, Jun-Aug, remaining months are Fall
    (
        when(month_number.isin([12, 1, 2]), "Winter")
        .when(month_number.isin([3, 4, 5]), "Spring")
        .when(month_number.isin([6, 7, 8]), "Summer")
        .otherwise("Fall")
        .alias("Season")
    ),
)

# Persist the day-level dimension as a managed table, replacing any existing
# data and schema. No key constraint is declared by this write itself.
(
    dim_date_df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("divvy.default.DIM_Date_day")
)
display(dim_date_df)

In [0]:
%sql
-- dim_date_day_pk
-- Step 1: make Date_Key non-nullable before it is declared as the key.
ALTER TABLE divvy.default.dim_date_day ALTER COLUMN Date_Key SET NOT NULL;

-- Step 2: declare Date_Key as the primary key; NOT ENFORCED makes it informational only.
ALTER TABLE divvy.default.dim_date_day
  ADD CONSTRAINT dim_date_day_pk PRIMARY KEY (Date_Key) NOT ENFORCED

In [0]:
#DIM DATE HOUR TABLE

from pyspark.sql.functions import explode, sequence, to_date, date_format, col, lit, when

# First and last calendar day covered by the hour-level dimension
start_date = "2015-01-01"
end_date = "2030-12-31"

# One row per calendar day, built with the DataFrame API. With date bounds and
# no explicit step, `sequence` advances one day at a time.
dates_df = spark.range(1).select(
    explode(sequence(to_date(lit(start_date)), to_date(lit(end_date)))).alias("Full_Date")
)

# Hours of the day (0-23) and the four quarter-hour bucket starts
hours = spark.range(24).toDF("Hour")
quarter_hour_starts = [(minute,) for minute in (0, 15, 30, 45)]
minute_buckets = spark.createDataFrame(quarter_hour_starts, ["MinuteBucket"])

# Cartesian product: every date x every hour x every minute bucket
dates_by_hour = dates_df.crossJoin(hours)
dim_date_hour_df = dates_by_hour.crossJoin(minute_buckets)

# Build the final columns in one projection: the yyyyMMdd date key, the hour and
# minute bucket unchanged, plus two attributes derived from the hour.
hour_of_day = col("Hour")

# Rush hour: hours 7-9 and 16-18 (`between` is inclusive on both ends)
in_rush_window = hour_of_day.between(7, 9) | hour_of_day.between(16, 18)

# Night (0-5), Morning (6-11), Afternoon (12-17), Evening for everything else (18-23)
time_of_day_label = (
    when(hour_of_day.between(0, 5), "Night")
    .when(hour_of_day.between(6, 11), "Morning")
    .when(hour_of_day.between(12, 17), "Afternoon")
    .otherwise("Evening")
)

dim_date_hour_df = dim_date_hour_df.select(
    date_format(col("Full_Date"), "yyyyMMdd").cast("bigint").alias("Date_Key"),
    "Hour",
    "MinuteBucket",
    when(in_rush_window, lit(True)).otherwise(lit(False)).alias("IsRushHour"),
    time_of_day_label.alias("Timeofday"),
)

# Persist the quarter-hour dimension as a managed table, replacing existing rows
hour_dim_writer = dim_date_hour_df.write.mode("overwrite")
hour_dim_writer.saveAsTable("divvy.default.DIM_date_hour")

display(dim_date_hour_df)

In [0]:
%sql
--dim_date_hour_pk
-- Use the fully qualified name so these statements hit the same table the
-- dimension is written to (divvy.default.DIM_date_hour), regardless of the
-- session's current catalog/schema; this matches the dim_date_day cell.
ALTER TABLE divvy.default.dim_date_hour
ALTER COLUMN Date_Key SET NOT NULL;

-- NOTE(review): Date_Key repeats for every Hour/MinuteBucket combination of a
-- day, so it is not unique in this table; the key is informational only
-- (NOT ENFORCED). fact_trip's foreign keys reference dim_date_hour(Date_Key),
-- so any change to the key columns must be coordinated with that table.
ALTER TABLE divvy.default.dim_date_hour
ADD CONSTRAINT dim_date_hour_pk PRIMARY KEY (Date_Key)
NOT ENFORCED

In [0]:
%skip
%sql
-- DIM Station (this cell is marked %skip)
-- Station dimension: identity surrogate key plus the station name and coordinates.
CREATE TABLE Dim_Station (
  Station_Key   BIGINT GENERATED ALWAYS AS IDENTITY,
  Station_Name  STRING,
  Latitude      DOUBLE,
  Longitude     DOUBLE,
  CONSTRAINT pk_Station_Key PRIMARY KEY (Station_Key)
);

In [0]:
%sql
-- Remove any previous dim_station so it can be recreated from scratch.
-- IF EXISTS keeps this cell from failing when the table has not been created yet
-- (e.g. the first run in a fresh schema).
DROP TABLE IF EXISTS dim_station

In [0]:
%sql
-- DIM Station
-- Station dimension keyed by an identity surrogate, carrying the source
-- system's station ID as the natural key.
CREATE TABLE dim_station
(
    -- SURROGATE KEY (PK) - Auto-incrementing.
    -- Declared NOT NULL because primary-key columns must be non-nullable.
    Station_Key             BIGINT NOT NULL GENERATED ALWAYS AS IDENTITY,
    -- NATURAL KEY (for JOIN/MERGE)
    Station_ID_Natural      STRING NOT NULL COMMENT 'The original station ID from the source data.',
    -- Descriptive attributes (SCD Type 1 candidates)
    Station_Name            STRING,
    Latitude                DOUBLE,
    Longitude               DOUBLE,
    CONSTRAINT pk_station_key PRIMARY KEY (Station_Key)
)
USING DELTA;

In [0]:
%sql
-- DIM Rider
-- IF NOT EXISTS lets the notebook be re-run without failing on an existing
-- table (an existing table is left untouched, including its schema).
-- NOTE(review): the only attribute is Bike_Type; confirm that is the intended
-- content of the rider dimension.
CREATE TABLE IF NOT EXISTS Dim_Rider (
  -- Identity surrogate key; NOT NULL because primary-key columns must be non-nullable.
  Rider_key BIGINT NOT NULL GENERATED ALWAYS AS IDENTITY,
  Bike_Type STRING,
  CONSTRAINT pk_Rider_key PRIMARY KEY (Rider_key)
);

In [0]:
%sql
-- DIM Weather
-- Weather dimension keyed by a timestamp. Each weather aspect (temperature,
-- humidity, precipitation, wind) is stored as a text column and a numeric
-- "bin" column.
CREATE OR REPLACE TABLE dim_weather (
    -- NOT NULL because primary-key columns must be non-nullable.
    Weather_Key TIMESTAMP NOT NULL,
    Temp_type_text STRING,
    Temp_type_bin DOUBLE,
    Humidity_categories_text STRING,
    Humidity_categories_bin DOUBLE,
    Precipitation_types_text STRING,
    Precipitation_types_bin DOUBLE,
    Wind_type_categories_text STRING,
    Wind_type_categories_bin DOUBLE,
    -- Informational key only; uniqueness is not checked on write.
    PRIMARY KEY (Weather_Key) NOT ENFORCED
)
USING DELTA;

In [0]:
%sql
-- Remove any previous fact_trip so it can be recreated with the current
-- definition. IF EXISTS keeps this cell from failing when the table has not
-- been created yet.
DROP TABLE IF EXISTS fact_trip

In [0]:

# FACT TRIP TABLE
#
# One row per trip, with foreign keys to the date (day and hour), rider,
# station and weather dimensions and two measures (duration, distance).
# All constraints are declared NOT ENFORCED, i.e. informational only.
#
# Trip_Key is declared NOT NULL because primary-key columns must be non-nullable.
#
# NOTE(review): fk_start_hour / fk_end_hour reference dim_date_hour(Date_Key),
# and dim_date_hour holds one row per Date_Key/Hour/MinuteBucket combination,
# so a single BIGINT date key cannot identify one specific hour row — confirm
# the intended hour-level key.
spark.sql("""
CREATE TABLE IF NOT EXISTS fact_trip (
  Trip_Key STRING NOT NULL,
  -- Foreign Keys
  Start_Date_Key BIGINT,
  Start_Date_Hour_Key BIGINT,
  End_Date_Key BIGINT,
  End_Date_Hour_Key BIGINT,
  Rider_Key BIGINT,
  Start_Station_Key BIGINT,
  End_Station_Key BIGINT,
  Weather_Key TIMESTAMP,
  -- Measures
  Trip_Duration_Minutes DOUBLE,
  Trip_Distance_Km DOUBLE,

  -- Primary Key
  CONSTRAINT pk_fact_trip PRIMARY KEY (Trip_Key) NOT ENFORCED,

  -- Foreign Key constraints
  CONSTRAINT fk_start_date FOREIGN KEY (Start_Date_Key) REFERENCES dim_date_day(Date_Key) NOT ENFORCED,
  CONSTRAINT fk_start_hour FOREIGN KEY (Start_Date_Hour_Key) REFERENCES dim_date_hour(Date_Key) NOT ENFORCED,
  CONSTRAINT fk_end_date FOREIGN KEY (End_Date_Key) REFERENCES dim_date_day(Date_Key) NOT ENFORCED,
  CONSTRAINT fk_end_hour FOREIGN KEY (End_Date_Hour_Key) REFERENCES dim_date_hour(Date_Key) NOT ENFORCED,
  CONSTRAINT fk_rider FOREIGN KEY (Rider_Key) REFERENCES dim_rider(Rider_key) NOT ENFORCED,
  CONSTRAINT fk_start_station FOREIGN KEY (Start_Station_Key) REFERENCES dim_station(Station_Key) NOT ENFORCED,
  CONSTRAINT fk_end_station FOREIGN KEY (End_Station_Key) REFERENCES dim_station(Station_Key) NOT ENFORCED,
  CONSTRAINT fk_weather FOREIGN KEY (Weather_Key) REFERENCES dim_weather(Weather_Key) NOT ENFORCED
)
USING DELTA
""")