In [0]:
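# Unzip trip archives: extract the CSVs into the raw folder, then move each processed zip to the archive folder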
import zipfile
import io

# Folder that is scanned for zipped trip files to extract
ZIP_SOURCE_PATH = "abfss://divvycontainer@divvystorage1.dfs.core.windows.net/trips"
# Folder that receives the CSV files pulled out of each zip
UNZIPPED_DEST_PATH = "abfss://divvycontainer@divvystorage1.dfs.core.windows.net/raw"
# Folder that each zip is moved into after its CSVs have been written
ARCHIVE_PATH = "abfss://divvycontainer@divvystorage1.dfs.core.windows.net/trips_archive"

# Create the archive folder up front so the later move has a target directory
dbutils.fs.mkdirs(ARCHIVE_PATH)

def _is_extractable_csv(member_name):
    """Return True when a zip member is a real CSV payload worth writing out.

    Members that do not end in ``.csv`` (case-insensitive) are rejected, and so
    is macOS archive metadata: anything under a ``__MACOSX`` folder or whose
    base name starts with ``._``. Those metadata entries carry the ``.csv``
    suffix of the file they shadow but hold binary resource-fork data.
    """
    path_parts = member_name.split('/')
    if '__MACOSX' in path_parts or path_parts[-1].startswith('._'):
        return False
    return member_name.lower().endswith('.csv')


def extract_and_write_zip_files():
    """Extract the CSV members of every zip in ZIP_SOURCE_PATH, then archive the zip.

    For each ``.zip`` file (case-insensitive) listed directly under
    ZIP_SOURCE_PATH:

    1. The whole archive is read onto the driver via Spark's ``binaryFile``
       reader, so each zip must fit in driver memory.
    2. Every CSV member (see ``_is_extractable_csv``) is written to
       ``UNZIPPED_DEST_PATH/<member name>``, overwriting any existing file.
    3. The zip is moved to ARCHIVE_PATH under its original file name.

    A zip is only moved after all of its members were written; an exception
    while extracting propagates and leaves the zip in place.

    Returns:
        None. The function works purely through side effects on storage.
    """
    zip_files = [
        f.path for f in dbutils.fs.ls(ZIP_SOURCE_PATH)
        if f.name.lower().endswith('.zip')
    ]
    for zip_path in zip_files:
        # binaryFile yields one row per file; 'content' holds the raw bytes
        binary_df = spark.read.format("binaryFile").load(zip_path)
        zip_bytes = binary_df.collect()[0]['content']
        with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zip_ref:
            for contained_file in zip_ref.namelist():
                if not _is_extractable_csv(contained_file):
                    continue
                csv_bytes = zip_ref.read(contained_file)
                output_file_path = f"{UNZIPPED_DEST_PATH}/{contained_file}"
                # dbutils.fs.put is given text, hence the decode.
                # NOTE: errors='ignore' silently drops bytes that are not valid
                # UTF-8, so a non-UTF-8 source file loses characters here.
                dbutils.fs.put(
                    output_file_path,
                    csv_bytes.decode("utf-8", errors='ignore'),
                    overwrite=True
                )
        # Move processed zip to archive so it is not picked up again
        file_name = zip_path.split('/')[-1]
        archive_target = f"{ARCHIVE_PATH}/{file_name}"
        dbutils.fs.mv(zip_path, archive_target)

# Run the extraction now: writes the CSVs to the destination folder and moves each processed zip to the archive
extract_and_write_zip_files()

In [0]:
# Trip data: list every column of the bronze table next to its Spark type
df = spark.table("divvy.default.bronze_trip_data")
trip_column_types = spark.createDataFrame(df.dtypes, ["column", "type"])
display(trip_column_types)

In [0]:
# Weather data: list every column of the bronze table next to its Spark type
df = spark.table("divvy.default.bronze_weather_data")
weather_column_types = spark.createDataFrame(df.dtypes, ["column", "type"])
display(weather_column_types)

In [0]:
from pyspark.sql.functions import (
    col, date_format, year, quarter, month, dayofmonth, dayofweek, 
    when, lit, expr
)
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, BooleanType

# Calendar span covered by the day-level date dimension (both ends inclusive)
start_date = "2015-01-01"
end_date = "2030-12-31"

# One row per calendar day: build the whole date array in SQL, then explode it
date_sequence_query = f"""
  SELECT sequence(to_date('{start_date}'), to_date('{end_date}'), interval 1 day) as date_seq
"""
dates_df = spark.sql(date_sequence_query).selectExpr("explode(date_seq) as Full_Date")

# Reusable column expressions derived from the calendar date
full_date = col("Full_Date")
month_number = month(full_date)
weekday_number = dayofweek(full_date)  # Spark numbering: 1 = Sunday ... 7 = Saturday

# Weekend flag: Sunday (1) or Saturday (7)
is_weekend = when(weekday_number.isin([1, 7]), lit(True)).otherwise(lit(False))

# Meteorological seasons by month; anything not matched (Sep-Nov) is Fall
season = (
    when(month_number.isin([12, 1, 2]), "Winter")
    .when(month_number.isin([3, 4, 5]), "Spring")
    .when(month_number.isin([6, 7, 8]), "Summer")
    .otherwise("Fall")
)

# Project every attribute in its final column order in a single pass
dim_date_df = dates_df.select(
    date_format(full_date, "yyyyMMdd").cast(IntegerType()).alias("Date_Key"),
    full_date,
    year(full_date).alias("Year"),
    quarter(full_date).alias("Quarter"),
    month_number.alias("Month"),
    date_format(full_date, "MMMM").alias("Month_Name"),
    dayofmonth(full_date).alias("Day_of_Month"),
    weekday_number.alias("Day_of_Week"),
    date_format(full_date, "EEEE").alias("Day_Name"),
    is_weekend.alias("Is_Weekend"),
    season.alias("Season"),
)

# Persist as a managed table, replacing any previous version
dim_date_df.write.mode("overwrite").saveAsTable("divvy.default.DIM_Date_day")

display(dim_date_df)

In [0]:
from pyspark.sql.functions import explode, sequence, to_date, date_format, col, lit, when

# Calendar span covered by the quarter-hour time dimension (both ends inclusive)
start_date = "2015-01-01"
end_date = "2030-12-31"

# One row per calendar day: build the whole date array in SQL, then explode it
date_sequence_query = f"""
  SELECT sequence(to_date('{start_date}'), to_date('{end_date}'), interval 1 day) as date_seq
"""
dates_df = spark.sql(date_sequence_query).selectExpr("explode(date_seq) as Full_Date")

# Hours 0-23 and the four quarter-hour bucket starts
hours = spark.range(0, 24).withColumnRenamed("id", "Hour")
minute_buckets = spark.createDataFrame([(0,), (15,), (30,), (45,)], ["MinuteBucket"])

hour_of_day = col("Hour")

# Rush hour: between() is inclusive, so this flags hours 7, 8, 9 and 16, 17, 18
in_rush_window = hour_of_day.between(7, 9) | hour_of_day.between(16, 18)
is_rush_hour = when(in_rush_window, lit(True)).otherwise(lit(False))

# Day part: Night (0-5), Morning (6-11), Afternoon (12-17), otherwise Evening (18-23)
time_of_day = (
    when(hour_of_day.between(0, 5), "Night")
    .when(hour_of_day.between(6, 11), "Morning")
    .when(hour_of_day.between(12, 17), "Afternoon")
    .otherwise("Evening")
)

# Every (date, hour, minute bucket) combination, projected in final column order
dim_date_hour_df = (
    dates_df
    .crossJoin(hours)
    .crossJoin(minute_buckets)
    .select(
        date_format(col("Full_Date"), "yyyyMMdd").cast("int").alias("Date_Key"),
        "Hour",
        "MinuteBucket",
        is_rush_hour.alias("IsRushHour"),
        time_of_day.alias("Timeofday"),
    )
)

# Persist as a managed table, replacing any previous version
dim_date_hour_df.write.mode("overwrite").saveAsTable("divvy.default.DIM_date_hour")

display(dim_date_hour_df)