In [0]:
%sql
-- Reset the weather bronze table. IF EXISTS keeps the notebook runnable on a fresh
-- workspace (or on a second Run All) where the table has not been created yet.
-- NOTE(review): the weather stream's checkpoint is not cleared here, so files that
-- Auto Loader has already recorded may not be re-ingested after the drop -- confirm intent.
DROP TABLE IF EXISTS bronze_weather_data

In [0]:
import zipfile
import io

# Every folder used by the extraction step lives under the same storage container root.
_CONTAINER_ROOT = "abfss://divvycontainer@divvystorage1.dfs.core.windows.net"

# Landing folder holding the uploaded .zip archives.
ZIP_SOURCE_PATH = f"{_CONTAINER_ROOT}/trips"
# Destination for the CSV members extracted from each archive.
UNZIPPED_DEST_PATH = f"{_CONTAINER_ROOT}/raw"
# Archives are moved here once they have been extracted.
ARCHIVE_PATH = f"{_CONTAINER_ROOT}/trips_archive"

# Create the archive folder up front so the move after extraction has a target.
dbutils.fs.mkdirs(ARCHIVE_PATH)

def extract_and_write_zip_files():
    """Extract the CSV members of every ZIP in ZIP_SOURCE_PATH, then archive the ZIP.

    For each ``*.zip`` listed under ``ZIP_SOURCE_PATH`` the archive is read into
    driver memory, every real ``.csv`` member is written as text to
    ``UNZIPPED_DEST_PATH/<member name>`` (overwriting an existing file), and the
    archive is finally moved to ``ARCHIVE_PATH``.

    macOS metadata members (anything under ``__MACOSX/`` or whose file name starts
    with ``._``) are skipped: they can carry a ``.csv`` suffix but hold binary
    resource-fork data, not CSV rows, and would otherwise land in the raw folder.

    Relies on the notebook-provided ``spark`` and ``dbutils`` objects. Returns None.
    """
    zip_files = [
        f.path for f in dbutils.fs.ls(ZIP_SOURCE_PATH)
        if f.name.endswith('.zip')
    ]
    for zip_path in zip_files:
        # Read the whole archive as one binary row and pull its bytes to the driver.
        binary_df = spark.read.format("binaryFile").load(zip_path)
        zip_bytes = binary_df.collect()[0]['content']
        with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zip_ref:
            for contained_file in zip_ref.namelist():
                member_file_name = contained_file.rsplit('/', 1)[-1]
                # Skip macOS resource-fork entries; they are not real CSV data.
                if contained_file.startswith('__MACOSX/') or member_file_name.startswith('._'):
                    continue
                if not contained_file.lower().endswith('.csv'):
                    continue
                csv_bytes = zip_ref.read(contained_file)
                output_file_path = f"{UNZIPPED_DEST_PATH}/{contained_file}"
                # dbutils.fs.put only accepts text; undecodable bytes are dropped
                # (best effort, as before) instead of failing the whole archive.
                dbutils.fs.put(
                    output_file_path,
                    csv_bytes.decode("utf-8", errors='ignore'),
                    overwrite=True
                )
        # Move the processed archive out of the landing folder so it is not extracted again.
        file_name = zip_path.split('/')[-1]
        archive_target = f"{ARCHIVE_PATH}/{file_name}"
        dbutils.fs.mv(zip_path, archive_target)

# Run the extraction whenever this cell executes.
extract_and_write_zip_files()

In [0]:
# Trip Data
from pyspark.sql.types import StructType, StructField, StringType

# Bronze ingests every trip column as a nullable string; the silver cell of this
# notebook casts timestamps and coordinates to their real types.
trip_schema = StructType([
  StructField("ride_id", StringType(), True),
  StructField("rideable_type", StringType(), True),
  StructField("started_at", StringType(), True),
  StructField("ended_at", StringType(), True),
  StructField("start_station_name", StringType(), True),
  StructField("start_station_id", StringType(), True),
  StructField("end_station_name", StringType(), True),
  StructField("end_station_id", StringType(), True),
  StructField("start_lat", StringType(), True),
  StructField("start_lng", StringType(), True),
  StructField("end_lat", StringType(), True),
  StructField("end_lng", StringType(), True),
  StructField("member_casual", StringType(), True)
])

# The extraction cell writes the unzipped CSVs to the "raw" folder and moves the
# ZIPs out of "trips", so the stream has to read "raw"; pointing it at "trips"
# (the ZIP landing folder) finds no CSV data to ingest.
TRIP_CSV_SOURCE_PATH = "abfss://divvycontainer@divvystorage1.dfs.core.windows.net/raw"

# Incrementally load new CSV files with Auto Loader, process what is currently
# available (availableNow), write it to bronze_trip_data, and block until done.
(spark.readStream
  .format("cloudFiles")
  .option("cloudFiles.format", "csv")
  .option("header", "true")
  .option("cloudFiles.schemaLocation", "dbfs:/mnt//trips/trip_data/schema")
  .option("cloudFiles.rescuedDataColumn", "_rescued_data")
  .option("cloudFiles.schemaEvolutionMode", "rescue")
  .schema(trip_schema)
  .load(TRIP_CSV_SOURCE_PATH)
  .writeStream
  .option("checkpointLocation", "dbfs:/mnt/trips/trip_data/data")
  .trigger(availableNow=True)
  .toTable("bronze_trip_data")
).awaitTermination()

In [0]:
# Weather Data
from pyspark.sql.types import StructType, StructField, StringType

# Weather columns, all ingested as nullable strings.
weather_columns = [
  "YEAR", "MO", "DY", "HR", "TEMP",
  "PRCP", "HMDT", "WND_SPD", "ATM_PRESS", "REF",
]
weather_schema = StructType(
  [StructField(column_name, StringType(), True) for column_name in weather_columns]
)

# Auto Loader source: CSV files with a header row, read with the explicit schema above.
# NOTE(review): this loads the same ".../trips" folder as the trip stream, which looks
# like a copy-paste slip -- confirm the folder that actually holds the weather CSVs.
weather_source = (
  spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("header", "true")
    .option("cloudFiles.schemaLocation", "dbfs:/mnt/weather/weather_data/schema")
    .option("cloudFiles.rescuedDataColumn", "_rescued_data")
    .option("cloudFiles.schemaEvolutionMode", "rescue")
    .schema(weather_schema)
    .load("abfss://divvycontainer@divvystorage1.dfs.core.windows.net/trips")
)

# Process everything currently available and write it to the bronze weather table.
weather_query = (
  weather_source.writeStream
    .option("checkpointLocation", "dbfs:/mnt/weather_data/data")
    .trigger(availableNow=True)
    .toTable("bronze_weather_data")
)

# Block the cell until the availableNow batch has finished.
weather_query.awaitTermination()

In [0]:
%sql
-- Trip Data (silver layer)
-- Rebuilds silver_ride_fact from scratch on every run: the table is replaced and
-- then reloaded from the bronze trip table with proper data types.
CREATE OR REPLACE TABLE silver_ride_fact
(
  ride_key BIGINT GENERATED ALWAYS AS IDENTITY, -- Surrogate Key for the Fact Table
  ride_id STRING,
  rideable_type STRING,
  started_at TIMESTAMP,
  ended_at TIMESTAMP,
  start_station_id STRING, -- Keep as STRING based on data sample
  end_station_id STRING,   -- Keep as STRING based on data sample
  start_lat DOUBLE,
  start_lng DOUBLE,
  end_lat DOUBLE,
  end_lng DOUBLE,
  member_casual STRING
)
USING DELTA
COMMENT 'Curated fact table with correct data types and data quality checks';

-- Load the transformed data.
-- The target columns are listed explicitly so that ride_key is left to the identity
-- generator; the SELECT supplies 11 values while the table has 12 columns.
INSERT INTO silver_ride_fact (
  ride_id,
  rideable_type,
  started_at,
  ended_at,
  start_station_id,
  end_station_id,
  start_lat,
  start_lng,
  end_lat,
  end_lng,
  member_casual
)
SELECT
  ride_id,
  rideable_type,
  CAST(started_at AS TIMESTAMP) AS started_at,
  CAST(ended_at AS TIMESTAMP) AS ended_at,
  start_station_id,
  end_station_id,
  CAST(start_lat AS DOUBLE) AS start_lat,
  CAST(start_lng AS DOUBLE) AS start_lng,
  CAST(end_lat AS DOUBLE) AS end_lat,
  CAST(end_lng AS DOUBLE) AS end_lng,
  member_casual
FROM
  bronze_trip_data -- the table the trip stream in this notebook writes to
-- Basic data quality filter: rows without a ride_id are dropped.
WHERE ride_id IS NOT NULL;