In [None]:
import pandas as pd
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used throughout this notebook
spark = SparkSession.builder.appName("DeltaLab").getOrCreate()

# Tunables for the lab sample: how many rows to keep and where to store them
SAMPLE_ROWS = 5000
SAMPLE_OUTPUT_PATH = "silver/nyc_taxi_trips_delta_5000"

# Read the source Parquet file into pandas
pdf = pd.read_parquet("yellow_tripdata_2023-01.parquet")

# Trim in pandas first so that only the sample is converted to Spark
pdf_small = pdf.head(SAMPLE_ROWS)

# Convert only the sampled rows to a Spark DataFrame
df_small = spark.createDataFrame(pdf_small)

# Check schema and sample
df_small.printSchema()
df_small.show(5)

# Save locally. No Delta package is configured on this session, so the data is
# stored as plain Parquet (later cells read this path with spark.read.parquet).
# mode("overwrite") makes the cell re-runnable: the default save mode raises an
# error when the target path already exists.
df_small.write.format("parquet").mode("overwrite").save(SAMPLE_OUTPUT_PATH)

# Report the real row count instead of a hard-coded number
print(f"✅ Parquet dataset with {len(pdf_small)} records written successfully!")


In [5]:
# Load the saved sample back from disk to confirm the write succeeded
verify_path = "silver/nyc_taxi_trips_delta_5000"
df_verify = spark.read.load(verify_path)

# Preview a few rows, then report how many rows were persisted
df_verify.show(5)
row_total = df_verify.count()
print(f"Total rows: {row_total}")


                                                                                

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2023-01-01 00:35:43|  2023-01-01 00:38:52|            2.0|          0.5|       1.0|                 N|         229|         233|           1|        5.1|  3.5|    0.5|       2.

In [7]:
# Load the locally saved sample using the session's default data source
silver_sample_path = "silver/nyc_taxi_trips_delta_5000"
df_silver = spark.read.load(silver_sample_path)

# Inspect the column types, then preview the first rows
df_silver.printSchema()
df_silver.show(5)


root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------

Step 7: Basic Transformations

Filter trips with passengers > 0

Calculate trip duration

Select only relevant columns for silver layer

In [37]:
from pyspark.sql.functions import col, unix_timestamp

# Trip duration in minutes: difference of the two timestamps in epoch seconds / 60
pickup_secs = unix_timestamp("tpep_pickup_datetime")
dropoff_secs = unix_timestamp("tpep_dropoff_datetime")
duration_minutes = (dropoff_secs - pickup_secs) / 60

# Columns kept in the transformed silver table
silver_columns = [
    "VendorID",
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "tip_amount",
    "total_amount",
    "trip_duration_minutes",
]

# Keep only trips with at least one passenger, add the duration, project columns
df_silver_transformed = (
    df_silver
    .where(col("passenger_count") > 0)
    .withColumn("trip_duration_minutes", duration_minutes)
    .select(*silver_columns)
)

df_silver_transformed.show(5)


+--------+--------------------+---------------------+---------------+-------------+-----------+----------+------------+---------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|fare_amount|tip_amount|total_amount|trip_duration_minutes|
+--------+--------------------+---------------------+---------------+-------------+-----------+----------+------------+---------------------+
|       1| 2023-01-01 00:35:43|  2023-01-01 00:38:52|            2.0|          0.5|        5.1|       2.5|        12.6|                 3.15|
|       1| 2023-01-01 00:40:58|  2023-01-01 00:52:22|            1.0|          2.2|       12.8|       2.5|        20.3|                 11.4|
|       1| 2023-01-01 00:57:39|  2023-01-01 01:14:12|            1.0|          4.1|       19.1|       4.8|        28.9|                16.55|
|       2| 2023-01-01 00:48:54|  2023-01-01 00:52:19|            1.0|         0.49|        5.1|       0.0|        10.1|   3.4166666666666665|
|     

Step 8: Write transformed silver table back to disk (saved in Spark's default Parquet format, since no Delta package is configured)

In [11]:
# Persist the transformed silver data with the session's default format,
# replacing any output left behind by a previous run
silver_transformed_path = "silver/nyc_taxi_trips_silver_transformed"
silver_writer = df_silver_transformed.write.mode("overwrite")
silver_writer.save(silver_transformed_path)

print("✅ Silver table with transformations written successfully!")


✅ Silver table with transformations written successfully!


26/01/05 19:45:20 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
26/01/05 19:45:20 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
26/01/05 19:45:20 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
26/01/05 19:45:20 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
26/01/05 19:45:20 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
26/01/05 19:45:20 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
26/01/05 19:45:20 WARN MemoryManager: Total allocation exceeds 95.

Step 9: Create the Gold Table (Aggregations / Business-level metrics)

Purpose:

The Gold layer contains aggregated or business-ready data.

For NYC taxi data, this could be daily revenue per zone, total trips per vendor, average tip, etc.

In [14]:
from pyspark.sql.functions import col, sum, avg, count, to_date

# Read the saved 5000-row sample and derive a calendar date from the pickup
# timestamp for the daily aggregation.
# NOTE(review): this is the untransformed sample, not
# "silver/nyc_taxi_trips_silver_transformed"; that table does not keep
# PULocationID, which the grouping below needs. Confirm whether the
# passenger_count > 0 filter should also apply to these metrics.
silver_df = (
    spark.read.parquet("silver/nyc_taxi_trips_delta_5000")
    .withColumn("pickup_date", to_date(col("tpep_pickup_datetime")))
)

# Aggregate per pickup location per day: trip count, fare and tip totals,
# and mean trip distance
gold_df = silver_df.groupBy("pickup_date", "PULocationID").agg(
    count("*").alias("total_trips"),
    sum("fare_amount").alias("total_fare"),
    sum("tip_amount").alias("total_tip"),
    avg("trip_distance").alias("avg_trip_distance"),
)

# Show sample
gold_df.show(5)

# Write the Gold table locally. The format is stated explicitly so the write
# always agrees with the spark.read.parquet(...) call used to load it later,
# regardless of the session's default data source.
gold_df.write.format("parquet").mode("overwrite").save("gold/nyc_taxi_trips_gold")
print("✅ Gold table with aggregated metrics written successfully!")


+-----------+------------+-----------+------------------+------------------+------------------+
|pickup_date|PULocationID|total_trips|        total_fare|         total_tip| avg_trip_distance|
+-----------+------------+-----------+------------------+------------------+------------------+
| 2023-01-01|         137|         74|            1006.8|            200.24| 2.604054054054054|
| 2023-01-01|          12|          1|              12.1|              3.42|               1.9|
| 2023-01-01|         262|         61|             696.1|173.82000000000002|1.9636065573770491|
| 2023-01-01|          24|         15|224.13000000000002|             43.21|             2.704|
| 2023-01-01|         113|         56|            1025.5|213.61000000000004| 2.830535714285715|
+-----------+------------+-----------+------------------+------------------+------------------+
only showing top 5 rows
✅ Gold table with aggregated metrics written successfully!


Step 10: Query / Validate Gold Table / Build Simple Analytics

Purpose:

Verify the aggregated data is correct.

Example queries:

Top 5 zones by revenue

Average tip per day

Total trips per pickup location

In [16]:
# Reload the persisted gold aggregates from disk
gold_df = spark.read.parquet("gold/nyc_taxi_trips_gold")

# Rows with the highest total fare first
top_fare_rows = gold_df.orderBy(col("total_fare").desc())
top_fare_rows.show(5)

# Per pickup date: mean of the total_tip values across the gold rows
# (i.e. the average of per-location tip totals, not a per-trip average)
tip_by_day = gold_df.groupBy("pickup_date").agg(
    avg("total_tip").alias("avg_tip_per_day")
)
tip_by_day.show(5)

# Trip counts summed per pickup location, busiest locations first
trips_by_location = (
    gold_df.groupBy("PULocationID")
    .agg(sum("total_trips").alias("total_trips"))
    .orderBy(col("total_trips").desc())
)
trips_by_location.show(5)


+-----------+------------+-----------+------------------+------------------+------------------+
|pickup_date|PULocationID|total_trips|        total_fare|         total_tip| avg_trip_distance|
+-----------+------------+-----------+------------------+------------------+------------------+
| 2023-01-01|         132|        256|           13985.4|1850.5800000000002|14.395195312499999|
| 2023-01-01|          79|        280|            4235.9| 898.9199999999998| 2.660535714285714|
| 2023-01-01|         161|        190|3450.8999999999996|            638.95|2.8206842105263155|
| 2023-01-01|         142|        189|            3267.1| 611.5400000000001| 2.835978835978836|
| 2023-01-01|          48|        123|           3000.41|            523.46| 3.505040650406504|
+-----------+------------+-----------+------------------+------------------+------------------+
only showing top 5 rows
+-----------+------------------+
|pickup_date|   avg_tip_per_day|
+-----------+------------------+
| 2023-01-01|

In [18]:
import subprocess

local_folder = "silver/nyc_taxi_trips_delta_5000"
s3_path = "s3://my-data-lake-lab-nandnioubt/silver/nyc_taxi_trips_delta_5000"

# Mirror the local folder to S3. Spark gives part files new names on every
# write, so a plain recursive copy after re-running the notebook would leave
# the previous run's part files in the prefix and readers would see duplicated
# rows. `sync --delete` uploads new/changed files and removes objects that no
# longer exist locally. check=True raises CalledProcessError if the CLI fails.
subprocess.run(["aws", "s3", "sync", local_folder, s3_path, "--delete"], check=True)

print("✅ Folder uploaded to S3 successfully!")


upload: silver/nyc_taxi_trips_delta_5000/._SUCCESS.crc to s3://my-data-lake-lab-nandnioubt/silver/nyc_taxi_trips_delta_5000/._SUCCESS.crc
upload: silver/nyc_taxi_trips_delta_5000/.part-00000-6d76e298-18c3-47c6-8e84-813e70aac055-c000.snappy.parquet.crc to s3://my-data-lake-lab-nandnioubt/silver/nyc_taxi_trips_delta_5000/.part-00000-6d76e298-18c3-47c6-8e84-813e70aac055-c000.snappy.parquet.crc
upload: silver/nyc_taxi_trips_delta_5000/.part-00002-6d76e298-18c3-47c6-8e84-813e70aac055-c000.snappy.parquet.crc to s3://my-data-lake-lab-nandnioubt/silver/nyc_taxi_trips_delta_5000/.part-00002-6d76e298-18c3-47c6-8e84-813e70aac055-c000.snappy.parquet.crc
upload: silver/nyc_taxi_trips_delta_5000/.part-00001-6d76e298-18c3-47c6-8e84-813e70aac055-c000.snappy.parquet.crc to s3://my-data-lake-lab-nandnioubt/silver/nyc_taxi_trips_delta_5000/.part-00001-6d76e298-18c3-47c6-8e84-813e70aac055-c000.snappy.parquet.crc
upload: silver/nyc_taxi_trips_delta_5000/.part-00006-6d76e298-18c3-47c6-8e84-813e70aac055-c000

In [20]:
local_folder_gold = "gold/nyc_taxi_trips_gold"
s3_path_gold = "s3://my-data-lake-lab-nandnioubt/gold/nyc_taxi_trips_gold"

# Mirror the local gold folder to S3. The gold table is rewritten with
# mode("overwrite"), which produces differently named part files each run;
# `sync --delete` removes stale objects from the prefix so the S3 copy never
# contains part files from an earlier run. check=True raises on CLI failure.
subprocess.run(["aws", "s3", "sync", local_folder_gold, s3_path_gold, "--delete"], check=True)
print("✅ Gold folder uploaded to S3 successfully!")


upload: gold/nyc_taxi_trips_gold/.part-00000-0627d938-e96a-43fd-8a4d-9c04e70cc96e-c000.snappy.parquet.crc to s3://my-data-lake-lab-nandnioubt/gold/nyc_taxi_trips_gold/.part-00000-0627d938-e96a-43fd-8a4d-9c04e70cc96e-c000.snappy.parquet.crc
upload: gold/nyc_taxi_trips_gold/._SUCCESS.crc to s3://my-data-lake-lab-nandnioubt/gold/nyc_taxi_trips_gold/._SUCCESS.crc
upload: gold/nyc_taxi_trips_gold/_SUCCESS to s3://my-data-lake-lab-nandnioubt/gold/nyc_taxi_trips_gold/_SUCCESS
upload: gold/nyc_taxi_trips_gold/part-00000-0627d938-e96a-43fd-8a4d-9c04e70cc96e-c000.snappy.parquet to s3://my-data-lake-lab-nandnioubt/gold/nyc_taxi_trips_gold/part-00000-0627d938-e96a-43fd-8a4d-9c04e70cc96e-c000.snappy.parquet
✅ Gold folder uploaded to S3 successfully!
