In [0]:
from pyspark.sql.functions import *

In [0]:
# Load the bronze table that this notebook refines into the silver layer.
BRONZE_TABLE = "nyc_taxi.nyc_schema.bronze"
df_bronze = spark.table(BRONZE_TABLE)

In [0]:
# Build one aggregate per bronze column that counts its null values, then
# render the resulting single-row summary.
null_count_exprs = [
    sum(when(col(column_name).isNull(), 1).otherwise(0)).alias(column_name + '_null_count')
    for column_name in df_bronze.columns
]
df_bronze.select(null_count_exprs).display()

In [0]:
# Null handling: remove the ehail_fee column, then discard every row that is
# missing a value in any of the essential columns listed below.
essential_cols = [
    "lpep_pickup_datetime",
    "lpep_dropoff_datetime",
    "trip_distance",
    "fare_amount",
    "total_amount",
]
df_silver = (
    df_bronze
    .drop('ehail_fee')
    .dropna(subset=essential_cols)
)

In [0]:
# Compare row counts after null removal (silver first, then bronze).
silver_row_count = df_silver.count()
bronze_row_count = df_bronze.count()
print(silver_row_count, bronze_row_count)

In [0]:
# Keep only rows with a positive distance, fare and total, and whose pickup
# timestamp is strictly earlier than the drop-off timestamp.
is_valid_trip = (
    (col("trip_distance") > 0)
    & (col('fare_amount') > 0)
    & (col('total_amount') > 0)
    & (col('lpep_pickup_datetime') < col('lpep_dropoff_datetime'))
)
df_silver = df_silver.filter(is_valid_trip)
df_silver.count()

In [0]:
# Preview five rows. display() has no row-count parameter, so the frame is
# limited explicitly before it is rendered.
df_silver.limit(5).display()

In [0]:
# Enrichment: derive the pickup date and hour, the trip duration in minutes,
# and two per-trip ratios rounded to two decimals.
pickup_ts = "lpep_pickup_datetime"
dropoff_ts = "lpep_dropoff_datetime"

# unix_timestamp() yields seconds, so the difference is divided by 60.
duration_minutes = (unix_timestamp(dropoff_ts) - unix_timestamp(pickup_ts)) / 60
fare_per_mile = round(col("fare_amount") / col("trip_distance"), 2)
# NOTE(review): this is tip / fare as a fraction (not multiplied by 100),
# despite the "percent" in the column name - confirm with consumers.
tip_share = round(col("tip_amount") / col("fare_amount"), 2)

df_silver = (
    df_silver
    .withColumn("pickup_date", to_date(pickup_ts))
    .withColumn("pickup_hour", hour(pickup_ts))
    .withColumn("trip_duration_minutes", duration_minutes)
    .withColumn("fare_per_mile", fare_per_mile)
    .withColumn("tip_percent", tip_share)
)

df_silver.select("pickup_date","pickup_hour","trip_duration_minutes","fare_per_mile","tip_percent").take(5)

In [0]:
# Print the first three rows of the enriched frame as plain text.
df_silver.show(n=3)

In [0]:
# Time-based flags: the pickup's day-of-week number and a weekend indicator.
weekend_day_numbers = (1, 7)  # dayofweek() numbers Sunday as 1 and Saturday as 7
df_silver = df_silver.withColumn("day_of_week", dayofweek('lpep_pickup_datetime'))
df_silver = df_silver.withColumn(
    "is_weekend",
    when(col("day_of_week").isin(*weekend_day_numbers), True).otherwise(False),
)

In [0]:
# Metadata: stamp every row with the current date as its processing date.
processing_date_expr = current_date()
df_silver = df_silver.withColumn("processing_date", processing_date_expr)

In [0]:
# Column reordering: project the frame onto the final silver column order.
final_col_order = [
    # identifiers and timestamps
    "VendorID", "lpep_pickup_datetime", "lpep_dropoff_datetime",
    # derived time fields
    "pickup_date", "pickup_hour", "day_of_week", "is_weekend",
    # trip attributes
    "store_and_fwd_flag", "RatecodeID", "PULocationID", "DOLocationID",
    "passenger_count", "trip_distance", "trip_duration_minutes",
    # amounts
    "fare_amount", "tip_amount", "tip_percent", "tolls_amount", "extra",
    "mta_tax", "improvement_surcharge", "congestion_surcharge",
    "total_amount", "fare_per_mile",
    # codes and lineage metadata
    "payment_type", "trip_type", "ingestion_date", "source_file",
    "processing_date",
]

df_silver = df_silver.select(*final_col_order)

In [0]:
# Remove rows that are exact duplicates across every column.
df_silver = df_silver.distinct()


In [0]:
#SILVER LAYER VALIDATION
# Prints diagnostics about the silver frame: schema mismatches, null counts on
# the essential columns, duplicate natural keys and basic value ranges.
# Nothing here raises; findings are reported through print()/show().
from pyspark.sql.types import DoubleType, TimestampType, IntegerType, BooleanType, DateType, StringType

# Apply the integer casts before validating, so the checks below run against
# the schema that is actually written to the silver table rather than the
# pre-cast one. The casts are idempotent, so the dedicated casting cell that
# follows this one is unaffected.
for int_col_name in ("RatecodeID", "passenger_count", "payment_type", "trip_type"):
    df_silver = df_silver.withColumn(int_col_name, col(int_col_name).cast(IntegerType()))

# Schema Validation
expected_schema = {
    "VendorID": IntegerType(),
    "lpep_pickup_datetime": TimestampType(),
    "lpep_dropoff_datetime": TimestampType(),
    "pickup_date": DateType(),
    "pickup_hour": IntegerType(),
    "day_of_week": IntegerType(),
    "is_weekend": BooleanType(),
    "store_and_fwd_flag": StringType(),
    "RatecodeID": IntegerType(),
    "PULocationID": IntegerType(),
    "DOLocationID": IntegerType(),
    "passenger_count": IntegerType(),
    "trip_distance": DoubleType(),
    "trip_duration_minutes": DoubleType(),
    "fare_amount": DoubleType(),
    "tip_amount": DoubleType(),
    "tip_percent": DoubleType(),
    "tolls_amount": DoubleType(),
    "extra": DoubleType(),
    "mta_tax": DoubleType(),
    "improvement_surcharge": DoubleType(),
    "congestion_surcharge": DoubleType(),
    "total_amount": DoubleType(),
    "fare_per_mile": DoubleType(),
    "payment_type": IntegerType(),
    "trip_type": IntegerType(),
    "ingestion_date": DateType(),
    "source_file": StringType(),
    "processing_date": DateType()
}

# Look fields up by name once so that a missing column is reported instead of
# raising KeyError and aborting the remaining checks.
actual_types = {field.name: field.dataType for field in df_silver.schema.fields}
for col_name, expected_type in expected_schema.items():
    actual_type = actual_types.get(col_name)
    if actual_type is None:
        print(f"[SCHEMA MISMATCH] Column '{col_name}' expected {expected_type} but the column is missing")
    elif not isinstance(actual_type, type(expected_type)):
        print(f"[SCHEMA MISMATCH] Column '{col_name}' expected {expected_type} but found {actual_type}")

# Null Check on essential fields
essential_cols = ["lpep_pickup_datetime", "lpep_dropoff_datetime", "trip_distance", "fare_amount", "total_amount"]
null_summary = df_silver.select([sum(when(col(c).isNull(), 1).otherwise(0)).alias(f"{c}_nulls") for c in essential_cols])
null_summary.show()

# Duplicate Check (based on natural keys)
# The printed number is the count of key combinations that occur more than
# once, not the total number of rows involved.
duplicate_check = df_silver.groupBy(
    "VendorID", "lpep_pickup_datetime", "DOLocationID", "PULocationID"
).count().filter("count > 1")
print(f" Duplicate records count: {duplicate_check.count()}")

# Value Range Check
df_silver.select("trip_distance", "fare_amount", "total_amount").summary("min", "max", "mean").show()


In [0]:
# Casting: convert the code-like columns to integers, one column at a time.
integer_columns = ("RatecodeID", "passenger_count", "payment_type", "trip_type")
for integer_column in integer_columns:
    df_silver = df_silver.withColumn(
        integer_column,
        col(integer_column).cast(IntegerType()),
    )

In [0]:

# Persist the silver frame as a Delta table, overwriting any existing contents.
silver_writer = df_silver.write.format('delta').mode('overwrite')
silver_writer.saveAsTable('nyc_taxi.nyc_schema.silver')