In [0]:
from pyspark.sql.functions import *

In [0]:
# paths
# Catalog/schema identifiers and the name of the source file for this run.
catalog = 'nyc_taxi'
schema = 'nyc_schema'
file_name = "green_tripdata_2024-01.parquet"

# Volume locations derived from the identifiers above:
#   raw_path    -> the single parquet file inside raw_zone
#   bronze_path -> the bronze_zone directory
raw_path = '/Volumes/{}/{}/raw_zone/{}'.format(catalog, schema, file_name)
bronze_path = '/Volumes/{}/{}/bronze_zone'.format(catalog, schema)

In [0]:
# reading raw data
# Load the parquet file located at raw_path into a DataFrame.
df_raw = spark.read.parquet(raw_path)

# Render the loaded DataFrame in the notebook output.
display(df_raw)

In [0]:
# schema validation
# Column names the raw file is expected to provide.
expected_cols = [
    'VendorID',
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
    'store_and_fwd_flag',
    'RatecodeID',
    'PULocationID',
    'DOLocationID',
    'passenger_count',
    'trip_distance',
    'fare_amount',
    'extra',
    'mta_tax',
    'tip_amount',
    'tolls_amount',
    'ehail_fee',
    'improvement_surcharge',
    'total_amount',
    'payment_type',
    'trip_type',
    'congestion_surcharge',
]

# Compare expected vs. actual column names in both directions.
actual_cols = df_raw.columns
missing_cols = set(expected_cols) - set(actual_cols)    # expected but absent
unexpect_cols = set(actual_cols) - set(expected_cols)   # present but not expected

# Report only; this cell does not stop the notebook on a mismatch.
print(f"Missing columns: {missing_cols}")
print(f"Unexpected columns: {unexpect_cols}")

In [0]:
# check essential column existence
# Subset of columns that are treated as required.
required_columns = [
    'VendorID',
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
    'RatecodeID',
    'PULocationID',
    'DOLocationID',
    'passenger_count',
    'trip_distance',
    'fare_amount',
    'payment_type',
]

# Required names, in declaration order, that df_raw does not contain.
# The loop variable is deliberately not called `col`, to avoid confusion with
# the Spark `col` function brought in by the wildcard import.
missing_columns = [name for name in required_columns if name not in df_raw.columns]

# Report only; execution continues either way.
print(
    "All required columns are present."
    if not missing_columns
    else f"Missing required columns: {missing_columns}"
)

In [0]:
# null check for logging purpose
# Build one aggregate per column of df_raw: each row contributes 1 when the
# value is null and 0 otherwise, and the total is exposed under the alias
# "<column>_null_count".
# NOTE: `sum`, `when` and `col` are the Spark functions from the wildcard
# import, not Python builtins.
null_counts = df_raw.select(*(
    sum(when(col(name).isNull(), 1).otherwise(0)).alias(f"{name}_null_count")
    for name in df_raw.columns
))

# Print the aggregated counts to the cell output.
null_counts.show()

In [0]:
# adding metadata to bronze data for traceability and debugging
# Two columns are appended to the raw data:
#   ingestion_date -> current_date() at execution time
#   source_file    -> the `_metadata.file_path` field resolved through df_raw
df_bronze = (
    df_raw
    .withColumn('ingestion_date', current_date())
    .withColumn('source_file', df_raw['_metadata.file_path'])
)

# Render the enriched DataFrame in the notebook output.
df_bronze.display()

In [0]:
# logging duplicates
# Row count before and after de-duplicating whole rows of df_bronze
# (the added metadata columns are part of the comparison).
total_count = df_bronze.count()
distinct_count = df_bronze.distinct().count()

# The difference between the two counts is the number of duplicate rows.
print(
    f"Total rows: {total_count}",
    f"Distinct rows: {distinct_count}",
    f"Duplicate rows: {total_count - distinct_count}",
    sep="\n",
)

In [0]:
# writing bronze data to bronze zone
# Derive the fully qualified table name from the `catalog` and `schema`
# variables defined in the paths cell instead of repeating them as a literal,
# so the write target always follows the same configuration as the read path.
# With the current values this resolves to 'nyc_taxi.nyc_schema.bronze'.
bronze_table = f'{catalog}.{schema}.bronze'

# Overwrite mode replaces the existing table contents on every run.
df_bronze.write.format('delta').mode('overwrite').saveAsTable(bronze_table)