In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Optional backfill parameter: the "file_path" text widget defaults to an
# empty string, which downstream cells treat as "no backfill requested".
BACKFILL_WIDGET = "file_path"
dbutils.widgets.text(BACKFILL_WIDGET, "")
file_path = dbutils.widgets.get(BACKFILL_WIDGET)

In [0]:
# Detect raw parquet files that have not been ingested yet.
# List the raw landing folder and keep only the parquet files.
files_info = dbutils.fs.ls("gs://nyc_taxi_analysis_project/raw/")
file_paths = [f.path for f in files_info if f.path.endswith(".parquet")]
# Paths already recorded in the bookkeeping table by earlier runs.
processed_files = spark.table("nyc_taxi.bronze.metadata_processed_files")\
    .select("file_name").collect()
processed_file_names = [row.file_name for row in processed_files]
# Test membership against a set so the comparison stays linear as the
# bookkeeping table grows (a list lookup would rescan it for every raw file).
_processed_lookup = set(processed_file_names)
new_files = [f for f in file_paths if f not in _processed_lookup]

In [0]:
# Ingestion mode selection: backfill the explicitly requested file(s), or fall
# back to the newly detected files when no backfill parameter was given.
# The widget value may hold several comma-separated paths; split it, trim
# surrounding whitespace and drop blank entries (e.g. from a trailing comma).
file_list = [f.strip() for f in file_path.split(",") if f.strip()]
if file_list:
    print(f'backfill: {file_path}')
    # Ingest each requested path individually rather than passing the raw
    # comma-joined string as if it were a single path.
    files_to_ingest = file_list
else:
    files_to_ingest = new_files

In [0]:
# Schema of the existing bronze table; incoming files are cast to match it.
_bronze_target = spark.table("nyc_taxi.bronze.data")
expected_schema = _bronze_target.schema

def align_schema(df, target_schema):
    """Cast the columns of ``df`` that also appear in ``target_schema``.

    Each field of ``target_schema`` whose name is present in ``df.columns``
    is replaced by the same column cast to that field's ``dataType``.
    Fields missing from ``df`` are skipped (no column is added), and columns
    of ``df`` that are not in ``target_schema`` are left unchanged.

    Args:
        df: DataFrame to align.
        target_schema: Iterable of fields exposing ``name`` and ``dataType``.

    Returns:
        The DataFrame with the matching columns cast.
    """
    aligned = df
    for target_field in target_schema:
        column_name = target_field.name
        if column_name not in aligned.columns:
            # Nothing to cast for a field the incoming data does not carry.
            continue
        casted = col(column_name).cast(target_field.dataType)
        aligned = aligned.withColumn(column_name, casted)
    return aligned

In [0]:
# Ingest each selected file into the bronze table, then record it as processed.
for fpath in files_to_ingest:
    raw_df = spark.read.format('parquet').load(fpath)

    # Derive the partition columns and attach lineage columns
    # (source file path and ingestion timestamp).
    enriched_df = (
        raw_df
        .withColumn("pickup_year", year("lpep_pickup_datetime"))
        .withColumn("pickup_month", month("lpep_pickup_datetime"))
        .withColumn("source_file", col("_metadata.file_path"))
        .withColumn("ingestion_time", current_timestamp())
    )

    # Cast the columns shared with the bronze table to its data types.
    bronze = align_schema(enriched_df, expected_schema)

    (
        bronze.write
        .mode('append')
        .format('delta')
        .partitionBy('pickup_year', 'pickup_month')
        .saveAsTable('nyc_taxi.bronze.data')
    )

    # Record the path only after the data write has returned, so the
    # new-file detection skips it on later runs.
    processed_marker = spark.createDataFrame([(fpath,)], ["file_name"])
    processed_marker.write.mode("append").saveAsTable("nyc_taxi.bronze.metadata_processed_files")

In [0]:
%sql
-- Show the processed-file bookkeeping records, with duplicate rows collapsed.
SELECT DISTINCT *
FROM nyc_taxi.bronze.metadata_processed_files