### Purpose
Ingest raw JSON files from ADLS into an append-only Bronze Delta table using Auto Loader (`cloudFiles`).

### Why Auto Loader
- Handles incremental file discovery
- Designed for cloud object storage (ADLS / S3 / GCS)
- Runs as a streaming source, so files that land later are picked up on the next run

### Set up the catalog and schema for the Bronze table




In [0]:
%sql
-- Create the working schema if it is missing, then make it the session default
-- so that later cells can refer to tables by their unqualified names.
CREATE SCHEMA IF NOT EXISTS databricks_cata.price_movers;
USE CATALOG databricks_cata;
USE SCHEMA price_movers;

-- Sanity check: show which catalog and schema are now active.
SELECT current_catalog(), current_schema()

In [0]:
# --- Ingest configuration: source folder, checkpoint root, target table ---

# ADLS folder that holds the raw JSON files to ingest
raw_base_path = "abfss://source@giftmapote2ete.dfs.core.windows.net/databricks-price-movers/raw"

# Root folder for this stream's state (checkpoint and inferred schema live beneath it)
checkpoint_path = "abfss://source@giftmapote2ete.dfs.core.windows.net/databricks-price-movers/checkpoints/bronze_prices_raw"

# Unqualified name of the Bronze Delta table; resolved against the active catalog/schema
bronze_table = "bronze_prices_raw"

# Echo the settings so the run output records what was used
for _label, _value in (
    ("raw_base_path", raw_base_path),
    ("checkpoint_path", checkpoint_path),
    ("bronze_table", bronze_table),
):
    print(f"{_label}: ", _value)

In [0]:
# Quick validation: list the raw source folder to confirm the path is reachable
# and that files are visible before starting the stream.
dbutils.fs.ls(raw_base_path)

### Starting Auto Loader Stream into Bronze

In [0]:
# Set the session's default catalog and schema so that the unqualified
# bronze_table name passed to the stream writer resolves to
# databricks_cata.price_movers. The catalog must be selected before the schema.
spark.sql("USE CATALOG databricks_cata")
spark.sql("USE SCHEMA price_movers")

In [0]:
from pyspark.sql import functions as F

# Source: incrementally discover and read JSON files under raw_base_path
# with Auto Loader. Only files not already recorded in the checkpoint are read.
raw_stream_df = (
  spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    # When a new field shows up in the source, add it to the tracked schema.
    # Per the Auto Loader docs the stream stops once on the first occurrence
    # and picks the new column up on the next run.
    .option("cloudFiles.schemaEvolutionMode", "addNewColumns")
    # Persist the inferred schema so it survives across runs
    .option("cloudFiles.schemaLocation", f"{checkpoint_path}/_schema")
    # Infer numeric/boolean/etc. types instead of reading every field as string
    .option("cloudFiles.inferColumnTypes", "true")
    # Values that do not fit the schema are captured here instead of being dropped
    .option("cloudFiles.rescuedDataColumn", "_rescued_data")
    # Read recursively under raw_base_path
    .load(raw_base_path)
    # Ingestion metadata: load time and originating file
    .withColumn("ingest_ts", F.current_timestamp())
    # Use the file metadata column rather than input_file_name(): the latter is
    # deprecated and not supported on Unity Catalog standard/shared access mode
    # compute, which this notebook targets (it selects a catalog with USE CATALOG).
    .withColumn("source_file", F.col("_metadata.file_path"))
)

# Sink: append to the Bronze Delta table.
query = (
  raw_stream_df
    .writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", f"{checkpoint_path}/_checkpoint")
    # Let the Delta table accept the columns that addNewColumns introduces on the
    # read side; without this the write fails as soon as the source schema grows.
    .option("mergeSchema", "true")
    .trigger(availableNow=True) # Ingest what exists and finish
    .toTable(bronze_table)
)

# Block until the available-now batch(es) finish; raises if the stream failed,
# so the completion message is only printed on success.
query.awaitTermination()
print("Bronze Load Complete !!")


### Confirm the table exists and contains rows

In [0]:
%sql
-- Row count of the Bronze table. The query fails if the table was not created,
-- and a non-zero count confirms the load wrote data.
SELECT COUNT(*) FROM bronze_prices_raw;

-- Optional spot check of the most recently ingested rows.
-- NOTE(review): pulled_at_utc is presumably a field from the raw JSON — confirm it exists before running.
-- SELECT ingest_ts, source_file, pulled_at_utc FROM bronze_prices_raw ORDER BY ingest_ts DESC LIMIT 10