# 01_raw_ingestion
Purpose: Ingest CSV sample files into Bronze (Delta) using PySpark.
Author: Janak
Date: 2025-11-26


In [0]:
# Setup: adjust paths if needed
# DBFS directory that holds the sample CSV files to ingest.
dbfs_sample_dir = "/FileStore/data/sample"  # Databricks path exposed under /dbfs/FileStore/data/sample
# Root directory under which one Delta table per source file is written.
bronze_base = "/tmp/delta/bronze"          # Delta Bronze base path (change to /mnt/.. in prod)

from pyspark.sql import SparkSession
# Reuse the already-running session when there is one; otherwise build a new one.
spark = SparkSession.builder.getOrCreate()


In [0]:
# Ingest helper
def ingest_csv_to_delta_databricks(filename: str, table_name: str, input_dir: str=dbfs_sample_dir, bronze_base_path: str=bronze_base):
    """Read one headered CSV from DBFS and overwrite a Bronze Delta table with it.

    Schema inference is disabled, so every column is loaded as a string and
    the Bronze copy mirrors the raw file; typing is left to later stages.

    Args:
        filename: Name of the CSV file inside ``input_dir``.
        table_name: Sub-directory of ``bronze_base_path`` that receives the
            Delta table.
        input_dir: DBFS directory containing the file, e.g.
            ``/FileStore/data/sample``. A leading ``/dbfs`` (the local FUSE
            mount prefix) is stripped, and a value that already carries a URI
            scheme (``dbfs:/...``, ``s3://...``) is used unchanged.
        bronze_base_path: Base path under which Delta tables are written.

    Returns:
        A ``(df, delta_path)`` tuple: the DataFrame read from the CSV and the
        path the Delta table was written to.
    """
    # Spark resolves paths against the DBFS root, whereas "/dbfs/..." is only
    # the local-filesystem (FUSE) view of that root. Prefixing "/dbfs" for a
    # Spark read would therefore point at dbfs:/dbfs/..., so build an explicit
    # dbfs: URI instead.
    if input_dir.split("/", 1)[0].endswith(":"):
        # Already a full URI (dbfs:/..., s3://..., abfss://...): keep as is.
        input_path = f"{input_dir.rstrip('/')}/{filename}"
    else:
        directory = "/" + input_dir.strip("/")
        if directory == "/dbfs" or directory.startswith("/dbfs/"):
            directory = directory[len("/dbfs"):]
        input_path = f"dbfs:{directory.rstrip('/')}/{filename}"

    delta_path = f"{bronze_base_path}/{table_name}"
    print(f"Reading {input_path} -> writing Delta at {delta_path}")

    # header=True takes column names from the first row; inferSchema=False
    # keeps every column as a string.
    df = spark.read.option("header", True).option("inferSchema", False).csv(input_path)

    # mode("overwrite") replaces any table already stored at delta_path.
    df.write.format("delta").mode("overwrite").save(delta_path)
    return df, delta_path


In [0]:
# (source file name, Bronze table name) pairs, ingested in this order.
files_to_ingest = [
    ("orders.csv", "orders"),
    ("customers.csv", "customers"),
    ("products.csv", "products"),
    ("payments.csv", "payments"),
    ("inventory.csv", "inventory"),
]

# Maps table name -> {"df": source DataFrame, "delta_path": written location}.
ingested = {}
for fname, tname in files_to_ingest:
    df, path = ingest_csv_to_delta_databricks(filename=fname, table_name=tname)
    ingested[tname] = dict(df=df, delta_path=path)
    # NOTE: count() runs a Spark job over the DataFrame just to report its size.
    print(f"Ingested {tname}: rows={df.count()}, cols={len(df.columns)}")


In [0]:
# Show a five-row preview of every ingested table, preceded by a banner line.
for tname, meta in ingested.items():
    print("----", tname, "----")
    preview = meta["df"].limit(5)
    display(preview)


In [0]:
from delta import *
# Load each Bronze Delta table back from storage and register it as a temp
# view named "<table>_bronze" so it can be queried with SQL.
for tname, meta in ingested.items():
    delta_path = meta["delta_path"]
    df_loaded = spark.read.load(delta_path, format="delta")
    df_loaded.createOrReplaceTempView(f"{tname}_bronze")
    print(f"Registered temp view: {tname}_bronze")
