In [1]:
from pyspark.sql import SparkSession

# Create the notebook's Spark session, or reuse one that is already running
# under the same application name.
spark = (
    SparkSession.builder
    .appName("InlineDataSeeding")
    .getOrCreate()
)

In [2]:
# Landing-zone targets for the seeded sample data (written as JSON in Step 1
# and read back unchanged in Step 2).
LANDING_ORDERS    = "/tmp/landing/orders.json"     # order records
LANDING_CUSTOMERS = "/tmp/landing/customers.json"  # customer records

In [3]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

print("STEP 1: Seeding inline data to landing (JSON) ...")

# --- Inline sample data ------------------------------------------------------
# Orders: (order_id, customer_id, order_ts, amount, status)
orders_rows = [
    (1, "C001", "2025-08-08 09:00:00", 12000, "placed"),
    (2, "C002", "2025-08-08 09:05:00", 4500, "placed"),
    (3, "C001", "2025-08-08 09:10:00", 22000, "cancelled"),
    (4, "C003", "2025-08-08 09:15:00", 800, "placed"),
]
# Customers: (customer_id, name, city)
customers_rows = [
    ("C001", "Ananya", "Bengaluru"),
    ("C002", "Rahul", "Hyderabad"),
    ("C003", "Meera", "Pune"),
]

# --- Explicit schemas (every field nullable, the default) --------------------
# order_ts is declared as a string here because the inline rows carry it as
# text; it is parsed into a timestamp when the DataFrame is built.
orders_schema = (
    T.StructType()
    .add("order_id", T.IntegerType())
    .add("customer_id", T.StringType())
    .add("order_ts", T.StringType())
    .add("amount", T.IntegerType())
    .add("status", T.StringType())
)
cust_schema = (
    T.StructType()
    .add("customer_id", T.StringType())
    .add("name", T.StringType())
    .add("city", T.StringType())
)

# --- Build the DataFrames ----------------------------------------------------
orders_df = spark.createDataFrame(orders_rows, schema=orders_schema).withColumn(
    "order_ts", F.to_timestamp("order_ts")
)
customers_df = spark.createDataFrame(customers_rows, schema=cust_schema)

# --- Persist to the landing zone, replacing whatever a previous run left -----
orders_df.write.json(LANDING_ORDERS, mode="overwrite")
customers_df.write.json(LANDING_CUSTOMERS, mode="overwrite")

print(
    "✅ Seeded landing JSON:",
    f"  {LANDING_ORDERS}",
    f"  {LANDING_CUSTOMERS}",
    sep="\n",
)

STEP 1: Seeding inline data to landing (JSON) ...
✅ Seeded landing JSON:
  /tmp/landing/orders.json
  /tmp/landing/customers.json


In [4]:
print("STEP 2: BRONZE Reading raw landing data (no transformations)")

# Bronze layer: load the landing JSON exactly as written. No schema is passed,
# so the reader infers column types from the files themselves.
bron_orders = spark.read.format("json").load(LANDING_ORDERS)
bron_customers = spark.read.format("json").load(LANDING_CUSTOMERS)

# Inspect orders: inferred schema first, then the rows without column truncation.
print("Bronze Orders - schema & sample:")
bron_orders.printSchema()
bron_orders.show(truncate=False)

# Same inspection for customers.
print("Bronze Customers schema & sample:")
bron_customers.printSchema()
bron_customers.show(truncate=False)

STEP 2: BRONZE Reading raw landing data (no transformations)
Bronze Orders - schema & sample:
root
 |-- amount: long (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_ts: string (nullable = true)
 |-- status: string (nullable = true)

+------+-----------+--------+------------------------+---------+
|amount|customer_id|order_id|order_ts                |status   |
+------+-----------+--------+------------------------+---------+
|22000 |C001       |3       |2025-08-08T09:10:00.000Z|cancelled|
|800   |C003       |4       |2025-08-08T09:15:00.000Z|placed   |
|12000 |C001       |1       |2025-08-08T09:00:00.000Z|placed   |
|4500  |C002       |2       |2025-08-08T09:05:00.000Z|placed   |
+------+-----------+--------+------------------------+---------+

Bronze Customers schema & sample:
root
 |-- city: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- name: string (nullable = true)

+---------+-----------+------

In [5]:
# STEP 0 — Setup
# One place for every storage location / table name used by the pipeline stages.
#
# NOTE(review): LANDING_ORDERS and LANDING_CUSTOMERS were already assigned
# earlier in this notebook (to /tmp/landing/*.json). From this cell onward they
# point at the folders below, so re-running the earlier seeding / bronze cells
# after this one would use these paths instead — confirm that is intended.

LANDING_ORDERS = "file:/tmp/dlt/landing/orders"        # landing: raw order files exactly as they arrive
LANDING_CUSTOMERS = "file:/tmp/dlt/landing/customers"  # landing: raw customer files exactly as they arrive
DELTA_SILVER_PATH = "file:/tmp/delta/sil_orders"       # silver: cleaned data in Delta format
DELTA_TABLE_NAME = "sil_orders_tbl"                    # SQL table name pointing to the silver Delta folder

# Echo the configuration so the reader can see where each stage reads/writes.
print(
    "\n".join(
        [
            "We will store data in:",
            f"Landing Orders folder:    {LANDING_ORDERS}",
            f"Landing Customers folder: {LANDING_CUSTOMERS}",
            f"Silver Delta folder:      {DELTA_SILVER_PATH}",
            f"SQL Table name:           {DELTA_TABLE_NAME}",
        ]
    )
)

We will store data in:
Landing Orders folder:    file:/tmp/dlt/landing/orders
Landing Customers folder: file:/tmp/dlt/landing/customers
Silver Delta folder:      file:/tmp/delta/sil_orders
SQL Table name:           sil_orders_tbl
