# Dev/Test/Prod data set simulation in a Bronze layer

Creates t3_dev (1% sample) and t3_test (10% sample) from t3_prod, plus t3 (a full copy of t3_dev).
Also ensures the t2 (diabetes) data is available for the downstream Transformations notebook.

In [None]:
# Self-healing data load: ensure tables exist (copy job fallback)
import traceback

# Fallback sources on public blob storage (azureopendatastorage account), read
# as Parquet only when the corresponding lakehouse table is missing.
# Source for t3_prod — presumably the public-holidays dataset, judging by the
# container name; confirm against the copy job definition.
BLOB_T3_PROD = "wasbs://holidaydatacontainer@azureopendatastorage.blob.core.windows.net/Processed/"
# Source for t2 (the diabetes sample data).
BLOB_T2 = "wasbs://mlsamples@azureopendatastorage.blob.core.windows.net/diabetes/"

def table_exists(name):
    """Report whether *name* can be read as a table through the Spark session.

    The check is a probe read: one row is fetched from the table, and any
    exception raised while doing so is interpreted as "not available".

    Args:
        name: Table name to look up.

    Returns:
        True when the probe read succeeds, False when it raises.
    """
    try:
        probe = spark.table(name).limit(1)
        probe.collect()
    except Exception:
        # Any failure (missing table, unreadable data, ...) counts as absent.
        return False
    else:
        return True

print("=== Checking Bronze lakehouse tables ===")

# Each required table is paired with the public blob path used as a fallback
# source when the table is not already present in the lakehouse.
for table_name, blob_path in (("t3_prod", BLOB_T3_PROD), ("t2", BLOB_T2)):
    if table_exists(table_name):
        print(f"  {table_name}: EXISTS (from copy job)")
        continue

    print(f"  {table_name}: MISSING — loading from public blob storage...")
    try:
        df = spark.read.parquet(blob_path)
        df.write.format("delta").mode("overwrite").saveAsTable(table_name)
        # Count the saved table instead of `df`: `df` is lazy, so df.count()
        # would scan the remote Parquet files a second time. Reading the table
        # back also reports what was actually persisted.
        loaded_rows = spark.table(table_name).count()
        print(f"  {table_name}: LOADED ({loaded_rows} rows)")
    except Exception as e:
        # Best-effort load: report the failure and keep going so the other
        # table still gets its chance to load.
        print(f"  {table_name}: LOAD FAILED — {e}")
        traceback.print_exc()

print("=== Data availability check complete ===")


In [None]:
# Build the derivative Bronze tables (t3_dev, t3_test, t3) out of t3_prod
def _overwrite_delta_table(frame, table_name):
    """Save *frame* as the Delta table *table_name* in overwrite mode."""
    frame.write.format("delta").mode("overwrite").saveAsTable(table_name)


print("Reading t3_prod...")
t3_prod_df = spark.table("t3_prod")
prod_count = t3_prod_df.count()
print(f"t3_prod row count: {prod_count}")

# Fixed seeds are passed to sample() for both the dev and the test sample.
print("Creating t3_dev (1% sample)...")
dev_sample = t3_prod_df.sample(fraction=0.01, seed=42)
_overwrite_delta_table(dev_sample, "t3_dev")

print("Creating t3_test (10% sample)...")
test_sample = t3_prod_df.sample(fraction=0.10, seed=123)
_overwrite_delta_table(test_sample, "t3_test")

# t3 is written from the saved t3_dev table, not from a fresh sample.
print("Creating t3 from t3_dev...")
saved_dev = spark.table("t3_dev")
_overwrite_delta_table(saved_dev, "t3")

print("All Bronze derivative tables created successfully!")


In [None]:
# Report the row count of every Bronze table; a failure on one table is
# printed and does not stop the remaining checks.
tables_to_verify = ("t3_prod", "t3_dev", "t3_test", "t3", "t2")
for table_name in tables_to_verify:
    try:
        row_total = spark.table(table_name).count()
    except Exception as err:
        print(f"  {table_name}: ERROR — {err}")
    else:
        print(f"  {table_name}: {row_total} rows")
print("Bronze layer verification complete.")
