In [0]:
# Databricks notebook source


# COMMAND ----------

# Import necessary libraries
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType, LongType
import pandas as pd
import uuid
from datetime import datetime

# COMMAND ----------

# DBTITLE 1,Step 1: Setup - Create ALL Mock Bronze Tables in a Temporary Schema
# A UUID-based suffix (dashes swapped for underscores) gives every run its own
# schema name, so runs do not collide with each other.
test_run_id = str(uuid.uuid4()).replace("-", "_")
temp_catalog = "hive_metastore"
temp_schema = f"silver_test_workspace_{test_run_id}"

spark.sql(f"CREATE SCHEMA IF NOT EXISTS {temp_catalog}.{temp_schema}")
print(f"Temporary test schema created: {temp_catalog}.{temp_schema}")

# --- Schemas for every Bronze table ---
dist_centers_schema = StructType([
    StructField("id", IntegerType()),
    StructField("name", StringType()),
    StructField("latitude", DoubleType()),
    StructField("longitude", DoubleType()),
])

events_schema = StructType([
    StructField("id", IntegerType()),
    StructField("user_id", IntegerType()),
    StructField("sequence_number", LongType()),
    StructField("session_id", StringType()),
    StructField("created_at", TimestampType()),
    StructField("ip_address", StringType()),
    StructField("city", StringType()),
    StructField("state", StringType()),
    StructField("postal_code", StringType()),
    StructField("browser", StringType()),
    StructField("traffic_source", StringType()),
    StructField("uri", StringType()),
    StructField("event_type", StringType()),
])

inventory_items_schema = StructType([
    StructField("id", IntegerType()),
    StructField("product_id", IntegerType()),
    StructField("created_at", TimestampType()),
    StructField("sold_at", TimestampType()),
    StructField("cost", DoubleType()),
    StructField("product_category", StringType()),
])

order_items_schema = StructType([
    StructField("id", IntegerType()),
    StructField("order_id", IntegerType()),
    StructField("user_id", IntegerType()),
    StructField("product_id", IntegerType()),
    StructField("inventory_item_id", IntegerType()),
    StructField("status", StringType()),
    StructField("created_at", TimestampType()),
    StructField("shipped_at", TimestampType()),
    StructField("delivered_at", TimestampType()),
    StructField("returned_at", TimestampType()),
    StructField("sale_price", DoubleType()),
])

orders_schema = StructType([
    StructField("order_id", IntegerType()),
    StructField("user_id", IntegerType()),
    StructField("status", StringType()),
    StructField("gender", StringType()),
    StructField("created_at", TimestampType()),
    StructField("returned_at", TimestampType()),
    StructField("shipped_at", TimestampType()),
    StructField("delivered_at", TimestampType()),
    StructField("num_of_item", IntegerType()),
])

products_schema = StructType([
    StructField("id", IntegerType()),
    StructField("cost", DoubleType()),
    StructField("category", StringType()),
])

users_schema = StructType([
    StructField("id", IntegerType()),
    StructField("first_name", StringType()),
    StructField("last_name", StringType()),
    StructField("email", StringType()),
    StructField("age", IntegerType()),
    StructField("gender", StringType()),
    StructField("state", StringType()),
    StructField("street_address", StringType()),
    StructField("postal_code", StringType()),
    StructField("city", StringType()),
    StructField("country", StringType()),
    StructField("latitude", DoubleType()),
    StructField("longitude", DoubleType()),
    StructField("traffic_source", StringType()),
    StructField("created_at", TimestampType()),
])

# --- Mock rows ---
# Each data set deliberately contains padded strings, NULLs and a repeated key.
dist_centers_data = [
    (1, "Center A ", 34.05, -118.24),           # trailing space in name
    (2, "Center B", None, -73.93),              # NULL latitude
    (3, None, 40.71, -74.00),                   # NULL name
    (1, "Center A Duplicate", 34.05, -118.24),  # repeated id 1
]

users_data = [
    # padded first name, NULL street address
    (101, " John ", "Doe", "j.doe@email.com", 30, "Male", "CA", None, "90210", "LA", "USA", 34.05, -118.24, "google", datetime(2023, 1, 1)),
    # NULL last name, NULL created_at
    (102, "Jane", None, "jane@email.com", 25, "Female", "NY", "123 Main", "10001", "NYC", "USA", 40.71, -74.00, "facebook", None),
    # repeated id 101
    (101, "John", "Doe", "j.doe2@email.com", 31, "Male", "CA", "456 Oak", "90210", "LA", "USA", 34.05, -118.24, "google", datetime(2023, 1, 2)),
]

orders_data = [
    (201, 101, "Shipped ", "Male", datetime(2023, 1, 1), None, None, None, 2),      # trailing space in status
    (202, 102, "Processing", "Female", datetime(2023, 1, 2), None, None, None, 1),
    (201, 101, "Delivered", "Male", datetime(2023, 1, 1), None, None, None, 2),     # repeated order_id 201
]

products_data = [
    (301, 10.50, "Books "),       # trailing space in category
    (302, None, "Electronics"),   # NULL cost
    (301, 12.00, "Books"),        # repeated id 301
]

# --- Persist the tables ---
# The last three tables carry no test data; they are written empty so that the
# notebook under test can still run.
_bronze_tables = [
    ("distribution_centers", dist_centers_data, dist_centers_schema),
    ("users", users_data, users_schema),
    ("orders", orders_data, orders_schema),
    ("products", products_data, products_schema),
    ("events", [], events_schema),
    ("inventory_items", [], inventory_items_schema),
    ("order_items", [], order_items_schema),
]
for _table, _rows, _schema in _bronze_tables:
    _bronze_df = spark.createDataFrame(_rows, _schema)
    _bronze_df.write.mode("overwrite").saveAsTable(f"{temp_catalog}.{temp_schema}.{_table}")

print("All mock Bronze tables created.")

# COMMAND ----------

# DBTITLE 2,Step 2: Define Expected Silver Outputs
# Compared with the mock Bronze rows, the expectations below have string padding
# removed, NULL strings replaced by "Unknown", NULL doubles replaced by 0.0 and
# one row per key.

# --- 'distribution_centers' ---
# Two variants are kept because either of the two Bronze rows with id 1 may be
# the one that remains.
expected_dist_centers_schema = dist_centers_schema
_dist_centers_shared_rows = [
    (2, "Center B", 0.0, -73.93),
    (3, "Unknown", 40.71, -74.00),
]
expected_dist_centers_data1 = [(1, "Center A", 34.05, -118.24)] + _dist_centers_shared_rows
expected_dist_centers_data2 = [(1, "Center A Duplicate", 34.05, -118.24)] + _dist_centers_shared_rows
expected_dist_centers_df1 = spark.createDataFrame(
    expected_dist_centers_data1,
    expected_dist_centers_schema,
)
expected_dist_centers_df2 = spark.createDataFrame(
    expected_dist_centers_data2,
    expected_dist_centers_schema,
)

# --- 'users' ---
expected_users_schema = users_schema
expected_users_data = [
    (101, "John", "Doe", "j.doe@email.com", 30, "Male", "CA", "Unknown", "90210", "LA", "USA", 34.05, -118.24, "google", datetime(2023, 1, 1)),
    (102, "Jane", "Unknown", "jane@email.com", 25, "Female", "NY", "123 Main", "10001", "NYC", "USA", 40.71, -74.00, "facebook", None),
]
expected_users_df = spark.createDataFrame(expected_users_data, expected_users_schema)

# --- 'orders' ---
# NOTE(review): only the "Shipped" variant of order 201 is expected here, whereas
# 'distribution_centers' accepts either duplicate -- confirm that the Silver
# notebook picks the surviving row deterministically.
expected_orders_schema = orders_schema
expected_orders_data = [
    (201, 101, "Shipped", "Male", datetime(2023, 1, 1), None, None, None, 2),
    (202, 102, "Processing", "Female", datetime(2023, 1, 2), None, None, None, 1),
]
expected_orders_df = spark.createDataFrame(expected_orders_data, expected_orders_schema)

# --- 'products' ---
expected_products_schema = products_schema
expected_products_data = [
    (301, 10.50, "Books"),
    (302, 0.0, "Electronics"),
]
expected_products_df = spark.createDataFrame(expected_products_data, expected_products_schema)

print("All Expected Silver DataFrames defined.")

# COMMAND ----------

# DBTITLE 3,Step 3: Execute the Main Notebook
# Runs the Silver Layer notebook against the temporary schema, checks the four
# tables that were seeded with data, always drops the temporary schema, and
# finally raises if anything failed so that the notebook run itself is reported
# as failed instead of silently succeeding.

# Maps a check name to 'PASSED' or 'FAILED: <reason>'.
test_results = {}


def _check_distribution_centers():
    """Assert 'distribution_centers' equals one of the two accepted outcomes."""
    actual_df = spark.table(f"{temp_catalog}.{temp_schema}.distribution_centers")
    actual_pd = actual_df.sort("id").toPandas()
    accepted = (expected_dist_centers_df1, expected_dist_centers_df2)
    if not any(actual_pd.equals(candidate.sort("id").toPandas()) for candidate in accepted):
        raise AssertionError("Actual output did not match expected outputs.")


def _check_users():
    """Assert user 102 has a filled 'created_at' and all other columns match."""
    actual_df = spark.table(f"{temp_catalog}.{temp_schema}.users")
    row = actual_df.filter("id = 102").select("created_at").first()
    # first() returns None when no row matches; report that explicitly rather
    # than failing with an opaque "'NoneType' object is not subscriptable".
    if row is None:
        raise AssertionError("User 102 is missing from the 'users' table")
    if row[0] is None:
        raise AssertionError("Timestamp for user 102 should be filled")
    # 'created_at' is excluded from the comparison: its filled-in value for
    # user 102 is not part of the expected data.
    actual_pd = actual_df.drop("created_at").sort("id").toPandas()
    expected_pd = expected_users_df.drop("created_at").sort("id").toPandas()
    pd.testing.assert_frame_equal(actual_pd, expected_pd)


def _check_orders():
    """Assert order 201 has its NULL timestamps filled and the rest matches."""
    timestamp_cols = ("returned_at", "shipped_at", "delivered_at")
    actual_df = spark.table(f"{temp_catalog}.{temp_schema}.orders")
    filled_timestamps = actual_df.filter("order_id = 201").select(*timestamp_cols).first()
    if filled_timestamps is None:
        raise AssertionError("Order 201 is missing from the 'orders' table")
    for col_name in timestamp_cols:
        if filled_timestamps[col_name] is None:
            raise AssertionError(f"The '{col_name}' column for order 201 should have been filled.")
    # Compare the rest of the data after dropping the non-deterministic columns.
    actual_pd = actual_df.drop(*timestamp_cols).sort("order_id").toPandas()
    expected_pd = expected_orders_df.drop(*timestamp_cols).sort("order_id").toPandas()
    pd.testing.assert_frame_equal(actual_pd, expected_pd)


def _check_products():
    """Assert 'products' matches the expected rows exactly."""
    actual_df = spark.table(f"{temp_catalog}.{temp_schema}.products")
    actual_pd = actual_df.sort("id").toPandas()
    expected_pd = expected_products_df.sort("id").toPandas()
    pd.testing.assert_frame_equal(actual_pd, expected_pd)


def _run_table_check(table_name, check):
    """Run one table check and record its outcome in ``test_results``.

    Failures are recorded rather than re-raised so that the remaining tables
    are still checked; the overall run is failed after cleanup.
    """
    print(f"\n--- ASSERTING '{table_name}' ---")
    try:
        check()
    except Exception as e:
        print(f"❌ TEST FAILED: '{table_name}'")
        test_results[table_name] = f'FAILED: {e}'
    else:
        print(f"✅ TEST PASSED: '{table_name}'")
        test_results[table_name] = 'PASSED'


try:
    print("Executing the main Silver Layer notebook...")
    # !!! IMPORTANT !!!
    # Replace the path below with the correct path to your main Silver Layer notebook.
    notebook_to_test = "/Workspace/final_project/project/silver2"

    # Source and target both point at the temporary schema, so the results are
    # read back from that same schema below.
    params = {
        "source_catalog": temp_catalog, "source_schema": temp_schema,
        "target_catalog": temp_catalog, "target_schema": temp_schema
    }

    # Second argument is the timeout in seconds.
    dbutils.notebook.run(notebook_to_test, 600, params)
    print("Main notebook execution complete.")

    # Step 4: Assert results for each table. This stays inside the same try
    # block so that the cleanup in `finally` runs whatever happens.
    for table_name, check in (
        ('distribution_centers', _check_distribution_centers),
        ('users', _check_users),
        ('orders', _check_orders),
        ('products', _check_products),
    ):
        _run_table_check(table_name, check)

except Exception as e:
    print("❌ TEST FAILED: An error occurred during the main notebook run.")
    print("\n----- ERROR DETAILS -----")
    print(e)
    # Record the failure so it shows up in the summary and fails the run.
    test_results['main_notebook_run'] = f'FAILED: {e}'

finally:
    # --- CLEANUP ---
    print("\n--- FINAL TEST SUMMARY ---")
    for table, result in test_results.items():
        print(f"- {table}: {result}")

    print(f"\nCleaning up: Dropping temporary schema '{temp_catalog}.{temp_schema}'...")
    spark.sql(f"DROP SCHEMA IF EXISTS {temp_catalog}.{temp_schema} CASCADE")
    print("Cleanup complete.")

# Surface failures to whatever executes this notebook (job, CI, caller); without
# this the run would be reported as successful even when checks failed.
failed_checks = [name for name, result in test_results.items() if result != 'PASSED']
if failed_checks:
    raise AssertionError(f"Silver layer tests failed: {', '.join(failed_checks)}")
