In [0]:
# ============================================
# GOLD INGESTA - NORTHWIND
# ============================================

from pyspark.sql.functions import col

# --------------------------------------------
# CONFIGURACIÓN
# --------------------------------------------
# Catalog that holds both layers, plus the source (silver) and target (gold) schemas.
catalog = "northwind"
silver_schema, gold_schema = "silver", "gold"

# Set the session defaults: current catalog first, then gold as the current schema.
spark.sql("USE CATALOG " + catalog)
spark.sql("USE SCHEMA " + gold_schema)

# --------------------------------------------
# FUNCIONES ÚTILES
# --------------------------------------------
def save_gold(df, table, schema=None, overwrite_schema=False):
    """Overwrite a gold-layer Delta table with the contents of ``df``.

    Args:
        df: Spark DataFrame to persist.
        table: Unqualified table name, e.g. ``"dim_products"``.
        schema: Schema to write into. When omitted, the notebook-level
            ``gold_schema`` setting is used.
        overwrite_schema: When True, set Delta's ``overwriteSchema`` option so
            the table schema is replaced together with the data. The default
            (False) does not set the option, so Delta's normal schema checks
            on overwrite still apply.

    Returns:
        None. The table is written as a side effect and a confirmation line
        is printed.
    """
    # Resolve the schema at call time so two-argument calls keep following the
    # notebook-level setting.
    target_schema = gold_schema if schema is None else schema

    writer = df.write.mode("overwrite").format("delta")
    if overwrite_schema:
        writer = writer.option("overwriteSchema", "true")
    writer.saveAsTable(f"{target_schema}.{table}")

    print(f"✅ Gold Table creada: {target_schema}.{table}")

# --------------------------------------------
# 1️⃣ DIMENSIONES
# --------------------------------------------

print("⭐ Creando Dimensiones...")

# Dim Products: the only dimension written with an explicit column list
# rather than every column of its silver table.
dim_products = spark.table(f"{silver_schema}.t_products").select(
    "product_id", "product_name", "supplier_id", "category_id",
    "quantity_per_unit", "unit_price", "discontinued",
    "ingest_date", "source_file", "load_id"
)
save_gold(dim_products, "dim_products")

# The remaining dimensions are copied from silver as-is: silver table
# `t_<name>` becomes gold table `dim_<name>`, with every column kept.
passthrough_dimensions = ["customers", "employees", "shippers", "suppliers", "categories"]
for dimension in passthrough_dimensions:
    save_gold(spark.table(f"{silver_schema}.t_{dimension}"), f"dim_{dimension}")

print("✅ Dimensiones completadas ✅")

# --------------------------------------------
# 2️⃣ FACT TABLE - fact_orders
# --------------------------------------------

print("⭐ Creando Fact Table: fact_orders...")

# Both silver tables carry the same audit columns; suffix them per source so
# they stay distinguishable after the join.
audit_columns = ("ingest_date", "source_file", "load_id")

orders = spark.table(f"{silver_schema}.t_orders")
order_details = spark.table(f"{silver_schema}.t_order_details")
for audit_name in audit_columns:
    orders = orders.withColumnRenamed(audit_name, f"{audit_name}_order")
    order_details = order_details.withColumnRenamed(audit_name, f"{audit_name}_order_detail")

# Columns kept in the fact table, in output order.
fact_order_columns = [
    "order_id",
    "order_date",
    "required_date",
    "shipped_date",
    "customer_id",
    "employee_id",
    "product_id",
    "unit_price",
    "quantity",
    "discount",
    "freight",
    "ship_via",
    "ingest_date_order",
    "ingest_date_order_detail",
    "source_file_order",
    "source_file_order_detail",
    "load_id_order",
    "load_id_order_detail",
]

# Inner join on order_id: only detail rows with a matching order (and orders
# with at least one detail row) reach the fact table.
# NOTE(review): freight presumably comes from t_orders and would then repeat on
# every detail row of the same order - confirm before summing it.
fact_orders = (
    order_details
    .join(orders, on="order_id", how="inner")
    .select(*fact_order_columns)
)

save_gold(fact_orders, "fact_orders")

print("🚀 Ingesta GOLD completa ✅")


In [0]:
# ============================================
# GOLD - Crear Fact Wide Table
# ============================================

from pyspark.sql.functions import concat_ws, col

# --------------------------------------------
# CONFIGURACIÓN
# --------------------------------------------
catalog, gold_schema = "northwind", "gold"

# Set the current catalog, then make gold the current schema.
spark.sql(f"USE CATALOG {catalog}")
spark.sql(f"USE SCHEMA {gold_schema}")

# --------------------------------------------
# READ GOLD TABLES
# --------------------------------------------
def _read_gold(table_name):
    """Return the gold-schema table ``table_name`` as a DataFrame."""
    return spark.table(f"{gold_schema}.{table_name}")


fact_orders = _read_gold("fact_orders")
dim_customers = _read_gold("dim_customers")
dim_employees = _read_gold("dim_employees")
dim_products = _read_gold("dim_products")
dim_categories = _read_gold("dim_categories")
dim_suppliers = _read_gold("dim_suppliers")
dim_shippers = _read_gold("dim_shippers")

# --------------------------------------------
# ENRIQUECER FACT TABLE CON JOINs
# --------------------------------------------

# Build one narrow lookup frame per dimension, holding only the join key and
# the descriptive columns the wide table needs. Each lookup column gets its
# final name here, so three different `company_name` columns never coexist.
customer_names = dim_customers.select(
    "customer_id", col("company_name").alias("customer_company_name")
)

# The full name is assembled on the dimension side, before the join.
# concat_ws skips NULL inputs and returns '' when all of them are NULL, so
# building it after a left join would give unmatched fact rows an empty string
# instead of NULL. Built here, an unmatched employee_id yields NULL, like every
# other looked-up column.
employee_names = dim_employees.select(
    "employee_id",
    concat_ws(" ", col("first_name"), col("last_name")).alias("employee_full_name"),
)

product_attributes = dim_products.select(
    "product_id", "product_name", "category_id", "supplier_id"
)
category_names = dim_categories.select("category_id", "category_name")
supplier_names = dim_suppliers.select(
    "supplier_id", col("company_name").alias("supplier_company_name")
)
shipper_names = dim_shippers.select(
    "shipper_id", col("company_name").alias("shipper_company_name")
)

# Left joins keep every fact row even when a key has no match in a dimension.
# Products must be joined before categories and suppliers because it supplies
# category_id and supplier_id. `ship_via` is renamed so it can be joined to the
# shippers key by name.
# NOTE(review): assumes each dimension key is unique; duplicate keys would
# multiply fact rows - confirm upstream.
df = (
    fact_orders
    .withColumnRenamed("ship_via", "shipper_id")
    .join(customer_names, on="customer_id", how="left")
    .join(employee_names, on="employee_id", how="left")
    .join(product_attributes, on="product_id", how="left")
    .join(category_names, on="category_id", how="left")
    .join(supplier_names, on="supplier_id", how="left")
    .join(shipper_names, on="shipper_id", how="left")
)

# --------------------------------------------
# SELECCIONAR COLUMNAS FINALES
# --------------------------------------------
# Output columns grouped by subject; concatenating the groups in this order
# gives the column order of the wide table.
order_cols = ["order_id", "order_date", "required_date", "shipped_date"]
customer_cols = ["customer_id", "customer_company_name"]
employee_cols = ["employee_id", "employee_full_name"]
product_cols = [
    "product_id", "product_name",
    "category_id", "category_name",
    "supplier_id", "supplier_company_name",
]
line_cols = ["unit_price", "quantity", "discount", "freight"]
shipper_cols = ["shipper_id", "shipper_company_name"]
# Audit columns
audit_cols = [
    "ingest_date_order", "ingest_date_order_detail",
    "source_file_order", "source_file_order_detail",
    "load_id_order", "load_id_order_detail",
]

final_cols = (
    order_cols
    + customer_cols
    + employee_cols
    + product_cols
    + line_cols
    + shipper_cols
    + audit_cols
)

# Project to exactly the final columns, dropping anything else picked up by the joins.
df = df.select(*final_cols)

# --------------------------------------------
# GUARDAR EN GOLD
# --------------------------------------------
# Replace the wide fact table in the gold schema with the enriched result.
wide_table_name = f"{gold_schema}.fact_orders_wide"
df.write.format("delta").mode("overwrite").saveAsTable(wide_table_name)

print("✅ Wide Table creada en GOLD: fact_orders_wide 🚀")
