In [0]:
## 1. Imports
from pyspark.sql.functions import *
from pyspark.sql.types import *


In [0]:
## 2. Widgets
# Notebook parameters: the catalog name and the silver schema name.
#
# NOTE: the previous `dbutils.widgets.removeAll()` call was dropped on purpose.
# Databricks documents that a widget must not be created in the same cell that
# runs a remove command, and wiping the widgets first also threw away any value
# typed into them, so every interactive re-run silently fell back to the defaults.
# Re-declaring a widget that already exists is expected to keep its current value
# (TODO confirm on the target runtime).
dbutils.widgets.text("catalogName", "catalog_supermarket")
dbutils.widgets.text("schemaSilver", "silver")

# Echo the effective parameter values so they are visible in the run output.
print("catalogName:", dbutils.widgets.get("catalogName"))
print("schemaSilver:", dbutils.widgets.get("schemaSilver"))

In [0]:
## 3. Constants
# Read both widget values in one pass; the later cells build table names from them.
catalog_name, schema_silver = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("catalogName", "schemaSilver")
)


In [0]:
## 4. Read sources

# Fully qualified names (<catalog>.<schema>.<table>) of the silver tables read here.
order_products_prior_table = "{}.{}.order_products_prior".format(catalog_name, schema_silver)
order_products_train_table = "{}.{}.order_products_train".format(catalog_name, schema_silver)
product_hierarchy_table = "{}.{}.product_hierarchy".format(catalog_name, schema_silver)

print("Tabla order_products_prior (silver):", order_products_prior_table)
print("Tabla order_products_train (silver):", order_products_train_table)
print("Tabla product_hierarchy (silver)   :", product_hierarchy_table)

# Stack the PRIOR and TRAIN order lines into a single DataFrame.
# unionByName matches columns by name rather than by position.
prior_lines = spark.table(order_products_prior_table)
train_lines = spark.table(order_products_train_table)
df_order_products = prior_lines.unionByName(train_lines)

df_product_hierarchy = spark.table(product_hierarchy_table)

# Quick look at the first rows of each input.
print("⚙ Preview df_order_products (silver unificada):")
display(df_order_products.limit(5))

print("⚙ Preview df_product_hierarchy:")
display(df_product_hierarchy.limit(5))




In [0]:

# Show the first five rows of each silver input, one labelled preview per DataFrame.
for preview_title, preview_df in (
    ("Ordenes-producto (silver):", df_order_products),
    ("Jerarquía de productos (silver):", df_product_hierarchy),
):
    print(preview_title)
    display(preview_df.limit(5))

In [0]:
## 5. Transform – Product Hierarchy
# Attach product name, aisle and department to every order line.
op = df_order_products.alias("op")
ph = df_product_hierarchy.alias("ph")

# Left join on product_id: order lines with no matching hierarchy row are kept,
# and their hierarchy columns come back as null.
order_lines_with_hierarchy = op.join(ph, "product_id", "left")

# Output columns, in order; the "op." / "ph." prefix names the side each comes from.
df_order_details = order_lines_with_hierarchy.select(
    *(
        col(qualified_name)
        for qualified_name in (
            # Order / user attributes
            "op.order_id",
            "op.user_id",
            "op.eval_set",
            "op.order_number",
            "op.order_dow",
            "op.order_hour_of_day",
            "op.days_since_prior_order",
            # Product and its hierarchy
            "op.product_id",
            "ph.product_name",
            "ph.aisle_id",
            "ph.aisle",
            "ph.department_id",
            "ph.department",
            # Cart-line attributes
            "op.add_to_cart_order",
            "op.reordered",
        )
    )
)

df_order_details.printSchema()
display(df_order_details.limit(20))


In [0]:
## Save
# Write the enriched order lines to <catalog>.<schema>.order_details,
# replacing whatever the table held before (overwrite mode).
target_table = "{}.{}.order_details".format(catalog_name, schema_silver)

df_order_details.write.saveAsTable(target_table, mode="overwrite")

# Read the table back to confirm the write and show a sample of it.
print(f"Tabla creada/actualizada: {target_table}")
display(spark.table(target_table).limit(20))
