In [0]:
## Imports
from pyspark.sql.functions import *
from pyspark.sql import DataFrame

In [0]:
## 2. Widgets
# Notebook parameters, as widget name -> default value. All existing widgets
# are removed first, then these three text widgets are (re)created.
widget_defaults = {
    "catalogName": "catalog_supermarket",
    "schemaBronze": "bronze",
    "schemaSilver": "silver",
}

# NOTE(review): the Databricks widget docs caution against creating widgets in
# the same cell that removes them - consider moving removeAll() to its own
# cell; confirm against the runtime in use.
dbutils.widgets.removeAll()

for widget_name, default_value in widget_defaults.items():
    dbutils.widgets.text(widget_name, default_value)

# Echo the current widget values; names are left-padded to 12 chars so the
# colons line up.
for widget_name in widget_defaults:
    print(f"{widget_name:<12}:", dbutils.widgets.get(widget_name))


In [0]:
##3. Constants
# Read each widget once so the rest of the notebook works with plain variables.
catalog, bronze_schema, silver_schema = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("catalogName", "schemaBronze", "schemaSilver")
)


In [0]:
# Fully qualified table names in the form <catalog>.<schema>.<table>.
# The three source tables are read from the bronze schema.
table_orders_bronze          = f"{catalog}.{bronze_schema}.orders"
table_op_prior_bronze        = f"{catalog}.{bronze_schema}.order_products_prior"
table_op_train_bronze        = f"{catalog}.{bronze_schema}.order_products_train"
# The output table belongs in the silver schema. It was previously built from
# bronze_schema, which wrote the result into bronze and left the schemaSilver
# widget unused.
table_order_products_silver  = f"{catalog}.{silver_schema}.order_products"

In [0]:
##4. Read sources
# Load the bronze inputs: orders, then the "prior" and "train" order-product tables.
df_orders_bronze, df_op_prior, df_op_train = (
    spark.table(source_table)
    for source_table in (
        table_orders_bronze,
        table_op_prior_bronze,
        table_op_train_bronze,
    )
)

# Stack prior + train rows; unionByName matches columns by name, not position.
df_op_all = df_op_prior.unionByName(df_op_train)

# Show a 100-row sample of the combined order-product rows.
print("order_products_all (prior + train)")
df_op_all_preview = df_op_all.limit(100)
display(df_op_all_preview)


In [0]:
##5. Transform - order products joined with their order attributes
orders = df_orders_bronze.alias("o")
op     = df_op_all.alias("op")

# Output columns, in order. Order-level fields are taken from the "o" alias,
# line-item fields from the "op" alias; every column keeps its original name.
order_level_columns = [
    "order_id",
    "user_id",
    "eval_set",
    "order_number",
    "order_dow",
    "order_hour_of_day",
    "days_since_prior_order",
]
line_item_columns = ["product_id", "add_to_cart_order", "reordered"]

selected_columns = [col(f"o.{name}").alias(name) for name in order_level_columns]
selected_columns += [col(f"op.{name}").alias(name) for name in line_item_columns]

# Inner join on order_id: order-product rows without a matching order are dropped.
df_joined = op.join(orders, on="order_id", how="inner")
df_order_products_silver = df_joined.select(*selected_columns)

# Print the resulting schema and show a 20-row sample.
df_order_products_silver.printSchema()
display(df_order_products_silver.limit(20))


In [0]:
##Save
# Persist the joined result to the table named by table_order_products_silver;
# "overwrite" mode replaces any data already stored there.
df_order_products_silver.write.saveAsTable(
    table_order_products_silver,
    mode="overwrite",
)


In [0]:
# Report which table was written, then read it back and show a 20-row sample.
print(f"Tabla creada/actualizada: {table_order_products_silver}")
df_saved_preview = spark.table(table_order_products_silver).limit(20)
display(df_saved_preview)