In [0]:
# Notebook parameters.
# Each parameter is registered as a text widget with an empty default and then
# read back with dbutils.widgets.get(). The result of dbutils.widgets.text() is
# not the parameter value, so it is no longer assigned to the variables.
for _widget_name in (
    "catalog_name",
    "bronze_schema",
    "silver_schema",
    "gold_schema",
    "input_table_producto",
    "output_table_producto",
):
    dbutils.widgets.text(_widget_name, "")

catalog_name = dbutils.widgets.get("catalog_name")
bronze_schema = dbutils.widgets.get("bronze_schema")
silver_schema = dbutils.widgets.get("silver_schema")
gold_schema = dbutils.widgets.get("gold_schema")
input_table_producto = dbutils.widgets.get("input_table_producto")
output_table_producto = dbutils.widgets.get("output_table_producto")

In [0]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, trim, initcap, when, lit, year, month, dayofmonth, concat, concat_ws
from delta.tables import DeltaTable
from pyspark.sql.utils import AnalysisException
import time

In [0]:
# Schema that holds the audit tables (used below for the ingestion_log table).
schema_auditoria = "auditoria"

In [0]:
# Stage the product dimension from the silver product table:
# keep the four dimension attributes, stamp each row with the current
# timestamp and keep a single row per id_producto.
silver_producto = spark.table(f"{catalog_name}.{silver_schema}.{input_table_producto}")
dim_columns = ["id_producto", "nombre", "categoria", "precio"]

gold_dim_producto = (
    silver_producto
    .select(*dim_columns)
    .withColumn("creation_date", F.current_timestamp())
    # NOTE(review): no ordering is applied before de-duplicating, so which of
    # several rows sharing an id_producto is kept is not fixed by this code.
    .dropDuplicates(subset=["id_producto"])
)



In [0]:
# Render the staged product dimension DataFrame in the notebook output for inspection.
display(gold_dim_producto)

In [0]:
# Make sure the gold product dimension exists as a Delta table before merging
# into it; the statement is a no-op when the table is already there.
create_dim_producto_sql = f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{gold_schema}.{output_table_producto} (
  id_producto BIGINT,
  nombre STRING,
  categoria STRING,
  precio DECIMAL(18,2),
  creation_date TIMESTAMP
) USING DELTA
"""
spark.sql(create_dim_producto_sql)

In [0]:

# Upsert the staged product rows into the gold Delta table, matching on
# id_producto, and time the merge with wall-clock timestamps
# (merge_start / merge_end are used later for the audit record).
delta_target = DeltaTable.forName(
    spark, f"{catalog_name}.{gold_schema}.{output_table_producto}"
)

merge_start = time.time()
(
    delta_target.alias("tgt")
    .merge(
        source=gold_dim_producto.alias("src"),
        condition="tgt.id_producto = src.id_producto",
    )
    # NOTE(review): src.creation_date is filled with current_timestamp() when
    # the source DataFrame is built, so this condition looks like it holds for
    # every matched row whose tgt.creation_date is non-null - confirm that
    # rewriting all matched rows (including creation_date) on each run is intended.
    .whenMatchedUpdateAll(condition="src.creation_date > tgt.creation_date")
    # Rows whose id_producto is not yet in the target are inserted as-is.
    .whenNotMatchedInsertAll()
    .execute()
)
merge_end = time.time()

In [0]:
# Print the first 10 rows of the gold product dimension after the merge.
dim_producto_result = spark.table(f"{catalog_name}.{gold_schema}.{output_table_producto}")
dim_producto_result.show(n=10)

In [0]:
import time

# Identifiers written to the audit log for this run.
task_run_id = 1  # fixed value; could be made sequential if preferred

# Unique id derived from the current epoch time, in whole seconds.
job_id = int(time.time())
# Finer-grained id: current epoch time in hundredths of a second.
job_run_id = int(time.time() * 100)


In [0]:
# Append one audit record for this run to the ingestion_log table.
# Values are passed positionally, so their order must match the column order
# of ingestion_log (not visible from this notebook).
#
# NOTE(review): gold_dim_producto is counted here, after the merge has run;
# the count re-evaluates the DataFrame rather than reusing the merged rows -
# confirm this is the figure that should be logged for both row counts.
rows_in = gold_dim_producto.count()

# NOTE(review): `start` is captured immediately before the elapsed time is
# computed, so the logged duration is effectively always 0 - confirm what
# interval this column is meant to measure.
start = time.time()
elapsed_seconds = int(time.time() - start)

# Wall-clock duration of the merge step, in whole seconds.
merge_seconds = int(merge_end - merge_start)

audit_insert_sql = f"""
INSERT INTO {catalog_name}.{schema_auditoria}.ingestion_log VALUES (
  {job_id},{job_run_id},{task_run_id},
  current_timestamp(), current_timestamp(),
  {elapsed_seconds},
  'SUCCESS','dim_producto','gold',
  {rows_in},{rows_in},0,0,0,{merge_seconds},current_timestamp()
)
"""
spark.sql(audit_insert_sql)

In [0]:
# Show the audit log table, including the record written for this run.
ingestion_log_df = spark.table(f"{catalog_name}.{schema_auditoria}.ingestion_log")
display(ingestion_log_df)


In [0]:

# Build the OPTIMIZE statement for the gold product dimension, Z-ordering by
# the merge key id_producto.
# The table name comes from the output_table_producto parameter so that the
# statement targets the same table this notebook creates and merges into,
# instead of a hard-coded name that may not match the parameter.
sql_command = f"""
  OPTIMIZE {catalog_name}.{gold_schema}.{output_table_producto}
  ZORDER BY (id_producto)
"""

In [0]:
# Execute the SQL statement held in sql_command.
spark.sql(sql_command)


In [0]:
# Run VACUUM on the gold product dimension.
# The table name comes from the output_table_producto parameter so that the
# statement targets the same table this notebook creates and merges into,
# instead of a hard-coded name that may not match the parameter.
spark.sql(f"VACUUM {catalog_name}.{gold_schema}.{output_table_producto}")