In [0]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, trim, initcap, when, lit, year, month, dayofmonth, concat, concat_ws
import time

In [0]:
# Catalog that holds every schema referenced in this notebook.
catalog_name = "retail_dev"

# Schema names for the bronze / silver / gold layers and for the audit log.
schema_bronze, schema_silver, schema_gold = "bronze", "silver", "gold"
schema_auditoria = "auditoria"

In [0]:
# Inputs: the time dimension (gold schema) plus sales and returns (silver schema).
dim_tiempo = spark.table(f"{catalog_name}.{schema_gold}.dim_tiempo")
silver_ventas = spark.table(f"{catalog_name}.{schema_silver}.ventas")
silver_devolucion = spark.table(f"{catalog_name}.{schema_silver}.devoluciones")

# Key used both to discard incomplete rows and to de-duplicate the result.
fact_key = ["id_venta", "id_producto"]

# Output columns, in the order they appear in the fact DataFrame.
fact_columns = [
    "v.id_venta", "v.id_cliente", "v.id_tienda", "v.id_producto",
    "t.id_tiempo", "v.fecha_venta", "v.cantidad", "v.monto",
    "total_venta", "estado", "d.fecha_devolucion", "creation_date",
]

# Derived column expressions.
total_venta_expr = F.col("v.cantidad") * F.col("v.monto")
was_returned = F.col("d.fecha_devolucion").isNotNull()
estado_expr = F.when(was_returned, "devuelto").otherwise("vendido")

# Sales left-joined to returns (by id_venta) and to the dim_tiempo row whose
# `fecha` equals the date part of the sale timestamp.
# NOTE(review): if devoluciones can hold several rows per id_venta, this join fans
# out and the dropDuplicates below keeps an arbitrary one of them -- confirm.
ventas_joined = (
    silver_ventas.alias("v")
    .join(silver_devolucion.alias("d"), on="id_venta", how="left")
    .join(
        dim_tiempo.alias("t"),
        on=F.to_date("v.fecha_venta") == F.col("t.fecha"),
        how="left",
    )
)

# Add the derived columns, project the output columns, then drop rows that lack
# a key value and collapse duplicates on that key.
gold_fact_ventas = (
    ventas_joined
    .withColumn("total_venta", total_venta_expr)
    .withColumn("estado", estado_expr)
    .withColumn("creation_date", F.current_timestamp())
    .select(*fact_columns)
    .dropna(subset=fact_key)
    .dropDuplicates(fact_key)
)



In [0]:
# Preview the assembled fact DataFrame in the notebook output before persisting it.
display(gold_fact_ventas)

In [0]:
# DDL for the Delta table that receives the fact rows. IF NOT EXISTS makes this
# cell safe to re-run once the table is in place.
fact_ventas_ddl = f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_gold}.fact_ventas (
  id_venta BIGINT,
  id_cliente BIGINT,
  id_tienda BIGINT,  
  id_producto BIGINT,  
  id_tiempo BIGINT,
  fecha_venta TIMESTAMP,
  cantidad INT,
  monto DECIMAL(18,2),
  total_venta DECIMAL(29,2),
  estado STRING,
  fecha_devolucion TIMESTAMP,
  creation_date TIMESTAMP
) USING DELTA
"""
spark.sql(fact_ventas_ddl)

In [0]:
# Expose the DataFrame to SQL under the name the MERGE statement refers to.
gold_fact_ventas.createOrReplaceTempView("gold_fact_ventas")
merge_target = f"{catalog_name}.{schema_gold}.fact_ventas"

# Upsert keyed on (id_venta, id_producto): matched rows are overwritten only when
# the source row carries a newer creation_date; unmatched rows are inserted.
# merge_start / merge_end bracket the statement so its duration can be logged.
# NOTE(review): creation_date is stamped with current_timestamp() when the source
# DataFrame is evaluated, so the "newer" condition presumably holds for every
# matched row on a re-run -- confirm that is intended.
merge_start = time.time()
spark.sql(f"""
MERGE INTO {merge_target} tgt
USING gold_fact_ventas src
ON tgt.id_venta = src.id_venta and tgt.id_producto = src.id_producto
WHEN MATCHED AND src.creation_date > tgt.creation_date THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
""")
merge_end = time.time()

In [0]:
# Sanity check: print 10 rows of the target table.
spark.table(f"{catalog_name}.{schema_gold}.fact_ventas").show(10)

In [0]:
import time

# Identifiers written to the audit log, derived from the wall clock.
job_id = int(time.time())  # unique ID based on the epoch timestamp (whole seconds)
job_run_id = int(time.time() * 100)  # finer-grained: hundredths of a second
task_run_id = 1  # constant here; could be made sequential if preferred


In [0]:
# Write one audit row for this run into auditoria.ingestion_log.
# NOTE(review): count() re-evaluates the whole gold_fact_ventas plan; the same
# figure is logged in two adjacent positions (presumably rows in / rows out --
# confirm against the ingestion_log schema, which is not visible here).
rows_in = gold_fact_ventas.count()

# Measure the run from the moment the MERGE was launched. Starting the timer
# here and reading it on the next statement would always log a duration of 0.
start = merge_start
elapsed_seconds = int(time.time() - start)
merge_seconds = int(merge_end - merge_start)

# Values are positional, so their order must match the table's column order.
spark.sql(f"""
INSERT INTO {catalog_name}.{schema_auditoria}.ingestion_log VALUES (
  {job_id},{job_run_id},{task_run_id},
  current_timestamp(), current_timestamp(),
  {elapsed_seconds},
  'SUCCESS','fact_ventas','gold',
  {rows_in},{rows_in},0,0,0,{merge_seconds},current_timestamp()
)
""")

In [0]:
# Show the audit table so the row just inserted can be inspected.
display(spark.table(f"{catalog_name}.{schema_auditoria}.ingestion_log"))


In [0]:

# Maintenance statement for the fact table: OPTIMIZE with ZORDER on the same two
# columns the MERGE matches on (id_venta, id_producto).
sql_command = f"""
  OPTIMIZE {catalog_name}.{schema_gold}.fact_ventas
  ZORDER BY (id_venta, id_producto)
"""

In [0]:
# Execute the statement held in sql_command.
spark.sql(sql_command)

In [0]:
# Vacuum the table this notebook actually writes and optimizes (gold.fact_ventas).
# The statement previously targeted gold.dim_tiendas, which this notebook never
# touches, so the fact table's stale files were never cleaned up.
spark.sql(f"VACUUM {catalog_name}.{schema_gold}.fact_ventas")