In [0]:

from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, trim, initcap, when, lit, year, month, dayofmonth, concat, concat_ws
import time
from delta.tables import DeltaTable

In [0]:
%sql
-- Run this in a SQL cell to empty the tables.
-- WARNING: TRUNCATE removes every row from the Silver sales table and from both
-- Bronze raw tables.
-- NOTE(review): the catalog `retail_dev` is hard-coded here, while the Python cells
-- build table names from {catalog_name}; confirm both point at the same catalog.
-- NOTE(review): on a "Run All", this cell empties the Bronze tables that the
-- following cells read from; presumably it is meant to be run manually before a
-- fresh ingestion — confirm.
TRUNCATE TABLE retail_dev.silver.ventas;
TRUNCATE TABLE retail_dev.bronze.ventas_base_raw;
TRUNCATE TABLE retail_dev.bronze.ventas_incremento_raw;

In [0]:
# Post-ingestion check of the Bronze incremental table: list every
# (id_venta, id_producto) pair that appears more than once.
tabla_bronze_incremento = f"{catalog_name}.{schema_bronze}.ventas_incremento_raw"
conteo_por_clave_incremento = (
    spark.table(tabla_bronze_incremento)
    .groupBy(["id_venta", "id_producto"])
    .count()
)
display(conteo_por_clave_incremento.filter(F.col("count") > 1))

In [0]:
# Post-ingestion check of the Bronze base table: list every
# (id_venta, id_producto) pair that appears more than once.
tabla_bronze_base = f"{catalog_name}.{schema_bronze}.ventas_base_raw"
conteo_por_clave_base = (
    spark.table(tabla_bronze_base)
    .groupBy(["id_venta", "id_producto"])
    .count()
)
display(conteo_por_clave_base.filter(F.col("count") > 1))

In [0]:
# Build the cleaned Silver DataFrame from the Bronze base load:
# cast identifiers, sanitize quantity/amount/date, derive anio_mes and
# keep one row per (id_venta, id_producto).
silver_ventas_base = (
    spark.table(f"{catalog_name}.{schema_bronze}.ventas_base_raw")
    .withColumn("id_venta", F.col("id_venta").cast("bigint"))
    .withColumn("id_cliente", F.col("id_cliente").cast("bigint"))
    .withColumn("id_tienda", F.col("id_tienda").cast("bigint"))
    .withColumn("id_producto", F.col("id_producto").cast("bigint"))
    # Only all-digit strings are cast: "15" -> 15, "diez" -> NULL.
    .withColumn(
        "cantidad",
        F.when(F.col("cantidad").rlike(r"^[0-9]+$"), F.col("cantidad").cast("int"))
        .otherwise(F.lit(None)),
    )
    # try_cast yields NULL instead of failing on values that are not valid decimals.
    .withColumn("monto", F.expr("try_cast(monto as decimal(18,2))"))
    # Two date layouts are accepted (yyyy-MM-dd and dd/MM/yyyy); anything else -> NULL.
    # Raw strings keep `\d` as a regex token without relying on Python's
    # deprecated handling of unknown escape sequences.
    .withColumn(
        "fecha_venta",
        F.when(
            F.col("fecha_venta").rlike(r"^\d{4}-\d{2}-\d{2}$"),
            F.to_date("fecha_venta", "yyyy-MM-dd"),
        )
        .when(
            F.col("fecha_venta").rlike(r"^\d{2}/\d{2}/\d{4}$"),
            F.to_date("fecha_venta", "dd/MM/yyyy"),
        )
        .otherwise(F.lit(None)),
    )
    # Keep the source timestamp so the later "most recent record" ordering by
    # updated_at is meaningful; previously it was parsed and then immediately
    # overwritten with current_timestamp(). The load time is only a fallback
    # when the source value is missing.
    .withColumn(
        "updated_at",
        F.coalesce(F.to_timestamp("updated_at"), F.current_timestamp()),
    )
    # Derived from the already-parsed fecha_venta, so it is NULL when the date is NULL.
    .withColumn("anio_mes", F.date_format("fecha_venta", "yyyy-MM"))
    .dropna(subset=["id_venta", "id_producto"])
    # NOTE(review): dropDuplicates does not let us choose which duplicate
    # survives; confirm that is acceptable for repeated keys within this table.
    .dropDuplicates(["id_venta", "id_producto"])
)

In [0]:
# Render the cleaned base DataFrame for visual inspection.
display(silver_ventas_base)

In [0]:
# Sanity check: (id_venta, id_producto) pairs occurring more than once in the
# cleaned base DataFrame.
duplicados = (
    silver_ventas_base.groupBy(["id_venta", "id_producto"])
    .count()
    .filter("count > 1")
)
display(duplicados)

In [0]:
# Build the cleaned Silver DataFrame from the Bronze incremental load:
# cast identifiers, sanitize quantity/amount/date, derive anio_mes and
# keep one row per (id_venta, id_producto).
silver_ventas_incremento = (
    spark.table(f"{catalog_name}.{schema_bronze}.ventas_incremento_raw")
    .withColumn("id_venta", F.col("id_venta").cast("bigint"))
    .withColumn("id_cliente", F.col("id_cliente").cast("bigint"))
    .withColumn("id_tienda", F.col("id_tienda").cast("bigint"))
    .withColumn("id_producto", F.col("id_producto").cast("bigint"))
    # Only all-digit strings are cast: "15" -> 15, "diez" -> NULL.
    .withColumn(
        "cantidad",
        F.when(F.col("cantidad").rlike(r"^[0-9]+$"), F.col("cantidad").cast("int"))
        .otherwise(F.lit(None)),
    )
    # try_cast yields NULL instead of failing on values that are not valid decimals.
    .withColumn("monto", F.expr("try_cast(monto as decimal(18,2))"))
    # Two date layouts are accepted (yyyy-MM-dd and dd/MM/yyyy); anything else -> NULL.
    # Raw strings keep `\d` as a regex token without relying on Python's
    # deprecated handling of unknown escape sequences.
    .withColumn(
        "fecha_venta",
        F.when(
            F.col("fecha_venta").rlike(r"^\d{4}-\d{2}-\d{2}$"),
            F.to_date("fecha_venta", "yyyy-MM-dd"),
        )
        .when(
            F.col("fecha_venta").rlike(r"^\d{2}/\d{2}/\d{4}$"),
            F.to_date("fecha_venta", "dd/MM/yyyy"),
        )
        .otherwise(F.lit(None)),
    )
    # Keep the source timestamp so the later "most recent record" ordering by
    # updated_at is meaningful; previously it was parsed and then immediately
    # overwritten with current_timestamp(). The load time is only a fallback
    # when the source value is missing.
    .withColumn(
        "updated_at",
        F.coalesce(F.to_timestamp("updated_at"), F.current_timestamp()),
    )
    # Derived from the already-parsed fecha_venta, so it is NULL when the date is NULL.
    .withColumn("anio_mes", F.date_format("fecha_venta", "yyyy-MM"))
    .dropna(subset=["id_venta", "id_producto"])
    # NOTE(review): dropDuplicates does not let us choose which duplicate
    # survives; confirm that is acceptable for repeated keys within this table.
    .dropDuplicates(["id_venta", "id_producto"])
)

In [0]:
# Render the cleaned incremental DataFrame for visual inspection.
display(silver_ventas_incremento)

In [0]:
# Sanity check: (id_venta, id_producto) pairs occurring more than once in the
# cleaned incremental DataFrame.
duplicados = (
    silver_ventas_incremento.groupBy(["id_venta", "id_producto"])
    .count()
    .filter("count > 1")
)
display(duplicados)

In [0]:
# Create the Silver sales Delta table if it does not exist yet.
# NOTE(review): fecha_venta is declared TIMESTAMP here, while the transformation
# cells build it with F.to_date; confirm the intended column type.
ddl_silver_ventas = f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_silver}.ventas (
  id_venta BIGINT,
  id_cliente BIGINT,
  id_tienda BIGINT,
  id_producto BIGINT,
  cantidad INT,
  monto DECIMAL(18,2),
  fecha_venta TIMESTAMP,
  updated_at TIMESTAMP,
  anio_mes STRING
) USING DELTA
"""
spark.sql(ddl_silver_ventas)

In [0]:
# Upsert the cleaned base + incremental sales into the Silver Delta table.
# merge_start / merge_end bracket the whole operation so that the audit cell
# can report its duration.
merge_start = time.time()

# 1. Union the base and incremental DataFrames, tagging each row with its
#    origin so that ties on updated_at are resolved deterministically.
#    Assumes the incremental load should win over the base load on a tie —
#    TODO confirm with the data owner.
ventas_unificadas = (
    silver_ventas_base.withColumn("_prioridad_origen", F.lit(0))
    .unionByName(
        silver_ventas_incremento.withColumn("_prioridad_origen", F.lit(1))
    )
)

# 2. Window used to find the most recent record per business key
#    (id_venta, id_producto): newest updated_at first, then incremental
#    before base.
window_spec = Window.partitionBy("id_venta", "id_producto").orderBy(
    F.col("updated_at").desc(),
    F.col("_prioridad_origen").desc(),
)

# 3. Deduplicate the unified DataFrame: number the rows inside each key and
#    keep only the first one (rn = 1). Helper columns are dropped afterwards
#    so the source schema matches the target table.
ventas_src_deduplicado = (
    ventas_unificadas
    .withColumn("rn", F.row_number().over(window_spec))
    .where(F.col("rn") == 1)
    .drop("rn", "_prioridad_origen")
)

# 4. Handle to the target Delta table.
delta_target = DeltaTable.forName(spark, f"{catalog_name}.{schema_silver}.ventas")

# 5. Run the MERGE.
#    updated_at and anio_mes are now written on both branches; previously
#    anio_mes was never populated and updated_at was not refreshed on update.
#    NOTE(review): matched rows are always overwritten, even if the target row
#    carries a newer updated_at — confirm whether a recency condition is wanted.
(
    delta_target.alias("target")
    .merge(
        source=ventas_src_deduplicado.alias("source"),
        condition="target.id_venta = source.id_venta AND target.id_producto = source.id_producto"
    )
    .whenMatchedUpdate(
        set={
            "id_cliente": "source.id_cliente",
            "id_tienda": "source.id_tienda",
            "cantidad": "source.cantidad",
            "monto": "source.monto",
            "fecha_venta": "source.fecha_venta",
            "updated_at": "source.updated_at",
            "anio_mes": "source.anio_mes"
        }
    )
    .whenNotMatchedInsert(
        values={
            "id_venta": "source.id_venta",
            "id_cliente": "source.id_cliente",
            "id_tienda": "source.id_tienda",
            "id_producto": "source.id_producto",
            "cantidad": "source.cantidad",
            "monto": "source.monto",
            "fecha_venta": "source.fecha_venta",
            "updated_at": "source.updated_at",
            "anio_mes": "source.anio_mes"
        }
    )
    .execute()
)
merge_end = time.time()

In [0]:
# Print the first 10 rows of the Silver sales table after the merge.
spark.table(f"{catalog_name}.{schema_silver}.ventas").show(10)

In [0]:
# Post-merge check: list up to 20 (id_venta, id_producto) keys that appear
# more than once in the Silver sales table.
consulta_duplicados_silver = f"""
SELECT id_venta, id_producto, COUNT(*) AS veces
FROM {catalog_name}.{schema_silver}.ventas
GROUP BY id_venta, id_producto
HAVING COUNT(*) > 1
"""
spark.sql(consulta_duplicados_silver).show(n=20, truncate=False)


In [0]:
# Identifiers written to the audit log. Both ids derive from a single clock
# read so job_run_id always corresponds to job_id (two separate time.time()
# calls could straddle a second boundary). `time` is imported in the
# notebook's import cell.
_ahora = time.time()
job_id = int(_ahora)            # unique id based on the epoch second
job_run_id = int(_ahora * 100)  # finer-grained: hundredths of a second
task_run_id = 1                 # fixed; could be made sequential if needed


In [0]:
# Write one audit row for this run into the ingestion log.

# Row count of the Silver sales table after the merge; used for both row-count
# fields below.
rows_in = spark.table(f"{catalog_name}.{schema_silver}.ventas").count()

# Duration of the MERGE itself. merge_end is the value captured right after
# the merge cell finished; it is deliberately NOT re-read here, otherwise the
# count above (and any idle time between cells) would be added to it.
merge_seconds = int(merge_end - merge_start)

# Seconds elapsed from the start of the merge until this log entry is written.
# Previously the timer was started immediately before the INSERT string was
# built, so the logged value was always 0.
# NOTE(review): confirm this is the duration the log column is meant to hold.
elapsed_seconds = int(time.time() - merge_start)

# NOTE(review): values are positional (no column list), so their order must
# match the ingestion_log schema, which is not visible in this notebook.
spark.sql(f"""
INSERT INTO {catalog_name}.{schema_auditoria}.ingestion_log VALUES (
  {job_id},{job_run_id},{task_run_id},
  current_timestamp(), current_timestamp(),
  {elapsed_seconds},
  'SUCCESS','ventas','silver',
  {rows_in},{rows_in},0,0,0,{merge_seconds},current_timestamp()
)
""")

In [0]:
# Render the audit log table to verify the entry that was just written.
display(spark.table(f"{catalog_name}.{schema_auditoria}.ingestion_log"))
