In [0]:

from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, trim, initcap, when, lit, year, month, dayofmonth, concat, concat_ws
import time

In [0]:
# Catalog and schema names interpolated into the fully-qualified table names
# used by the cells below.
catalog_name = "retail_dev"
schema_bronze = "bronze"        # schema the raw table is read from
schema_silver = "silver"        # schema the cleaned table is merged into
schema_auditoria = "auditoria"  # schema holding the ingestion_log table

In [0]:
# Build the silver-layer returns DataFrame from the bronze raw table:
#   * cast id_devolucion / id_venta to bigint and trim the free-text `motivo`,
#   * parse fecha_devolucion from either `yyyy-MM-dd` or `dd/MM/yyyy`
#     (any value matching neither shape becomes NULL),
#   * stamp updated_at with the current timestamp,
#   * drop rows whose id_devolucion is NULL, then keep one row per id_devolucion.
# NOTE(review): no ordering is applied before dropDuplicates, so which of several
# rows sharing an id_devolucion survives is presumably arbitrary — confirm that
# is acceptable.
silver_devoluciones = (
    spark.table(f"{catalog_name}.{schema_bronze}.devoluciones_raw")
    .withColumn("id_devolucion", F.col("id_devolucion").cast("bigint"))
    .withColumn("id_venta", F.col("id_venta").cast("bigint"))
    .withColumn("motivo", F.trim(F.col("motivo")))
    .withColumn(
        "fecha_devolucion",
        # Raw strings: `\d` inside a normal string literal is an invalid Python
        # escape sequence (warning today, error in a future release). The
        # pattern text handed to Spark is unchanged.
        F.when(F.col("fecha_devolucion").rlike(r"^\d{4}-\d{2}-\d{2}$"),
               F.to_date("fecha_devolucion", "yyyy-MM-dd"))
         .when(F.col("fecha_devolucion").rlike(r"^\d{2}/\d{2}/\d{4}$"),
               F.to_date("fecha_devolucion", "dd/MM/yyyy"))
         .otherwise(F.lit(None))
    )
    .withColumn("updated_at", F.current_timestamp())
    .dropna(subset=["id_devolucion"])
    .dropDuplicates(["id_devolucion"])
)

In [0]:
# Preview the transformed DataFrame. `display` is not defined in this file —
# presumably the notebook-provided rendering helper.
display(silver_devoluciones)

In [0]:
# Create the silver Delta table if it does not exist yet (IF NOT EXISTS makes
# re-running this cell a no-op once the table is there).
# NOTE(review): fecha_devolucion is declared TIMESTAMP here, while the
# DataFrame merged into this table builds that column with F.to_date — confirm
# whether DATE was intended.
create_devoluciones_ddl = f"""
   CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_silver}.devoluciones (
    id_devolucion BIGINT,
    id_venta BIGINT,
    motivo STRING,
    fecha_devolucion TIMESTAMP,
    updated_at TIMESTAMP
   ) USING DELTA
   """
spark.sql(create_devoluciones_ddl)

In [0]:
# Register the DataFrame as a temp view so the SQL MERGE below can reference it.
silver_devoluciones.createOrReplaceTempView("silver_devoluciones")

# Upsert keyed on id_devolucion: matched rows are overwritten only when the
# source updated_at is newer than the target's; unmatched rows are inserted.
merge_devoluciones_sql = f"""
MERGE INTO {catalog_name}.{schema_silver}.devoluciones tgt
USING silver_devoluciones src
ON tgt.id_devolucion = src.id_devolucion
WHEN MATCHED AND src.updated_at > tgt.updated_at THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
"""

# Wall-clock timing around the MERGE; merge_start / merge_end are read again
# when the audit row is written.
merge_start = time.time()
spark.sql(merge_devoluciones_sql)
merge_end = time.time()

In [0]:
# Print the first 10 rows of the silver table as a quick check.
devoluciones_silver_tbl = spark.table(f"{catalog_name}.{schema_silver}.devoluciones")
devoluciones_silver_tbl.show(10)

In [0]:
# Identifiers written to the audit log. Both time-based ids come from a single
# clock reading so they always describe the same instant (job_run_id // 100 ==
# job_id); two separate time.time() calls could straddle a second boundary.
# `time` is already imported in the notebook's import cell.
_audit_epoch = time.time()
job_id = int(_audit_epoch)            # unique id based on the epoch second
job_run_id = int(_audit_epoch * 100)  # finer-grained: hundredths of a second
task_run_id = 1                       # fixed value; make it sequential if preferred


In [0]:
# Append one row to the audit table describing this run.
# The timer starts BEFORE the count() action: previously it was started
# immediately before the f-string that reads it, so the elapsed-seconds value
# written to the log was always 0.
start = time.time()
rows_in = silver_devoluciones.count()
# VALUES are positional — the ingestion_log column list is not visible in this
# notebook. NOTE(review): confirm the order against the table definition, and
# that both timestamp columns and both row-count columns are really meant to
# receive the same value.
spark.sql(f"""
INSERT INTO {catalog_name}.{schema_auditoria}.ingestion_log VALUES (
  {job_id},{job_run_id},{task_run_id},
  current_timestamp(), current_timestamp(),
  {int(time.time()-start)},
  'SUCCESS','devoluciones','silver',
  {rows_in},{rows_in},0,0,0,{int(merge_end-merge_start)},current_timestamp()
)
""")


In [0]:
# Inspect the current contents of the audit log table.
ingestion_log_tbl = spark.table(f"{catalog_name}.{schema_auditoria}.ingestion_log")
display(ingestion_log_tbl)
