In [0]:
# Notebook parameters.
# `dbutils.widgets.text` only registers the widget (name + default value); the
# value itself is read with `dbutils.widgets.get`, so the registration result is
# not assigned to anything.
_REQUIRED_WIDGETS = (
    "catalog_name",
    "bronze_schema",
    "silver_schema",
    "input_table_name",
    "output_table_name",
)
for _widget_name in _REQUIRED_WIDGETS:
    dbutils.widgets.text(_widget_name, "")

catalog_name = dbutils.widgets.get("catalog_name")
bronze_schema = dbutils.widgets.get("bronze_schema")
silver_schema = dbutils.widgets.get("silver_schema")
input_table_name = dbutils.widgets.get("input_table_name")
output_table_name = dbutils.widgets.get("output_table_name")

# Every parameter is interpolated into a SQL identifier further down, so an
# empty value would only surface later as an obscure SQL parse error.
# Fail here instead, naming the parameters that still need a value.
_missing_widgets = [
    name for name in _REQUIRED_WIDGETS if not dbutils.widgets.get(name).strip()
]
if _missing_widgets:
    raise ValueError(
        "Missing required notebook parameter(s): " + ", ".join(_missing_widgets)
    )

In [0]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, trim, initcap, when, lit, year, month, dayofmonth, concat, concat_ws
from delta.tables import DeltaTable
from pyspark.sql.utils import AnalysisException
import time

In [0]:
# Name of the schema (inside the configured catalog) that holds the audit log table.
schema_auditoria = "auditoria"

In [0]:
# Create the catalog named by the `catalog_name` parameter if it is not there yet;
# the schemas and tables created below all live inside it.
sql_catalog = f"""
    CREATE CATALOG IF NOT EXISTS {catalog_name}
"""
spark.sql(sql_catalog)

In [0]:
# Create the silver schema (from the `silver_schema` parameter) inside the catalog
# if it does not exist yet.
sql_silver = f"""
    CREATE SCHEMA IF NOT EXISTS {catalog_name}.{silver_schema}
"""
spark.sql(sql_silver)

In [0]:
# Create the audit schema (`schema_auditoria`) inside the catalog if it does not
# exist yet; the ingestion log table is created in it.
sql_auditoria = f"""
    CREATE DATABASE IF NOT EXISTS {catalog_name}.{schema_auditoria}
"""
spark.sql(sql_auditoria)

In [0]:
# Create the audit table if it does not exist.
# One row is appended per run with job identifiers, timings, status, the table
# and layer processed, row counts, file size and merge duration.
ingestion_log_ddl = f"""
  CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_auditoria}.ingestion_log (
    job_id BIGINT,
    job_run_id BIGINT,
    task_run_id BIGINT,
    job_start_time TIMESTAMP,
    job_end_time TIMESTAMP,
    job_duration_seconds BIGINT,
    job_status STRING,
    table STRING,
    layer STRING,
    rows_in BIGINT,
    rows_inserted BIGINT,
    rows_updated BIGINT,
    rows_deleted BIGINT,
    file_bytes BIGINT,
    merge_duration_seconds BIGINT,
    creation_date TIMESTAMP
) USING DELTA
"""
spark.sql(ingestion_log_ddl)

In [0]:
# Build the cleaned ("silver") customer DataFrame from the bronze input table.
#
# Per-column rules:
#   id_cliente     -> cast to BIGINT; rows whose id is NULL after the cast are dropped
#   nombre, ciudad -> trimmed and title-cased
#   email          -> kept only when it contains "@", then trimmed and lower-cased
#                     (title-casing an address, as done for names, corrupts it);
#                     anything else becomes NULL
#   fecha_registro -> parsed from "yyyy-MM-dd" or "dd/MM/yyyy"; any other shape -> NULL
#   updated_at     -> timestamp of this run
#
# NOTE(review): dropDuplicates keeps an arbitrary row per id_cliente; if a specific
# row (e.g. the most recent) must win, an explicit ordering is needed.
silver_clientes = (
    spark.table(f"{catalog_name}.{bronze_schema}.{input_table_name}")
    .withColumn("id_cliente", F.col("id_cliente").cast("bigint"))
    .withColumn("nombre", F.initcap(F.trim(F.col("nombre"))))
    .withColumn(
        "email",
        F.when(
            F.col("email").contains("@"),
            F.lower(F.trim(F.col("email"))),
        ).otherwise(F.lit(None)),
    )
    .withColumn("ciudad", F.initcap(F.trim(F.col("ciudad"))))
    .withColumn(
        "fecha_registro",
        # Raw strings: "\d" in a normal string literal is an invalid escape sequence.
        F.when(
            F.col("fecha_registro").rlike(r"^\d{4}-\d{2}-\d{2}$"),
            F.to_date("fecha_registro", "yyyy-MM-dd"),
        )
        .when(
            F.col("fecha_registro").rlike(r"^\d{2}/\d{2}/\d{4}$"),
            F.to_date("fecha_registro", "dd/MM/yyyy"),
        )
        .otherwise(F.lit(None)),
    )
    .withColumn("updated_at", F.current_timestamp())
    .dropna(subset=["id_cliente"])
    .dropDuplicates(["id_cliente"])
)

In [0]:
# Create the target (silver) table if it does not exist.
# The table name comes from the `output_table_name` parameter because that is the
# table the merge cell opens with DeltaTable.forName; a hard-coded name here would
# leave the merge target missing whenever the parameter is not "clientes".
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{silver_schema}.{output_table_name} (
  id_cliente BIGINT,
  nombre STRING,
  email STRING,
  ciudad STRING,
  fecha_registro DATE,
  updated_at TIMESTAMP
) USING DELTA
""")

In [0]:

# Upsert the cleaned customers into the silver Delta table, keyed on id_cliente.
# merge_start / merge_end bracket the MERGE so its wall-clock duration can be logged.
#
# NOTE(review): src.updated_at is filled with current_timestamp() when
# silver_clientes is built, so the matched-update condition looks like it holds
# for every matched row whose target updated_at is not NULL - confirm that is intended.
target_table_name = f"{catalog_name}.{silver_schema}.{output_table_name}"
delta_target = DeltaTable.forName(spark, target_table_name)

merge_start = time.time()
(
    delta_target.alias("tgt")
    .merge(
        source=silver_clientes.alias("src"),
        condition="tgt.id_cliente = src.id_cliente",
    )
    .whenMatchedUpdateAll(condition="src.updated_at > tgt.updated_at")
    .whenNotMatchedInsertAll()
    .execute()
)
merge_end = time.time()

In [0]:
# Print the first 10 rows of the silver output table as a quick visual check.
silver_output_preview = spark.table(f"{catalog_name}.{silver_schema}.{output_table_name}")
silver_output_preview.show(10)

In [0]:
# Synthetic run identifiers for the audit log, derived from the wall clock.
# `time` is already imported in the notebook's import cell.
# The clock is read once so both ids describe the same instant
# (job_run_id // 100 == job_id).
_run_epoch_seconds = time.time()
job_id = int(_run_epoch_seconds)            # unique id based on the timestamp (whole seconds)
job_run_id = int(_run_epoch_seconds * 100)  # finer-grained (hundredths of a second)
task_run_id = 1                             # or sequential, if preferred


In [0]:
# Append one audit row describing this run's MERGE into the silver table.
rows_in = silver_clientes.count()

# Row counts come from the Delta commit written by the MERGE instead of assuming
# every source row was inserted and none updated.
# With an empty source the MERGE changes nothing, so the counts stay 0 and the
# table history (whose latest commit could belong to an earlier run) is not read.
# Assumes no other writer committed to the table between the MERGE and this cell.
rows_inserted = rows_updated = rows_deleted = 0
if rows_in > 0:
    last_commit = (
        delta_target.history(1).select("operation", "operationMetrics").first()
    )
    if last_commit is not None and last_commit["operation"] == "MERGE":
        merge_metrics = last_commit["operationMetrics"] or {}
        rows_inserted = int(merge_metrics.get("numTargetRowsInserted", 0))
        rows_updated = int(merge_metrics.get("numTargetRowsUpdated", 0))
        rows_deleted = int(merge_metrics.get("numTargetRowsDeleted", 0))

# merge_start is the earliest instant this notebook records, so it is used as the
# job start; the job end is the moment the audit row is written.
# TODO: capture a timestamp in the first cell for exact job timing.
job_duration_seconds = int(time.time() - merge_start)
merge_duration_seconds = int(merge_end - merge_start)

spark.sql(f"""
INSERT INTO {catalog_name}.{schema_auditoria}.ingestion_log VALUES (
  {job_id},{job_run_id},{task_run_id},
  timestamp_seconds({merge_start:.3f}), current_timestamp(),
  {job_duration_seconds},
  'SUCCESS','{output_table_name}','silver',
  {rows_in},{rows_inserted},{rows_updated},{rows_deleted},0,{merge_duration_seconds},current_timestamp()
)
""")


In [0]:
# Print the first 10 rows of the audit log to confirm the run was recorded.
ingestion_log_preview = spark.table(f"{catalog_name}.{schema_auditoria}.ingestion_log")
ingestion_log_preview.show(10)