In [0]:

from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, trim, initcap, when, lit, year, month, dayofmonth, concat, concat_ws
import time

In [0]:
# Catalog and schema names interpolated into table identifiers throughout the notebook.
catalog_name = "retail_dev"
schema_silver = "silver"
schema_bronze = "bronze"
# Schema that holds `ingestion_log`. The audit cells further down interpolate
# this name, so it has to be defined here or they raise NameError on a fresh run.
schema_auditoria = "auditoria"

In [0]:
# Make sure the target catalog exists before anything is created inside it.
# The statement is a no-op when the catalog is already present (IF NOT EXISTS).
sql_catalog = f"\n    CREATE CATALOG IF NOT EXISTS {catalog_name}\n"
spark.sql(sql_catalog)

In [0]:
# Make sure the silver schema exists inside the catalog.
# The statement is a no-op when the schema is already present (IF NOT EXISTS).
sql_silver = f"\n    CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema_silver}\n"
spark.sql(sql_silver)

In [0]:
# Build the cleaned `tiendas` DataFrame from the bronze table.
tiendas_bronze = spark.table(f"{catalog_name}.{schema_bronze}.tiendas_raw")

# Store key is cast to BIGINT.
tiendas_clean = tiendas_bronze.withColumn("id_tienda", F.col("id_tienda").cast("bigint"))

# Each text attribute is trimmed and then capitalised word by word.
for text_col in ("nombre", "ciudad", "region"):
    tiendas_clean = tiendas_clean.withColumn(text_col, F.initcap(F.trim(F.col(text_col))))

silver_tiendas = (
    tiendas_clean
    # Stamp every row with the time of this run.
    .withColumn("updated_at", F.current_timestamp())
    # Drop rows whose key is NULL (including keys the cast could not parse).
    .dropna(subset=["id_tienda"])
    # Keep a single row per store key; no ordering is given, so which
    # duplicate survives is not determined by this code.
    .dropDuplicates(["id_tienda"])
)

In [0]:
# Render the cleaned DataFrame for a visual check before it is merged into silver.
display(silver_tiendas)

In [0]:
# DDL for the silver `tiendas` Delta table; IF NOT EXISTS leaves an existing table alone.
tiendas_ddl = f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_silver}.tiendas (
  id_tienda BIGINT,
  nombre STRING,
  ciudad STRING,
  region STRING,
  updated_at TIMESTAMP
) USING DELTA
"""
spark.sql(tiendas_ddl)

In [0]:
# Expose the cleaned DataFrame to SQL under the name the MERGE reads from.
silver_tiendas.createOrReplaceTempView("silver_tiendas")

# Upsert keyed on id_tienda: matched rows are overwritten only when the source
# row carries a later updated_at; unmatched source rows are inserted.
merge_sql = f"""
MERGE INTO {catalog_name}.{schema_silver}.tiendas tgt
USING silver_tiendas src
ON tgt.id_tienda = src.id_tienda
WHEN MATCHED AND src.updated_at > tgt.updated_at THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
"""

# Wall-clock bracket around the MERGE; merge_start / merge_end are read again
# by the audit-log cell further down.
merge_start = time.time()
spark.sql(merge_sql)
merge_end = time.time()

In [0]:
# Print the first 10 rows of the silver table to confirm the MERGE landed.
tiendas_silver = spark.table(f"{catalog_name}.{schema_silver}.tiendas")
tiendas_silver.show(10)

In [0]:
# Run identifiers written to the audit log.
# `time` is already imported in the notebook's import cell, so it is not re-imported here.
job_id = int(time.time())            # unique ID derived from the current epoch second
job_run_id = int(time.time() * 100)  # finer-grained: hundredths of a second
task_run_id = 1                      # fixed value; could be made sequential instead


In [0]:
# Append one audit row for the silver `tiendas` load to ingestion_log.
# The VALUES list is positional; what each position means is defined by the
# ingestion_log table schema, which is not part of this notebook section.
rows_in = silver_tiendas.count()  # triggers a Spark job; the same count is logged twice below

# Measure elapsed time from the start of the MERGE. Taking the timestamp right
# before the INSERT (as was done previously) always logged 0 seconds.
# NOTE(review): when cells are run by hand, idle time between the MERGE cell and
# this cell is included in the elapsed value.
start = merge_start
spark.sql(f"""
INSERT INTO {catalog_name}.{schema_auditoria}.ingestion_log VALUES (
  {job_id},{job_run_id},{task_run_id},
  current_timestamp(), current_timestamp(),
  {int(time.time()-start)},
  'SUCCESS','tiendas','silver',
  {rows_in},{rows_in},0,0,0,{int(merge_end-merge_start)},current_timestamp()
)
""")


In [0]:
# Show the audit table so the row written for this run can be inspected.
ingestion_log = spark.table(f"{catalog_name}.{schema_auditoria}.ingestion_log")
display(ingestion_log)


In [0]:
# Single-cell version of the tiendas bronze -> silver load:
# read, cast/trim, deduplicate by latest updated_at, MERGE into silver, then
# write one audit row. Table names are built from the notebook's config
# variables so this cell follows the same catalog/schemas as the cells above.
start = time.time()  # start of the run; used for the elapsed-seconds value in the audit row
df = spark.table(f"{catalog_name}.{schema_bronze}.tiendas_raw")

df_cast = (
    df
    .withColumn("id_tienda", F.col("id_tienda").cast("bigint"))
    .withColumn("nombre", F.trim(F.col("nombre")))
    .withColumn("ciudad", F.trim(F.col("ciudad")))
    .withColumn("region", F.trim(F.col("region")))
    .withColumn("updated_at", F.to_timestamp("updated_at"))
    # Drop rows whose key is NULL (including keys the cast could not parse).
    # A NULL key never satisfies `tgt.id_tienda = src.id_tienda`, so without
    # this filter the MERGE would insert a fresh NULL-key row on every run.
    .dropna(subset=["id_tienda"])
)

# Keep one row per store: the one with the most recent updated_at
# (rows with a NULL updated_at sort last).
w = Window.partitionBy("id_tienda").orderBy(F.col("updated_at").desc_nulls_last())
df_dedup = df_cast.withColumn("rn", F.row_number().over(w)).filter("rn=1").drop("rn")

# Create the target Delta table on first run; a no-op when it already exists.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_silver}.tiendas (
  id_tienda BIGINT,
  nombre STRING,
  ciudad STRING,
  region STRING,
  updated_at TIMESTAMP
) USING DELTA
""")

# Upsert keyed on id_tienda: matched rows are overwritten only when the source
# row is newer; unmatched source rows are inserted.
df_dedup.createOrReplaceTempView("src_tiendas")
merge_start = time.time()
spark.sql(f"""
MERGE INTO {catalog_name}.{schema_silver}.tiendas tgt
USING src_tiendas src
ON tgt.id_tienda = src.id_tienda
WHEN MATCHED AND src.updated_at > tgt.updated_at THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
""")
merge_end = time.time()

# Audit row. The VALUES list is positional; what each position means is defined
# by the ingestion_log table schema, which is not part of this notebook section.
rows_in = df_dedup.count()  # triggers a Spark job; the same count is logged twice below
spark.sql(f"""
INSERT INTO {catalog_name}.auditoria.ingestion_log VALUES (
  {job_id},{job_run_id},{task_run_id},
  current_timestamp(), current_timestamp(),
  {int(time.time()-start)},
  'SUCCESS','tiendas','silver',
  {rows_in},{rows_in},0,0,0,{int(merge_end-merge_start)},current_timestamp()
)
""")
