In [0]:
from pyspark.sql.functions import when, coalesce, to_date, substring, initcap, trim, col, lit, lower, current_timestamp, to_timestamp
from pyspark.sql.types import LongType, StringType, DateType
from datetime import datetime

In [0]:
def parse_fecha_registro_safe(fecha_col: str):
    """Build a Column expression that parses a text date in several layouts.

    The named column is trimmed and then matched against an ordered list of
    regex rules. Each rule that matches tries to parse the value with its own
    date format; the first rule that produces a non-null date wins (the
    attempts are combined with ``coalesce``).

    Rules, in priority order:
        1. ``yyyy-MM-dd``
        2. ``dd/MM/yyyy``
        3. ``MM-dd-yyyy``
        4. ``yyyy/MM/dd`` (regex is not end-anchored, so trailing text such as
           a time part is tolerated; only the first 10 characters are parsed)
        5. ``yyyy-MM-dd HH:mm:ss`` (only the first 10 characters are parsed)
        6. ``dd/MM/yyyy HH:mm:ss`` (only the first 10 characters are parsed)
        7. ``MM/dd/yyyy`` (same regex as rule 2, month and day swapped)

    Args:
        fecha_col: Name of the column holding the raw date text.

    Returns:
        A Column expression whose last fallback is a null cast to DateType,
        so values matching no rule come out as null.
    """
    value = trim(col(fecha_col))
    # Rules for values that may carry extra characters parse only this prefix.
    date_prefix = substring(value, 1, 10)

    # (regex the trimmed value must match, expression to parse, date format)
    rules = (
        ("^\\d{4}-\\d{2}-\\d{2}$", value, 'yyyy-MM-dd'),
        ("^\\d{2}/\\d{2}/\\d{4}$", value, 'dd/MM/yyyy'),
        ("^\\d{2}-\\d{2}-\\d{4}$", value, 'MM-dd-yyyy'),
        ("^\\d{4}/\\d{2}/\\d{2}", date_prefix, 'yyyy/MM/dd'),
        ("^\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}$", date_prefix, 'yyyy-MM-dd'),
        ("^\\d{2}/\\d{2}/\\d{4} \\d{2}:\\d{2}:\\d{2}$", date_prefix, 'dd/MM/yyyy'),
        # Same regex as the dd/MM/yyyy rule: this one can only contribute when
        # that earlier parse came back null.
        # NOTE(review): relies on to_date yielding null (not raising) for an
        # unparseable value, i.e. ANSI mode disabled -- confirm for the cluster.
        ("^\\d{2}/\\d{2}/\\d{4}$", value, 'MM/dd/yyyy'),
    )

    # A `when` without `otherwise` is null when its regex does not match, so
    # coalesce moves on to the next rule.
    attempts = [
        when(value.rlike(pattern), to_date(source, fmt))
        for pattern, source, fmt in rules
    ]

    return coalesce(*attempts, lit(None).cast(DateType()))

In [0]:
# --- Configuration -----------------------------------------------------------
catalog_name = "webinar"
# An email is kept only when the whole value matches this pattern.
email_pattern = "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
# Calendar date of this Python process at the time the cell runs; used as the
# upper bound for plausible registration dates.
fecha_actual = datetime.now().date()

# --- Reusable column expressions ---------------------------------------------
# These refer to columns by name, so they resolve against whatever the column
# holds at the point in the chain where they are used.
email_norm = lower(trim(col("email")))
fecha_reg = col("fecha_registro")

# --- Bronze -> silver cleaning -----------------------------------------------
df = (
    spark.table(f"{catalog_name}.bronze.clientes")
    # Names: strip surrounding whitespace, capitalise each word.
    .withColumn("nombre", initcap(trim(col("nombre"))))
    # Cities: same normalisation; a null city stays null.
    .withColumn(
        "ciudad",
        when(col("ciudad").isNotNull(), initcap(trim(col("ciudad")))),
    )
    # Emails, step 1: lower-case and trim; null or the literal text "null"
    # (any casing) becomes a real null.
    .withColumn(
        "email",
        when(col("email").isNull() | (email_norm == "null"), None)
        .otherwise(email_norm),
    )
    # Emails, step 2: anything that does not match the pattern becomes null.
    .withColumn(
        "email",
        when(col("email").rlike(email_pattern), col("email")),
    )
    # Dates: parse the supported text layouts, then null out dates later than
    # fecha_actual or earlier than 1900-01-01.
    .withColumn("fecha_registro", parse_fecha_registro_safe("fecha_registro"))
    .withColumn(
        "fecha_registro",
        when(
            (fecha_reg > lit(fecha_actual)) | (fecha_reg < lit("1900-01-01")),
            None,
        ).otherwise(fecha_reg),
    )
    # One row per id_cliente. This runs on the id as stored in bronze, before
    # the cast to long below.
    # NOTE(review): no ordering is applied first, so which duplicate survives
    # is presumably arbitrary -- confirm that is acceptable.
    .dropDuplicates(["id_cliente"])
    .withColumn("updated_at", current_timestamp())
    # Final projection with explicit types, in output column order.
    .select(
        *(
            col(column_name).cast(column_type)
            for column_name, column_type in (
                ("id_cliente", LongType()),
                ("nombre", StringType()),
                ("email", StringType()),
                ("ciudad", StringType()),
                ("fecha_registro", DateType()),
            )
        ),
        col("updated_at"),
    )
)

# --- Persist -----------------------------------------------------------------
# Replaces the contents of the silver table on every run.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{catalog_name}.silver.clientes")
)