In [0]:
from pyspark.sql.functions import col, round, trim, upper, length, to_timestamp, when

# Silver layer build: read the bronze Delta table, keep only usable sales rows,
# normalise text columns, parse the invoice timestamp, derive the sale total and
# a description-validity flag, de-duplicate, then persist to the silver path and
# expose the result as the temp view "silver_online_retail".

ruta_bronze = "/Volumes/workspace/default/retail_lakehouse/bronze"
df_bronze = spark.read.format("delta").load(ruta_bronze)

df_silver_clean = (
    df_bronze
    # Rows missing any of these key fields are discarded.
    .dropna(
        subset=[
            "InvoiceNo",
            "StockCode",
            "Quantity",
            "UnitPrice",
            "CustomerID",
            "InvoiceDate",
        ]
    )
    # Keep only rows with strictly positive quantity and unit price.
    .filter((col("Quantity") > 0) & (col("UnitPrice") > 0))
    # Upper-case and trim the free-text columns.
    .withColumn("Description", trim(upper(col("Description"))))
    .withColumn("Country", trim(upper(col("Country"))))
    # Parse the invoice date using the day-month-year pattern.
    .withColumn("InvoiceDate", to_timestamp(col("InvoiceDate"), 'dd-MM-yyyy HH:mm'))
    # The dropna above only checked the raw value. Per the Spark docs,
    # to_timestamp can yield NULL for values that do not match the pattern
    # (when ANSI mode is off), so the not-null requirement is re-applied here.
    .filter(col("InvoiceDate").isNotNull())
    # `round` is pyspark.sql.functions.round (imported at the top of the cell),
    # not the Python builtin.
    .withColumn("TotalVenta", round(col("Quantity") * col("UnitPrice"), 2))
    # "SI" when Description has at least one character; a NULL or empty
    # Description falls through to "NO".
    .withColumn(
        "DescripcionValida",
        when(length(col("Description")) > 0, "SI").otherwise("NO"),
    )
    # Remove rows that are identical across every column.
    .dropDuplicates()
)

ruta_silver = "/Volumes/workspace/default/retail_lakehouse/silver"
# NOTE(review): the whole bronze table is read above and the result is appended
# here, so re-running this cell adds the same rows to silver again. Confirm
# whether append is intended or whether overwrite / MERGE is the desired mode.
df_silver_clean.write.format("delta").mode("append").save(ruta_silver)

df_silver_clean.createOrReplaceTempView("silver_online_retail")

In [0]:
from pyspark.sql.functions import max, countDistinct, col

# Number of rows in the cleaned silver DataFrame
total = df_silver_clean.count()

# Number of distinct values in the Country column
paises = df_silver_clean.select(col("Country")).distinct().count()

# Most recent InvoiceDate; `max` is pyspark.sql.functions.max (imported at the
# top of this cell), and the global aggregate returns exactly one row.
ultima_fecha = df_silver_clean.select(max("InvoiceDate")).first()[0]

# Print the run summary, one line per metric
print(
    "📦 CAPA SILVER – RESUMEN DEL PROCESO ETL",
    f"✅ Registros procesados exitosamente: {total}",
    f"🌍 Número de países únicos: {paises}",
    f"🕒 Última venta registrada: {ultima_fecha}",
    sep="\n",
)


# Preview the 5 rows with the latest InvoiceDate
print("\n🔍 Vista previa de las últimas 5 ventas registradas:")
display(df_silver_clean.orderBy("InvoiceDate", ascending=False).limit(5))