In [0]:

from pyspark.sql.functions import (
    col, upper, trim, current_timestamp, lit, when,
    regexp_replace, length, coalesce, concat_ws, 
    year, datediff, to_date
)
from pyspark.sql.types import IntegerType, StringType

# Make sure the target database of the silver layer exists before anything is written to it.
spark.sql("CREATE DATABASE IF NOT EXISTS silver")

# Profile the bronze orders table: row count, NULL keys and duplicated keys.
orders_bronze = spark.table("bronze.orders")

# Every count() launches a Spark job, so each metric is computed exactly once and reused below.
total_orders_bronze = orders_bronze.count()
unique_orders_bronze = orders_bronze.select("order_id").distinct().count()
null_order_ids_bronze = orders_bronze.filter(col("order_id").isNull()).count()
duplicated_order_ids_bronze = (
    orders_bronze.groupBy("order_id").count().filter(col("count") > 1).count()
)

# The metrics are interpolated from variables instead of inline expressions:
# reusing the enclosing quote character inside an f-string is a SyntaxError before Python 3.12.
print(f"Total rows: {total_orders_bronze}")
print(f"Null order_id : {null_order_ids_bronze}")
print(f"Duplicates order_id : {duplicated_order_ids_bronze}")

# order_id is unique when de-duplicating it does not reduce the number of rows.
if total_orders_bronze == unique_orders_bronze:
    print("All order_id are unique")
else:
    print("order_id are not unique")

orders_bronze.printSchema()
display(orders_bronze.limit(15))

In [0]:
# Normalise order_status: remove leading/trailing spaces and upper-case the value.
# The call is parenthesised instead of using backslash continuations, so no
# dangling "\" can accidentally join this statement with whatever follows it.
orders_clean = orders_bronze.withColumn(
    "order_status", upper(trim(col("order_status")))
)

display(orders_clean.limit(5))



In [0]:
# --- Validation rules -------------------------------------------------------
# Each rule is a boolean Column; a row is kept only when every rule is True.

# Purchase happens no later than approval (or approval is NULL).
# NOTE: a NULL purchase timestamp combined with a non-NULL approval makes the
# comparison NULL, so such a row is rejected.
purchase_before_approval = (
    col("order_approved_at").isNull()
    | (col("order_purchase_timestamp") <= col("order_approved_at"))
)

# Approval happens no later than hand-over to the carrier (or either one is NULL).
approval_before_carrier = (
    col("order_delivered_carrier_date").isNull()
    | col("order_approved_at").isNull()
    | (col("order_approved_at") <= col("order_delivered_carrier_date"))
)

# Hand-over to the carrier happens no later than delivery to the customer (or either one is NULL).
carrier_before_customer = (
    col("order_delivered_customer_date").isNull()
    | col("order_delivered_carrier_date").isNull()
    | (col("order_delivered_carrier_date") <= col("order_delivered_customer_date"))
)

# Mandatory columns must be populated.
has_required_fields = (
    col("order_id").isNotNull()
    & col("customer_id").isNotNull()
    & col("order_status").isNotNull()
)

# AND-ing the rules is equivalent to chaining one filter() per rule:
# the combined predicate is True only when every individual rule is True.
is_valid_order = (
    purchase_before_approval
    & approval_before_carrier
    & carrier_before_customer
    & has_required_fields
)

orders_validated = orders_clean.filter(is_valid_order)

# filter() drops rows whose predicate is False *or* NULL, so the rejected set is the
# complement of the predicate with NULL treated as False. This avoids subtract(),
# which is a de-duplicating whole-row set difference and needs an extra shuffle.
filtered_rows = orders_clean.filter(~coalesce(is_valid_order, lit(False)))

# Count each DataFrame once and reuse the numbers (every count() is a Spark job).
rows_before_validation = orders_clean.count()
rows_after_validation = orders_validated.count()

print(f"Before validation: {rows_before_validation}")
print(f"After validation: {rows_after_validation}")
print(f"Filtered rows: {rows_before_validation - rows_after_validation}")
display(orders_validated.limit(5))

# Sample of rejected rows, showing only the timestamps the chronology rules inspect.
display(filtered_rows.select(
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date"
).limit(10))

In [0]:
# Append processing metadata columns to every validated order before persisting it.
orders_final = (
    orders_validated
    .withColumn("processed_at", current_timestamp())
    .withColumn("data_source", lit("olist"))
    .withColumn("data_layer", lit("silver"))
    .withColumn("data_status", lit("cleaned"))
)

# Configure a Delta writer that overwrites the existing table contents and,
# through overwriteSchema, its schema as well.
silver_orders_writer = (
    orders_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_orders_writer.saveAsTable("silver.orders")

print("Silver order  table is now created")

# Read the table back so the reported count comes from what was actually stored.
silver_row_count = spark.table('silver.orders').count()
print(f"Total rows: {silver_row_count}")
