In [0]:
# Load the raw payments table from the bronze layer as a DataFrame.
bronze_reader = spark.read.format("delta")
bronze_df = bronze_reader.table("bronze.order_payments")

In [0]:
# Render only the first ten bronze rows as a quick visual check of the load.
bronze_preview = bronze_df.limit(10)
display(bronze_preview)

In [0]:
# Spark's aggregate is imported under an alias so the notebook-global name
# `sum` stays bound to Python's builtin for every cell that runs after this one.
from pyspark.sql.functions import col, sum as spark_sum

# One aggregate per column: isNull() yields a boolean, the cast turns it into
# 0/1, and summing those gives the number of NULLs in that column.
null_counts = bronze_df.select([
    spark_sum(col(c).isNull().cast("int")).alias(c)
    for c in bronze_df.columns
])

# Single-row result; each output column carries its own NULL count.
null_counts.show()

In [0]:
# Build the silver-layer frame from the bronze payments table:
#   1. discard rows that have no order_id,
#   2. keep a single row per order_id,
#   3. remove the two columns that are not carried into silver.
# NOTE(review): dropDuplicates keeps one unspecified row per order_id — confirm
# that collapsing to one row per order is intended for this table.
bronze_payments = spark.read.table("bronze.order_payments")
keyed_rows = bronze_payments.where("order_id IS NOT NULL")
one_row_per_order = keyed_rows.dropDuplicates(["order_id"])
silver_df = one_row_per_order.drop("shipping_limit_date", "_rescued_data")

display(silver_df)


In [0]:
# Show the column names and data types of the cleaned frame as the cell output.
silver_df.schema

In [0]:
# Expose silver_df to Spark SQL under the temporary view name "silver_updates".
# NOTE(review): the merge cell's else-branch registers this same view again,
# so one of the two registrations is redundant.
silver_df.createOrReplaceTempView("silver_updates")

In [0]:
# Publish silver_df to silver.order_payments_cleaned.
#   - Table already present: upsert by order_id (matched rows are updated,
#     unmatched rows are inserted).
#   - Table absent: create it as a Delta table from the current frame.
target_exists = spark.catalog.tableExists("silver.order_payments_cleaned")

if target_exists:
    # The MERGE statement reads its source rows through this temp view.
    silver_df.createOrReplaceTempView("silver_updates")
    merge_sql = """
    MERGE INTO silver.order_payments_cleaned AS target
    USING silver_updates AS source
    ON target.order_id = source.order_id
    WHEN MATCHED THEN UPDATE SET *
    WHEN NOT MATCHED THEN INSERT *
    """
    spark.sql(merge_sql)
else:
    delta_writer = silver_df.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable("silver.order_payments_cleaned")

In [0]:
%sql SELECT * FROM silver.order_payments_cleaned LIMIT 10;