In [0]:
from pyspark.sql import functions as F

In [0]:
# Load the bronze-layer geolocation Delta table into a DataFrame.
bronze_df = spark.read.format("delta").table("bronze.geolocations")

In [0]:
# Preview ten rows of the bronze table.
bronze_df.limit(10).display()

In [0]:
# Count NULLs per column: isNull() yields a boolean, which is cast to int
# (1/0) and summed, producing one output column per input column under the
# same name.
# F.sum / F.col are reached through the module alias imported at the top of
# the notebook, so Python's built-in sum() is not shadowed for later cells.
null_counts = bronze_df.select([
    F.sum(F.col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in bronze_df.columns
])

null_counts.show()

In [0]:
# Build the silver geolocation set: one row per non-null zip code prefix,
# without the _rescued_data column and with a surrogate key appended last.
silver_df = (
    spark.read.table("bronze.geolocations")
    .filter("geolocation_zip_code_prefix IS NOT NULL")
    # dropDuplicates keeps a single, arbitrarily chosen row for each prefix.
    .dropDuplicates(["geolocation_zip_code_prefix"])
    .drop("_rescued_data")
    # The surrogate key is a 64-bit hash of the zip code prefix, so
    # re-running the notebook yields the same key for the same prefix.
    # monotonically_increasing_id() depends on partition layout and would
    # hand out different (and potentially colliding) values on every run.
    .withColumn("geolocation_sk", F.xxhash64("geolocation_zip_code_prefix"))
)
silver_df.display()


In [0]:
# Print the column names and data types of silver_df for a quick visual check.
silver_df.printSchema()

In [0]:
# Register silver_df as the temp view "silver_updates" so it can be queried
# from Spark SQL; it is the USING source of the MERGE statement in this notebook.
silver_df.createOrReplaceTempView("silver_updates")

In [0]:
# Publish silver_df to silver.geolocations_cleaned.
#   * Table missing (first run): create it from silver_df.
#   * Table present (later runs): upsert keyed on geolocation_zip_code_prefix.
if not spark.catalog.tableExists("silver.geolocations_cleaned"):
    (silver_df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable("silver.geolocations_cleaned"))
else:
    silver_df.createOrReplaceTempView("silver_updates")

    # For matched rows every column except the surrogate key is refreshed.
    # geolocation_sk is left out of the SET list so that a re-run never
    # rewrites keys already stored in the target (UPDATE SET * would).
    update_assignments = ", ".join(
        f"target.`{column_name}` = source.`{column_name}`"
        for column_name in silver_df.columns
        if column_name != "geolocation_sk"
    )

    spark.sql(f"""
    MERGE INTO silver.geolocations_cleaned AS target
    USING silver_updates AS source
    ON target.geolocation_zip_code_prefix = source.geolocation_zip_code_prefix
    WHEN MATCHED THEN UPDATE SET {update_assignments}
    WHEN NOT MATCHED THEN INSERT *
    """)

In [0]:
%sql
-- Spot-check ten rows of the published silver table.
SELECT * FROM silver.geolocations_cleaned LIMIT 10;