Setting things up

In [0]:
import pyspark.sql.functions as F
from pyspark.sql import Window as W
import pyspark.sql.types as T


In [0]:
# Install the H3 library (only once per cluster)
%pip install h3

import h3
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

Loading Data

In [0]:
# Source table for the franchise records analysed in this notebook.
FRANCHISES_TABLE = "samples.bakehouse.sales_franchises"

# Load the table, print its column names/types, then preview the first rows.
df_franchises = spark.read.table(FRANCHISES_TABLE)
df_franchises.printSchema()
df_franchises.limit(3).toPandas()

In [0]:
# Sanity check: coordinate extremes per franchise `size`.
# `groupBy('size')` already yields exactly one row per size value, so no
# de-duplication step is needed after the aggregation.
(
    df_franchises
        .groupBy('size')
        .agg(
            F.min('longitude').alias('min_longitude'),
            F.max('longitude').alias('max_longitude'),
            F.min('latitude').alias('min_latitude'),
            F.max('latitude').alias('max_latitude')
        )
        .orderBy('size')
        .show(50)
)

The `latitude`/`longitude` extremes and the `size` values look good.

In [0]:
# List every distinct (country, city) pair in sorted order so that spelling or
# casing inconsistencies are easy to spot by eye.
location_cols = ['country', 'city']
(
    df_franchises
        .select(*location_cols)
        .distinct()
        .orderBy(*location_cols)
        .show(50)
)

The `country`/`city` combinations look good.

In [0]:
# Count NULL values per each column of a dataframe.
# `when` produces a non-NULL marker only for NULL cells and `count` ignores
# NULLs, so each aggregate is the number of NULL cells in that column.
null_count_exprs = [
    F.count(F.when(F.col(col_name).isNull(), col_name)).alias(col_name)
    for col_name in df_franchises.columns
]
df_franchises.select(null_count_exprs).toPandas()

In [0]:
# Count distinct values per each column of a dataframe.
# One aggregate per column, each aliased back to the column it summarises.
distinct_count_exprs = [
    F.countDistinct(F.col(col_name)).alias(col_name)
    for col_name in df_franchises.columns
]
df_franchises.select(distinct_count_exprs).toPandas()

Transformations

In [0]:
# Standardizing some fields
def clean_str(colname):
    """Return a Column with surrounding whitespace trimmed and each word capitalized.

    Args:
        colname: Column name or Column expression to standardize.

    Returns:
        Column: ``initcap(trim(colname))``.
    """
    return F.initcap(F.trim(colname))


# Expand the "US" abbreviation BEFORE applying `initcap`, and match on the
# trimmed, upper-cased value: comparing the raw column would miss variants such
# as " US" or "us", which `initcap` would then turn into "Us".
country_expanded = (
    F.when(F.upper(F.trim(F.col("country"))) == "US", "United States")
     .otherwise(F.col("country"))
)

df_franchises_standardized = (
    df_franchises
        .withColumn("name", clean_str("name"))
        .withColumn("city", clean_str("city"))
        .withColumn("district", clean_str("district"))
        .withColumn("country", clean_str(country_expanded))
)

df_franchises_standardized.limit(10).toPandas()

Add a spatial index column (a geohash or an H3 cell ID; H3 is used below) for spatial joins/analytics.

In [0]:
# h3.latlng_to_cell(lat, lon, resolution)
# Resolution used for the index; the output column name `h3_9` reflects it.
H3_RESOLUTION = 9


def _latlng_to_h3(lat, lon):
    """Return the H3 cell ID for a coordinate pair, or None if either part is missing.

    Spark hands SQL NULLs to Python UDFs as ``None``; passing those on to
    ``h3.latlng_to_cell`` would raise and fail the whole job, so rows without
    coordinates get a NULL index instead.
    """
    if lat is None or lon is None:
        return None
    return h3.latlng_to_cell(lat, lon, H3_RESOLUTION)


h3_udf = F.udf(_latlng_to_h3, StringType())

df_franchises_standardized_h3 = (
    df_franchises_standardized
        .withColumn("h3_9", h3_udf(F.col("latitude"), F.col("longitude")))
)

df_franchises_standardized_h3.limit(10).toPandas()

Referential Integrity w/ Suppliers

In [0]:
# Franchises whose supplierID has no matching row in the suppliers table:
# the LEFT JOIN keeps every franchise, and the WHERE clause retains only those
# for which no supplier row was found.
query = """
SELECT f.*
FROM franchises f
LEFT JOIN samples.bakehouse.sales_suppliers s
  ON f.supplierID = s.supplierID
WHERE
  s.supplierID IS NULL
"""

# Expose the enriched dataframe to SQL under the name the query refers to.
df_franchises_standardized_h3.createOrReplaceTempView("franchises")

df_orphan_franchises = spark.sql(query)

# Preview only; for the total number of orphans use df_orphan_franchises.count()
df_orphan_franchises.limit(10).toPandas()

In [0]:
# Target table for the data-quality log; its schema is created if it does not exist.
dq_table = "dq_logs.orphan_franchises"
spark.sql("CREATE SCHEMA IF NOT EXISTS dq_logs")

# Stamp every orphan row with the current timestamp, then append to the Delta table.
# NOTE: the write uses append mode, so each run adds the current orphan rows again.
orphans_logged = df_orphan_franchises.withColumn("logged_at", F.current_timestamp())
(
    orphans_logged.write
        .format("delta")
        .mode("append")
        .saveAsTable(dq_table)
)
