In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

## Get last run time

In [0]:
# Look up the high-water mark recorded for the 'silver_transformation' stage and publish it
# through the Spark conf so later cells can read it back with spark.conf.get("last_ingest_time").
last_ingest_df = spark.sql("SELECT last_timestamp FROM control.ctl.control_dates WHERE stage_name = 'silver_transformation'")

# first() returns None when the query yields no rows, which lets us fail with a clear message
# instead of an IndexError from collect()[0].
last_ingest_row = last_ingest_df.first()
if last_ingest_row is None or last_ingest_row["last_timestamp"] is None:
    # A NULL timestamp would otherwise be stored as the string "None" and break the incremental filter.
    raise ValueError(
        "No last_timestamp found in control.ctl.control_dates for stage_name = 'silver_transformation'"
    )

last_ingest_time = last_ingest_row["last_timestamp"]
# Spark conf values are strings, so the timestamp is stringified before being stored.
spark.conf.set("last_ingest_time", str(last_ingest_time))

## Reading Data

In [0]:
%sql
-- Make `bronze` the current catalog for the session, so table names that omit a catalog resolve against it.
USE CATALOG bronze;

In [0]:
# Incremental read: keep only rows whose ingest_timestamp is later than the stored high-water mark.
ingest_cutoff = spark.conf.get("last_ingest_time")
products_source = spark.read.table("products.products")
df = products_source.filter(col("ingest_timestamp") > ingest_cutoff)

# Preview a bounded sample of the batch.
df.limit(100).display()

In [0]:
# Cleansing rules:
#   - price is replaced by its absolute value
#   - a NULL or empty-string category becomes "UNKNOWN"
category_is_blank = col("category").isNull() | (col("category") == "")
df = (
    df.withColumn("price", abs(col("price")))
    .withColumn("category", when(category_is_blank, "UNKNOWN").otherwise(col("category")))
)

Normalize the new records into the same schema as the live tables, including the surrogate key, date_created, and last_updated columns <br>
Split new records into:
- to be updated
- new

Then:<br>
Merge records to be updated <br>
Insert new records <br>
Mark any categories as inactive if they only have inactive products <br>

In [0]:
# One row per category, stamped with its most recent ingest time. A plain distinct() over
# (category, ingest_timestamp) yields several rows for a category that arrived at different
# ingest times, and each of those rows would later be handed its own surrogate key.
# `max` is the Spark aggregate brought in by the pyspark.sql.functions star import, not the builtin.
df_categories = (
    df.groupBy("category")
    .agg(max("ingest_timestamp").alias("ingest_timestamp"))
    .withColumn("active_flag", lit(True))
)

# Product-to-category pairs as they appear in this batch.
df_product_categories = df.select("product_id", "category", "ingest_timestamp")

# Product attributes; active_flag is taken from the source rows.
# NOTE(review): there is no de-duplication on product_id here — confirm the source holds one row per product per batch.
df_products = df.select("product_id", "product_name", "price", "active_flag", "ingest_timestamp")

In [0]:
# Split the incoming product batch into:
#   df_products_new      - product_ids not yet in silver; they receive a fresh surrogate product_key
#   df_products_existing - product_ids already in silver; they keep their product_key and date_created
if spark.catalog.tableExists("silver.products.products"):
    # Only the key and audit columns of the live table are needed for the split.
    df_products_lookup = spark.sql("SELECT product_key, product_id, date_created, last_updated FROM silver.products.products")
    # MAX() is NULL when the table exists but holds no rows; fall back to 0 so that new keys
    # start at 1 instead of every new product_key evaluating to NULL.
    max_prod_key = spark.sql("SELECT MAX(product_key) AS max_prod_key FROM silver.products.products").collect()[0]["max_prod_key"] or 0
else:
    # First run: build an empty frame with the same columns as the branch above.
    # product_key is all NULL, so the isNotNull() filter removes every row.
    df_products_lookup = (
        df_products.select("product_id")
        .withColumn("product_key", lit(None).cast(IntegerType()))
        .withColumn("date_created", lit('1900-01-01T00:00:00').cast(TimestampType()))
        .withColumn("last_updated", lit('1900-01-01T00:00:00').cast(TimestampType()))
        .filter(col("product_key").isNotNull())
    )
    max_prod_key = 0

# Prefix the lookup columns with "existing_" so they cannot clash with the batch columns in the join.
df_products_lookup = (
    df_products_lookup
    .withColumnRenamed("product_key", "existing_product_key")
    .withColumnRenamed("product_id", "existing_product_id")
    .withColumnRenamed("date_created", "existing_date_created")
    .withColumnRenamed("last_updated", "existing_last_updated")
)

# Left join keeps every batch row; a NULL existing_product_key marks a product not yet in silver.
df_products_joined = df_products.join(
    df_products_lookup,
    on=df_products.product_id == df_products_lookup.existing_product_id,
    how="left",
)
df_products_new = df_products_joined.filter(col("existing_product_key").isNull())
df_products_existing = df_products_joined.filter(col("existing_product_key").isNotNull())

# Existing products: drop the duplicate id and the stale last_updated, reuse the stored key and
# creation date, and let the batch's ingest_timestamp become the new last_updated.
df_products_existing = (
    df_products_existing
    .drop("existing_product_id", "existing_last_updated")
    .withColumnRenamed("ingest_timestamp", "last_updated")
    .withColumnRenamed("existing_product_key", "product_key")
    .withColumnRenamed("existing_date_created", "date_created")
)

# New products: product_key = row_number() ordered by product_id, offset by the current maximum key
# so new keys never collide with keys already in silver.
# The window has no partitionBy, so Spark evaluates it in a single partition.
window_spec_prod = Window.orderBy("product_id")
df_products_new = (
    df_products_new
    .withColumn("date_created", col("ingest_timestamp"))
    .withColumn("last_updated", col("ingest_timestamp"))
    .withColumn("product_key", row_number().over(window_spec_prod) + lit(max_prod_key))
    .drop("existing_product_id", "existing_last_updated", "existing_product_key", "existing_date_created", "ingest_timestamp")
)

df_products_new.display()
df_products_existing.display()

In [0]:
# Split the incoming categories into:
#   df_categories_new      - categories not yet in silver; they receive a fresh surrogate category_key
#   df_categories_existing - categories already in silver; they keep their category_key and date_created
#
# A single table name is used for the existence check and both reads. Previously the check, the
# column read and the MAX() read each pointed at a differently spelled table.
# NOTE(review): the name from the existence check was kept — confirm it is the real silver table.
categories_table = "silver.products.categories_lookup"

if spark.catalog.tableExists(categories_table):
    # Only the key and audit columns of the live table are needed for the split.
    df_categories_lookup = spark.sql(f"SELECT category_key, category, date_created, last_updated FROM {categories_table}")
    # MAX() is NULL when the table exists but holds no rows; fall back to 0 so that new keys
    # start at 1 instead of every new category_key evaluating to NULL.
    max_cat_key = spark.sql(f"SELECT MAX(category_key) AS max_cat_key FROM {categories_table}").collect()[0]["max_cat_key"] or 0
else:
    # First run: build an empty frame with the same columns as the branch above.
    # category_key is all NULL, so the isNotNull() filter removes every row.
    df_categories_lookup = (
        df_categories.select("category")
        .withColumn("category_key", lit(None).cast(IntegerType()))
        .withColumn("date_created", lit('1900-01-01T00:00:00').cast(TimestampType()))
        .withColumn("last_updated", lit('1900-01-01T00:00:00').cast(TimestampType()))
        .filter(col("category_key").isNotNull())
    )
    max_cat_key = 0

# Prefix the lookup columns with "existing_" so they cannot clash with the batch columns in the join.
df_categories_lookup = (
    df_categories_lookup
    .withColumnRenamed("category_key", "existing_category_key")
    .withColumnRenamed("category", "existing_category")
    .withColumnRenamed("date_created", "existing_date_created")
    .withColumnRenamed("last_updated", "existing_last_updated")
)

# Left join keeps every batch row; a NULL existing_category_key marks a category not yet in silver.
df_categories_joined = df_categories.join(
    df_categories_lookup,
    on=df_categories.category == df_categories_lookup.existing_category,
    how="left",
)
df_categories_new = df_categories_joined.filter(col("existing_category_key").isNull())
df_categories_existing = df_categories_joined.filter(col("existing_category_key").isNotNull())

# Existing categories: drop the duplicate name and the stale last_updated, reuse the stored key and
# creation date, and let the batch's ingest_timestamp become the new last_updated.
df_categories_existing = (
    df_categories_existing
    .drop("existing_last_updated", "existing_category")
    .withColumnRenamed("ingest_timestamp", "last_updated")
    .withColumnRenamed("existing_category_key", "category_key")
    .withColumnRenamed("existing_date_created", "date_created")
)

# New categories: category_key = row_number() ordered by category, offset by the current maximum key.
# The window has no partitionBy, so Spark evaluates it in a single partition.
window_spec_cat = Window.orderBy("category")

df_categories_new = (
    df_categories_new
    .withColumn("last_updated", col("ingest_timestamp"))
    .withColumn("date_created", col("ingest_timestamp"))
    .withColumn("category_key", row_number().over(window_spec_cat) + lit(max_cat_key))
    .drop("existing_category", "existing_category_key", "existing_date_created", "existing_last_updated", "ingest_timestamp")
)

df_categories_new.display()
df_categories_existing.display()