In [0]:
# Switch to the silver schema so the unqualified table name below resolves inside it.
spark.sql("USE globalretail_silver")

# DDL for the silver orders Delta table. IF NOT EXISTS keeps the cell safe to re-run:
# an existing table (and its data) is left untouched.
create_silver_orders_sql = """
CREATE TABLE IF NOT EXISTS silver_orders (
    transaction_id STRING,
    customer_id STRING,
    product_id STRING,
    quantity INT,
    total_amount DOUBLE,
    transaction_date DATE,
    payment_method STRING,
    store_type STRING,
    order_status STRING,
    last_updated TIMESTAMP
) USING DELTA
"""
spark.sql(create_silver_orders_sql)

In [0]:
# Resolve the incremental-load watermark: the newest `last_updated` already present in
# silver_orders, or a far-past default when the table is still empty.
#
# The value is computed and rendered as a string entirely inside Spark SQL. Collecting
# a TIMESTAMP into Python yields a naive datetime in the driver's local time zone, while
# the string literal interpolated into the downstream filters is parsed in the Spark
# session time zone; if those two zones differ the watermark would silently shift.
# Keeping the round trip in SQL avoids that, and also makes `last_processed_timestamp`
# always a `str` (previously it was a datetime, or a str only in the empty-table case).
spark.sql("use globalretail_silver")
last_processed_df = spark.sql("""
select coalesce(cast(max(last_updated) as string), '1900-01-01 00:00:00') as last_processed
from silver_orders
""")

# An aggregate without GROUP BY always returns exactly one row, so first() is safe here.
last_processed_timestamp = last_processed_df.first()['last_processed']


In [0]:
# Temp view over the bronze transactions ingested strictly after the watermark held in
# `last_processed_timestamp`. The watermark is baked into the view text when this cell
# runs, so the view must be recreated whenever the watermark changes.
bronze_incremental_view_sql = """
create or replace temporary view bronze_incremental_orders as
select * from globalretail_bronze.bronze_transaction  c
where c.ingestion_timestamp > '{watermark}'
""".format(watermark=last_processed_timestamp)

spark.sql(bronze_incremental_view_sql)

In [0]:
# Print a plain-text preview of the incremental bronze rows selected by the view.
bronze_incremental_preview_df = spark.sql("select * from bronze_incremental_orders")
bronze_incremental_preview_df.show()

In [0]:
# Render the current contents of silver_orders for inspection.
silver_orders_current_df = spark.sql("select * from silver_orders")
display(silver_orders_current_df)

In [0]:
# Build the cleaned, silver-shaped temp view over the incremental bronze rows.
#
# Cleaning rules applied:
#   * negative quantity / total_amount are clamped to 0;
#   * transaction_date is cast to DATE;
#   * order_status is 'INVALID' when quantity or total_amount is missing, when
#     quantity < 1, or when total_amount <= 0; otherwise 'VALID';
#   * rows missing transaction_id, transaction_date, customer_id or product_id are dropped.
#
# Notes on the non-obvious parts:
#   * Every source column is qualified with the `b` alias so the CASE expressions are
#     evaluated against the raw bronze values, never against the same-named output aliases.
#   * NULL checks in order_status are explicit: a comparison against NULL is never true,
#     so without them a row with a missing quantity or amount would fall through to 'VALID'.
#   * transaction_id is required because it is the key the downstream MERGE joins on;
#     a NULL key can never match an existing silver row.
spark.sql(f"""
CREATE OR REPLACE TEMPORARY VIEW silver_incremental_orders AS
SELECT
    b.transaction_id,
    b.customer_id,
    b.product_id,
    CASE WHEN b.quantity < 0 THEN 0 ELSE b.quantity END AS quantity,
    CASE WHEN b.total_amount < 0 THEN 0 ELSE b.total_amount END AS total_amount,
    CAST(b.transaction_date AS DATE) AS transaction_date,
    b.payment_method,
    b.store_type,
    CASE
        WHEN b.quantity IS NULL OR b.total_amount IS NULL THEN 'INVALID'
        WHEN b.quantity < 1 OR b.total_amount <= 0 THEN 'INVALID'
        ELSE 'VALID'
    END AS order_status,
    b.ingestion_timestamp AS last_updated
FROM bronze_incremental_orders AS b
WHERE b.ingestion_timestamp > '{last_processed_timestamp}'
  AND b.transaction_id IS NOT NULL
  AND b.transaction_date IS NOT NULL
  AND b.customer_id IS NOT NULL
  AND b.product_id IS NOT NULL
""")

In [0]:
# Render the cleaned incremental rows that are about to be merged into silver_orders.
silver_incremental_preview_df = spark.sql("select * from silver_incremental_orders")
display(silver_incremental_preview_df)

In [0]:
# Upsert the cleaned incremental rows into silver_orders, keyed on transaction_id:
# existing transactions are overwritten column by column, new ones are inserted.
#
# The source is first reduced to one row per transaction_id (the one with the newest
# last_updated; ties are broken arbitrarily). Without this, a batch containing the same
# transaction twice makes the MERGE fail when that key already exists in the target
# (several source rows would update the same target row), or inserts duplicate keys
# when it does not. Rows with a NULL transaction_id are passed through unchanged: they
# can never match the ON condition, so collapsing them would only discard data.
#
# `row_rank` is a helper column of the USING subquery only; the explicit column lists in
# UPDATE SET and INSERT keep it out of the target table.
spark.sql("""
MERGE INTO silver_orders AS target
USING (
  SELECT *
  FROM (
    SELECT
      *,
      ROW_NUMBER() OVER (
        PARTITION BY transaction_id
        ORDER BY last_updated DESC
      ) AS row_rank
    FROM silver_incremental_orders
  ) AS ranked
  WHERE row_rank = 1 OR transaction_id IS NULL
) AS source
ON target.transaction_id = source.transaction_id
WHEN MATCHED THEN
  UPDATE SET
    target.customer_id = source.customer_id,
    target.product_id = source.product_id,
    target.quantity = source.quantity,
    target.total_amount = source.total_amount,
    target.transaction_date = source.transaction_date,
    target.payment_method = source.payment_method,
    target.store_type = source.store_type,
    target.order_status = source.order_status,
    target.last_updated = source.last_updated
WHEN NOT MATCHED THEN
  INSERT (
    transaction_id,
    customer_id,
    product_id,
    quantity,
    total_amount,
    transaction_date,
    payment_method,
    store_type,
    order_status,
    last_updated
  )
  VALUES (
    source.transaction_id,
    source.customer_id,
    source.product_id,
    source.quantity,
    source.total_amount,
    source.transaction_date,
    source.payment_method,
    source.store_type,
    source.order_status,
    source.last_updated
  )
""")

In [0]:
# Render silver_orders again so the effect of the preceding cells can be inspected.
silver_orders_result_df = spark.sql("select * from silver_orders")
display(silver_orders_result_df)