In [2]:
# Make sure the Silver-layer customer table exists before any later cell
# reads from it or merges into it. IF NOT EXISTS turns this into a no-op
# once the table has been created.
create_silver_customers_ddl = """
    CREATE TABLE IF NOT EXISTS silverlayer.silver_customers (
    customer_id STRING,
    name STRING,
    email STRING,
    country STRING,
    customer_type STRING,
    registration_date DATE,
    age INT,
    gender STRING,
    total_purchases INT,
    customer_segment STRING,
    days_since_registration INT,
    last_updated TIMESTAMP)
"""
spark.sql(create_silver_customers_ddl)

StatementMeta(, 13d509d8-e386-47c4-a02b-e2d8de4b7cae, 4, Finished, Available, Finished)

DataFrame[]

In [3]:
# Incremental-load watermark: the newest `last_updated` value already stored
# in the Silver table. The table is addressed by its fully qualified name
# (the same name used when it is created) so this cell does not depend on
# which lakehouse happens to be attached as the notebook default.
# NOTE(review): `last_updated` is written as CURRENT_TIMESTAMP() by the Silver
# transform, i.e. it is a processing time, while the bronze filter compares it
# with `ingestion_timestamp`. Rows ingested into bronze while a run is in
# progress could presumably fall behind the watermark -- confirm whether a
# dedicated ingestion watermark is needed.
last_processed_df = spark.sql(
    "SELECT MAX(last_updated) AS last_processed FROM silverlayer.silver_customers"
)

# An aggregate without GROUP BY always yields exactly one row; its value is
# NULL (None in Python) when the Silver table is still empty.
last_processed_timestamp = last_processed_df.first()["last_processed"]

# First run: fall back to a date far in the past so every bronze row qualifies.
if last_processed_timestamp is None:
    last_processed_timestamp = "1900-01-01T00:00:00.000+00:00"


StatementMeta(, 13d509d8-e386-47c4-a02b-e2d8de4b7cae, 5, Finished, Available, Finished)

In [8]:
# Register a temp view exposing only the bronze customer rows whose
# ingestion_timestamp is strictly later than the current watermark.
# The watermark is interpolated into the statement text as a quoted literal.
bronze_incremental_sql = f"""
CREATE OR REPLACE TEMPORARY VIEW bronze_incremental AS
SELECT *
FROM bronzelayer.customer c where  c.ingestion_timestamp > '{last_processed_timestamp}'
"""
spark.sql(bronze_incremental_sql)


StatementMeta(, 13d509d8-e386-47c4-a02b-e2d8de4b7cae, 10, Finished, Available, Finished)

DataFrame[]

In [9]:
# Build the Silver-shaped increment from the bronze increment.
#
# Steps:
#   1. Data-quality filter: keep rows with age in [18, 100], a non-null email
#      and non-negative total_purchases (rows with NULL age or NULL
#      total_purchases are dropped by these predicates as well).
#   2. De-duplicate: keep a single row per customer_id -- the one with the
#      latest ingestion_timestamp among the rows that passed step 1. Without
#      this, several bronze rows for the same customer would reach the MERGE,
#      which then either fails (multiple source rows match one target row) or
#      inserts duplicate customers. Ties on ingestion_timestamp are broken
#      arbitrarily.
#   3. Derive customer_segment, days_since_registration and last_updated.
#
# The view is evaluated lazily, so CURRENT_DATE()/CURRENT_TIMESTAMP() take
# their values when the view is actually queried.
spark.sql("""
CREATE OR REPLACE TEMPORARY VIEW silver_incremental AS
WITH valid_customers AS (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY customer_id
            ORDER BY ingestion_timestamp DESC
        ) AS recency_rank
    FROM bronze_incremental
    WHERE
        age BETWEEN 18 AND 100
        AND email IS NOT NULL
        AND total_purchases >= 0
)
SELECT
    customer_id,
    name,
    email,
    country,
    customer_type,
    registration_date,
    age,
    gender,
    total_purchases,
    CASE
        WHEN total_purchases > 10000 THEN 'High Value'
        WHEN total_purchases > 5000 THEN 'Medium Value'
        ELSE 'Low Value'
    END AS customer_segment,
    DATEDIFF(CURRENT_DATE(), registration_date) AS days_since_registration,
    CURRENT_TIMESTAMP() AS last_updated
FROM valid_customers
WHERE recency_rank = 1
""")


StatementMeta(, 13d509d8-e386-47c4-a02b-e2d8de4b7cae, 11, Finished, Available, Finished)

DataFrame[]

In [10]:
# Upsert the increment into the Silver table, keyed on customer_id:
#   - customers already present are overwritten with the source row,
#   - unseen customers are inserted.
# `UPDATE SET *` / `INSERT *` copy every source column into the target, so
# the silver_incremental view must expose the same columns as the table.
# The target uses its fully qualified name (the same name used when the table
# is created) so the merge cannot land in a different default lakehouse.
spark.sql("""
MERGE INTO silverlayer.silver_customers target
USING silver_incremental source
ON target.customer_id = source.customer_id
WHEN MATCHED THEN
    UPDATE SET *
WHEN NOT MATCHED THEN
    INSERT *
""")


StatementMeta(, 13d509d8-e386-47c4-a02b-e2d8de4b7cae, 12, Finished, Available, Finished)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [12]:
# Spot-check the Silver customer table after the merge.
# The table is read by its fully qualified name, matching the name used when
# it is created, so the check looks at the table that was actually written.
# NOTE(review): the printed rows include customer names and e-mail addresses;
# clear this output before sharing the notebook if the data is real.
spark.sql("select * from silverlayer.silver_customers").show()


StatementMeta(, 13d509d8-e386-47c4-a02b-e2d8de4b7cae, 14, Finished, Available, Finished)

+-----------+------------+--------------------+---------+-------------+-----------------+---+------+---------------+----------------+-----------------------+--------------------+
|customer_id|        name|               email|  country|customer_type|registration_date|age|gender|total_purchases|customer_segment|days_since_registration|        last_updated|
+-----------+------------+--------------------+---------+-------------+-----------------+---+------+---------------+----------------+-----------------------+--------------------+
|         12| Customer 12|customer12@exampl...|Australia|          VIP|       2019-06-16| 65| Other|            758|       Low Value|                   1931|2024-09-28 12:29:...|
|         38| Customer 38|customer38@exampl...|    India|          VIP|       2021-10-01| 38| Other|             82|       Low Value|                   1093|2024-09-28 12:29:...|
|         46| Customer 46|customer46@exampl...|   France|          VIP|       2023-03-20| 78| Other|     