### BRONZE TO SILVER LAYER TRANSFORMATION FOR EMEKA AND A SONS DATA

###### Import the relevant packages

In [44]:
# IMPORT RELEVANT PACKAGES

from pyspark.sql import functions as F 
from pyspark.sql.types import StringType, IntegralType, BooleanType, DataType, TimestampType, DecimalType, LongType, IntegerType

lakehouse_name = "Emeka_and_sons"# Keep this for reference if needed elsewhere
bronze_schema = "dbo"           
silver_schema = "silver"

bronze_table_prefix = f"{bronze_schema}.source_"
silver_table_prefix = f"{silver_schema}.source_"

print(f"Starting Bronze to silver transformations for Lakehouse: {lakehouse_name}")

StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 46, Finished, Available, Finished)

Starting Bronze to silver transformations for Lakehouse: Emeka_and_sons


This is a Python notebook to transform data from the bronze layer for emeka and sons customer loyalty project


In [21]:
df_bronze_customers = spark.read.format("delta").table("dbo.source_customer_signups")

df_bronze_customers.head()

StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 23, Finished, Available, Finished)

Row(CustomerID=7090235964, FirstName='Chiamaka', LastName='Aliyu', EmailAddress=None, SignUpTimestamp=datetime.datetime(2023, 1, 9, 1, 2, 46), ConsentEmail=False, ConsentSMS=True, InitialStore='ST003')

In [22]:
display(df_bronze_customers.limit(5))

StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 24, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 96d50df8-a761-4410-80d8-30c64d639adf)

In [23]:
# PERFORM RENAMING AND CHANGING VARIABLE TYPE

from pyspark.sql.functions import col, lower, regexp_replace, to_timestamp, trim

StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 25, Finished, Available, Finished)

In [24]:
df_silver_customers = df_bronze_customers.select(
    col("CustomerID").alias("customer_id"), 
    trim(col("FirstName")).alias("first_name"),
    trim(col("LastName")).alias("last_name"),
    lower(trim(col("EmailAddress"))).alias("email_address"),
    col("SignUpTimestamp").cast("timestamp").alias("signup_ts"),
    col("ConsentEmail").cast("boolean").alias("has_email_consent"),
    col("ConsentSMS").cast("boolean").alias("has_sms_consent"),
    col("InitialStore").alias("signup_store_id")
).distinct()

StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 26, Finished, Available, Finished)

In [25]:
display(df_silver_customers.limit(5))

StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 27, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 6e2ae4b8-de1a-4131-a273-59c21b5f5702)

Read and load data from the staging table

1. ## Clean and load bronze data () to the silver layer

In [26]:
bronze_table_name = f"{bronze_table_prefix}store_master"
df_bronze_stores = spark.read.format("delta").table(bronze_table_name)

print(f"Processing {bronze_table_name} -> {silver_table_prefix}stores")



StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 28, Finished, Available, Finished)

Processing dbo.source_store_master -> silver.source_stores


In [27]:
display(df_bronze_stores.limit(5))

StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 29, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 5438c2fc-9033-4c06-8ffc-e5403d5f3a03)

In [28]:
# apply transformations

df_silver_stores = df_bronze_stores.select(
    col("StoreID").alias("store_id"),
    col("LocationName").alias("location_name"),
    col("StoreManager").alias("Store_manager"),
    col("SizeSQM").alias("size_sqm"),
    col("OpenDate").alias("open_date")
    
    
)

# write to silver layer

silver_table_name = f"{silver_table_prefix}stores"



StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 30, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 712ecde9-1037-4b5f-88aa-be8a518e8d28)

In [31]:
# load bronze product data

bronze_table_name = f"{bronze_table_prefix}product_master"

df_bronze_products = spark.read.format("delta").table(bronze_table_name)

print(f"Processing {bronze_table_name} -> {silver_table_prefix}products")



StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 33, Finished, Available, Finished)

Processing dbo.source_product_master -> silver.source_products


SynapseWidget(Synapse.DataFrame, 958297ae-ba15-4459-8a8c-0b28ab8f9d39)

In [32]:
# apply transformations

df_silver_products = df_bronze_products.select(
    col("ProductID").alias("product_id"),
    col("ProductDescription").alias("product_description"),
    col("Category").alias("category"),
    col("Brand").alias("brand"),
    col("Supplier").alias("supplier"),
    col("CurrentStandardUnitPrice").alias("current_unit_price"),
    col("CurrentUnitCost").alias("current_unit_cost"),
    col("DateAdded").alias("date_added"),
    col("IsActive").alias("is_active")
)

# write to silver layer

silver_table_name = f"{silver_table_prefix}products"


StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 34, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e2adb32d-6557-418e-975d-d19208fabac2)

In [34]:
# load the bronze customer data

bronze_table_name = f"{bronze_table_prefix}customer_signups"
df_bronze_customers = spark.read.format("delta").table(bronze_table_name)

print(f"Processing {bronze_table_name} -> {silver_table_prefix}customers")



StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 36, Finished, Available, Finished)

Processing dbo.source_customer_signups -> silver.source_customers


SynapseWidget(Synapse.DataFrame, b5cfb461-77e2-40f7-803a-99ff5c15636e)

In [35]:
# Apply transformations

df_silver_customers = df_bronze_customers.select(
    col("CustomerID").alias("customer_id"),
    col("FirstName").alias("first_name"),
    col("LastName").alias("last_name"),
    col("EmailAddress").alias("email_address"),
    col("SignUpTimestamp").alias("signup_ts"),
    col("ConsentEmail").alias("consent_email"),
    col("ConsentSMS").alias("consent_sms"),
    col("InitialStore").alias("initial_store")
).distinct()

silver_table_name = f"{silver_table_prefix}customers"



StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 37, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 77696ffe-f715-4bd8-b6cf-10822a79ad54)

In [36]:
# load bronze promotion data

bronze_table_name = f"{bronze_table_prefix}promotion_definitions"

df_bronze_promotions = spark.read.format("delta").table(bronze_table_name)



StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 38, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e18b7c5a-2fc1-4fef-9474-86d5d2ce9179)

In [37]:
# apply transformations

df_silver_promotions = df_bronze_promotions.select(
    col("PromotionCode").alias("promotion_code"),
    col("PromotionName").alias("promotion_name"),
    col("PromotionType").alias("promotion_type"),
    col("DiscountValue").alias("discount_value"),
    col("StartDate").alias("start_date"),
    col("Enddate").alias("end_date")
)

# WRITE TO SILVER LAYER

silver_table_name = f"{silver_table_prefix}promotions"



StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 39, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, d7063f4c-1348-43d8-bb6e-bff45c202154)

In [38]:
# TRANSFORM POS TRANSACTIONS (JOIN HEADER AND LINES TO SILVER TRANSACTION DETAILS)

# LOAD BRONZE POS HEADER DATA

bronze_header_table = f"{bronze_table_prefix}pos_transactions_header"

df_header = spark.read.format("delta").table(bronze_header_table)

# LOAD BRONZE POS LINES DATA

bronze_line_table = f"{bronze_table_prefix}pos_transaction_lines"

df_lines = spark.read.format("delta").table(bronze_line_table)

display(df_header.limit(5))
display(df_lines.limit(5))

StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 40, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, b309e529-88c0-44e5-8ade-8b3e976f1c9a)

SynapseWidget(Synapse.DataFrame, eefab8a9-4e78-4bfd-8cd9-88a06ddca265)

In [46]:
# prepare header data

df_header_prep = df_header.select(
    col("TransactionID").cast(StringType()).alias("transaction_id"),
    col("StoreID").cast(StringType()).alias("store_id"),
    col("CustomerID").cast(LongType()).alias("customer_id"),
    col("TransactionTimestamp").cast(TimestampType()).alias("transaction_ts"),
    col("TransactionType").cast(StringType()).alias("transaction_type"),
    col("TotalAmount").cast(DecimalType(12,2)).alias("transaction_amount"),
    col("CashierID").cast(StringType()).alias("cashier_id")

)

# prepare lines data

df_lines_prep = df_lines.select(
    col("TransactionLineID").cast(StringType()).alias("transaction_line_id"),
    col("TransactionID").cast(StringType()).alias("transaction_id"),
    col("ProductID").cast(StringType()).alias("product_id"),
    col("Quantity").cast(IntegerType()).alias("quantity"),
    col("UnitPriceRecorded").cast(DecimalType(10,2)).alias("unit_price_recorded"),
    col("LineItemTotal").cast(DecimalType(12,2)).alias("line_item_gross_amount"),
    col("PromotionCodeApplied").cast(StringType()).alias("promotion_code_applied")

)

# JOIN HEADER AND LINES USING INNER JOIN

df_silver_details = df_header_prep.join(
    df_lines_prep,
    on = "transaction_id",
    how = "inner"
)

# SELECT THE FINAL COLUMNS FOR THE SILVER DETAIL TABLE

df_silver_details = df_silver_details.select(
    "transaction_id",
    "transaction_line_id",
    "transaction_ts",
    "store_id",
    "customer_id",
    "product_id",
    "cashier_id",
    "transaction_type",
    "quantity",
    "unit_price_recorded",
    "line_item_gross_amount",
    "promotion_code_applied"

)


display(df_silver_details.limit(10))

df_silver_details.write.format("delta").mode("overwrite").saveAsTable(silver_schema)

print(f"Successfully wrote DataFrame to Silver table: {silver_schema}")


StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 48, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, c70ba55f-c3d9-4bbe-8394-d36566572896)

Successfully wrote DataFrame to Silver table: silver


In [47]:
# defined earlier in your notebook
silver_schema = "silver"
print(f"Target Silver Schema: {silver_schema}")
print("-" * 30)

StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 49, Finished, Available, Finished)

Target Silver Schema: silver
------------------------------


In [48]:
# Construct the full table name including the schema
silver_table_full_name = f"{silver_schema}.silver_customers"
print(f"Attempting to save df_silver_customers to: {silver_table_full_name}")

# Save the DataFrame
df_silver_customers.write.format("delta").mode("overwrite").saveAsTable(silver_table_full_name)

print(f"Successfully saved: {silver_table_full_name}")

StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 50, Finished, Available, Finished)

Attempting to save df_silver_customers to: silver.silver_customers
Successfully saved: silver.silver_customers


In [49]:
# Construct the full table name including the schema
silver_table_full_name = f"{silver_schema}.silver_stores"
print(f"Attempting to save df_silver_stores to: {silver_table_full_name}")

# Save the DataFrame
df_silver_stores.write.format("delta").mode("overwrite").saveAsTable(silver_table_full_name)

print(f"Successfully saved: {silver_table_full_name}")

StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 51, Finished, Available, Finished)

Attempting to save df_silver_stores to: silver.silver_stores
Successfully saved: silver.silver_stores


In [50]:
# Construct the full table name including the schema
silver_table_full_name = f"{silver_schema}.silver_products"
print(f"Attempting to save df_silver_products to: {silver_table_full_name}")

# Save the DataFrame
df_silver_products.write.format("delta").mode("overwrite").saveAsTable(silver_table_full_name)

print(f"Successfully saved: {silver_table_full_name}")

StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 52, Finished, Available, Finished)

Attempting to save df_silver_products to: silver.silver_products
Successfully saved: silver.silver_products


In [51]:
# Construct the full table name including the schema
silver_table_full_name = f"{silver_schema}.silver_promotions"
print(f"Attempting to save df_silver_promotions to: {silver_table_full_name}")

# Save the DataFrame
df_silver_promotions.write.format("delta").mode("overwrite").saveAsTable(silver_table_full_name)

print(f"Successfully saved: {silver_table_full_name}")

StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 53, Finished, Available, Finished)

Attempting to save df_silver_promotions to: silver.silver_promotions
Successfully saved: silver.silver_promotions


In [55]:
silver_table_full_name = f"{silver_schema}.silver_transaction_details"
print(f"Attempting to save df_silver_details to: {silver_table_full_name}")

df_silver_details.write.format("delta").mode("overwrite").saveAsTable(silver_table_full_name)

print(f"Successfully saved: {silver_table_full_name}")

StatementMeta(, 91006013-b9e9-409b-92cd-936490bae063, 57, Finished, Available, Finished)

Attempting to save df_silver_details to: silver.silver_transaction_details
Successfully saved: silver.silver_transaction_details
