In [0]:
# ============================================================================
# IMPORT REQUIRED LIBRARIES
# ============================================================================
# PySpark DataFrame transformations and Delta Lake merge operations
from pyspark.sql import functions as F
from delta.tables import DeltaTable

# ============================================================================
# LIBRARY USAGE:
# - F (functions): String/date manipulation, null handling, aggregations
# - DeltaTable: MERGE operations for upsert logic (update/insert existing/new records)
# ============================================================================

In [0]:
# ============================================================================
# IMPORT SHARED CONFIGURATION
# ============================================================================
# This command imports schema names from the setup utilities notebook
# Enables centralized configuration management
# %run /Workspace/Project1/1_setup_catalog/utilities
# 
# After running, these variables are available:
# - bronze_schema = "bronze"
# - silver_schema = "silver"  
# - gold_schema = "gold"
# ============================================================================

%run /Workspace/Project1/1_setup_catalog/utilities

In [0]:
# ============================================================================
# VERIFICATION: Schema names provided by the utilities notebook
# ============================================================================
# Prints the three layer schema names on one line, separated by spaces.
# Expected output: bronze silver gold

print(*[bronze_schema, silver_schema, gold_schema])

bronze silver gold


In [0]:
# ============================================================================
# CONFIGURE DATABRICKS WIDGETS (UI Parameters)
# ============================================================================
# Text widgets make the notebook parameterizable from the Databricks UI.
#
# Each entry is (widget name, default value, UI label):
# - catalog:     target catalog name
# - data_source: data source name; used to build table names and file paths
# ============================================================================

for widget_name, widget_default, widget_label in (
    ("catalog", "fmcg", "Catalog"),
    ("data_source", "customers", "Data Source"),
):
    dbutils.widgets.text(widget_name, widget_default, widget_label)

In [0]:
# ============================================================================
# RETRIEVE WIDGET VALUES
# ============================================================================
# Read the current widget values (the defaults apply when nothing was entered)
# and echo them so the run log shows which catalog / source was processed.

catalog, data_source = (
    dbutils.widgets.get(widget_name) for widget_name in ("catalog", "data_source")
)

print(catalog, data_source)

fmcg customers


In [0]:
# ============================================================================
# CONFIGURE AZURE ADLS Gen2 PATHS
# ============================================================================
# Build the glob of CSV files to ingest from cloud storage.
#
# Path structure: abfss://<container>@<storage_account>.dfs.core.windows.net/<path>
# - container:       "conatiner-de-practice" (spelled exactly as in the URI below)
# - storage_account: "adlsgen2narayan"
# - path:            "<data_source>/*.csv" (every CSV in the source's folder)
# ============================================================================

storage_root = "abfss://conatiner-de-practice@adlsgen2narayan.dfs.core.windows.net/"
base_path = f"{storage_root}{data_source}/*.csv"

## 🟠 BRONZE LAYER - Raw Data Ingestion

**Purpose:** Load raw customer data from ADLS with minimal transformation  
**Update Pattern:** Overwrite (the table is fully replaced on each run)  
**Key Characteristics:** Full lineage tracking with metadata columns

### What Happens Here:
1. Read CSV files from landing directory
2. Add metadata (timestamp, file name, file size)  
3. Save to `fmcg.bronze.customers` with Change Data Feed enabled

In [0]:
# ============================================================================
# BRONZE LAYER: READ RAW CUSTOMER DATA
# ============================================================================
# Load every CSV matched by base_path and attach lineage columns.
#
# - header=True:      first row supplies the column names
# - inferSchema=True: Spark infers column types from the data
# - read_timestamp:   time of this read, for the audit trail
# - file_name / file_size: taken from the hidden _metadata column
# ============================================================================

csv_reader = spark.read.format("csv").options(header=True, inferSchema=True)

df = (
    csv_reader.load(base_path)
    .withColumn("read_timestamp", F.current_timestamp())
    .select("*", "_metadata.file_name", "_metadata.file_size")
)

# Preview the first 10 rows
display(df.limit(10))

customer_id,customer_name,city,read_timestamp,file_name,file_size
789201,FitFuel Market,Bengaluru,2025-11-30T10:46:29.541Z,customers.csv,1404
789202,FitFuel Market,Hyderabad,2025-11-30T10:46:29.541Z,customers.csv,1404
789203,FitFuel Market,New Delhi,2025-11-30T10:46:29.541Z,customers.csv,1404
789301,Athlete's Choice Store,Bengaluru,2025-11-30T10:46:29.541Z,customers.csv,1404
789303,Athlete's Choice Store,New Delhi,2025-11-30T10:46:29.541Z,customers.csv,1404
789101,Endurance Foods,Bengalore,2025-11-30T10:46:29.541Z,customers.csv,1404
789102,Endurance Foods,Hyderabad,2025-11-30T10:46:29.541Z,customers.csv,1404
789103,Endurance Foods,New Delhi,2025-11-30T10:46:29.541Z,customers.csv,1404
789121,HydroBoost Nutrition,Hyderabad,2025-11-30T10:46:29.541Z,customers.csv,1404
789122,HydroBoost Nutrition,New Delhi,2025-11-30T10:46:29.541Z,customers.csv,1404


In [0]:
# ============================================================================
# BRONZE LAYER: WRITE TO DELTA TABLE
# ============================================================================
# Persist the raw rows to <catalog>.<bronze_schema>.<data_source>.
#
# - format delta
# - delta.enableChangeDataFeed = true
# - mode overwrite: the table content is replaced on every run
# ============================================================================

bronze_table = f"{catalog}.{bronze_schema}.{data_source}"

(
    df.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
    .saveAsTable(bronze_table)
)

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- read_timestamp: timestamp (nullable = false)
 |-- file_name: string (nullable = false)
 |-- file_size: long (nullable = false)



In [0]:
# ============================================================================
# VERIFICATION: Inspect the inferred schema
# ============================================================================
# Column names and types come from inferSchema plus the added lineage columns;
# check them here to catch schema-inference surprises early.

df.printSchema()

## 🟡 SILVER LAYER - Data Cleaning & Standardization

**Purpose:** Create single source of truth with clean, deduplicated, standardized data  
**Update Pattern:** Overwrite (the table is fully replaced on each run)  
**Key Operations:**
- ✅ Deduplication by customer_id
- ✅ Text cleaning (trim whitespace)
- ✅ City name standardization
- ✅ Business rule application
- ✅ Proper case formatting

### Data Quality Steps:
1. Load data from Bronze layer
2. Remove duplicate customers
3. Clean text fields
4. Standardize city names
5. Apply business team corrections
6. Format for analytics use

In [0]:
# ============================================================================
# SILVER LAYER: LOAD DATA FROM BRONZE
# ============================================================================
# Pull the raw customer rows back out of the bronze table as the starting
# point for cleaning.

bronze_query = "SELECT * FROM {}.{}.{};".format(catalog, bronze_schema, data_source)
df_bronze = spark.sql(bronze_query)

# Preview the first 10 rows
display(df_bronze.limit(10))

customer_id,customer_name,city,read_timestamp,file_name,file_size
789201,FitFuel Market,Bengaluru,2025-11-30T11:09:11.602Z,customers.csv,1404
789202,FitFuel Market,Hyderabad,2025-11-30T11:09:11.602Z,customers.csv,1404
789203,FitFuel Market,New Delhi,2025-11-30T11:09:11.602Z,customers.csv,1404
789301,Athlete's Choice Store,Bengaluru,2025-11-30T11:09:11.602Z,customers.csv,1404
789303,Athlete's Choice Store,New Delhi,2025-11-30T11:09:11.602Z,customers.csv,1404
789101,Endurance Foods,Bengalore,2025-11-30T11:09:11.602Z,customers.csv,1404
789102,Endurance Foods,Hyderabad,2025-11-30T11:09:11.602Z,customers.csv,1404
789103,Endurance Foods,New Delhi,2025-11-30T11:09:11.602Z,customers.csv,1404
789121,HydroBoost Nutrition,Hyderabad,2025-11-30T11:09:11.602Z,customers.csv,1404
789122,HydroBoost Nutrition,New Delhi,2025-11-30T11:09:11.602Z,customers.csv,1404


In [0]:
# ============================================================================
# DATA QUALITY STEP 1: FIND AND REMOVE DUPLICATE CUSTOMERS
# ============================================================================
# Business Rule: Each customer_id should appear only once
#
# Steps:
# 1. List customer_ids that occur more than once (for audit)
# 2. Keep a single row per customer_id
#
# Note: dropDuplicates keeps ONE row per key but does not guarantee which one,
# so "first occurrence" must not be assumed.
# ============================================================================

# customer_ids that occur more than once, with their occurrence counts
df_duplicates = df_bronze.groupBy("customer_id").count().filter(F.col("count") > 1)
display(df_duplicates)

# These are total ROW counts before/after de-duplication (not duplicate counts);
# the difference is the number of rows removed.
print("row count before dedup: ", df_bronze.count())
df_silver = df_bronze.dropDuplicates(["customer_id"])
print("row count after dedup: ", df_silver.count())

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- read_timestamp: timestamp (nullable = true)
 |-- file_name: string (nullable = true)
 |-- file_size: long (nullable = true)



In [0]:
# ============================================================================
# DATA QUALITY STEP 2: STRIP LEADING/TRAILING WHITESPACE FROM NAMES
# ============================================================================
# Show the offending rows, trim customer_name, then show again to confirm
# nothing is left (the second display should be empty).
# ============================================================================

# True for rows whose name changes when trimmed, i.e. it has padding
name_has_padding = F.col("customer_name") != F.trim(F.col("customer_name"))

# Before: rows with padded names
display(df_silver.filter(name_has_padding))

# Trim customer_name in place
df_silver = df_silver.withColumn("customer_name", F.trim(F.col("customer_name")))

# After: same predicate evaluated against the trimmed column
display(df_silver.filter(name_has_padding))

customer_id,count
789321,2
789503,2
789522,2
789603,2


In [0]:
# ============================================================================
# DATA QUALITY STEP 3: LIST DISTINCT CITY VALUES
# ============================================================================
# The same city shows up under several spellings (e.g. "Bengaluruu",
# "Bengalore"). Listing the distinct values exposes the typos that the
# standardization mapping in the next step has to cover.
# ============================================================================

df_silver.select(F.col("city")).distinct().show()

duplicate count before delete:  39
duplicate count after delete:  35


In [0]:
# ============================================================================
# DATA QUALITY STEP 4: CREATE CITY STANDARDIZATION MAPPING
# ============================================================================
# Purpose: Correct known misspellings and format variations
#
# Mapping Rules:
# - Bengaluruu, Bengalore        → Bengaluru
# - Hyderabadd                   → Hyderabad
# - NewDelhi, NewDheli, NewDelhee → New Delhi
#
# The city column is trimmed BEFORE the lookup, so padded values such as
# "Hyderabad " or "NewDheli " still resolve. The mapping keys are therefore
# written without surrounding whitespace (keys with a trailing space would
# never match a trimmed or unpadded value).
#
# Allowed Cities: ['Bengaluru', 'Hyderabad', 'New Delhi']
# Any other value (including NULL) becomes NULL for manual review.
#
# NOTE(review): "Hyderbad" is deliberately left unmapped here; the row that
# carries it (customer 789421) is resolved by the business-approved
# per-customer corrections applied in a later step.
# ============================================================================

city_mapping = {
    "Bengaluruu": "Bengaluru",
    "Bengalore": "Bengaluru",

    "Hyderabadd": "Hyderabad",

    "NewDelhi": "New Delhi",
    "NewDheli": "New Delhi",
    "NewDelhee": "New Delhi",
}

allowed_cities = ["Bengaluru", "Hyderabad", "New Delhi"]

df_silver = (
    df_silver
    # Normalize padding first so the exact-match lookup below can hit
    .withColumn("city", F.trim(F.col("city")))
    # Apply mapping replacements
    .replace(city_mapping, subset=["city"])
    # Keep only allowed cities; when() without otherwise() yields NULL for
    # everything else, and a NULL city stays NULL
    .withColumn(
        "city",
        F.when(F.col("city").isin(allowed_cities), F.col("city")),
    )
)

# Verify standardization was successful
df_silver.select("city").distinct().show()

customer_id,customer_name,city,read_timestamp,file_name,file_size
789121,HydroBoost Nutrition,Hyderabad,2025-11-30T11:09:11.602Z,customers.csv,1404
789401,SprintX nutrition,Bengaluru,2025-11-30T11:09:11.602Z,customers.csv,1404
789420,ZenAthlete foods,,2025-11-30T11:09:11.602Z,customers.csv,1404
789421,ZenAthlete Foods,Hyderbad,2025-11-30T11:09:11.602Z,customers.csv,1404
789521,PrimeFuel Nutrition,,2025-11-30T11:09:11.602Z,customers.csv,1404
789702,StaminaX Store,Hyderabad,2025-11-30T11:09:11.602Z,customers.csv,1404


In [0]:
# ============================================================================
# DATA QUALITY STEP 5: STANDARDIZE CUSTOMER NAME CASING
# ============================================================================
# The same customer appears with different casing, so names are normalized
# with initcap: first letter of each word upper-case, the rest lower-case.
#
# Example: "JOHN SMITH" → "John Smith", "john smith" → "John Smith"
# Side effect: inner capitals are lowered as well ("SprintX" → "Sprintx").
# ============================================================================

# Before: distinct names as they arrive
df_silver.select("customer_name").distinct().show()

# initcap returns NULL for NULL input, so NULL names stay NULL
proper_case_name = F.initcap(F.col("customer_name"))
df_silver = df_silver.withColumn("customer_name", proper_case_name)

# After: distinct names in proper case
df_silver.select("customer_name").distinct().show()

In [0]:
# ============================================================================
# DATA QUALITY STEP 6: IDENTIFY NULL CITIES FOR BUSINESS CORRECTION
# ============================================================================
# Rows whose city is NULL after standardization (originally missing, or not
# in the allowed list) need a city supplied by the business team.
# ============================================================================

customers_without_city = df_silver.filter(F.col("city").isNull())

# Full rows with a NULL city
customers_without_city.show(truncate=False)

# Distinct customer names among those rows
customers_without_city.select("customer_name").distinct().show(truncate=False)

customer_id,customer_name,city,read_timestamp,file_name,file_size


In [0]:
# ============================================================================
# DATA QUALITY STEP 7: BUILD BUSINESS TEAM-APPROVED CITY CORRECTIONS
# ============================================================================
# Note: These corrections were validated by the business team
#
# Corrections by customer_id:
# - Sprintx Nutrition:   789403 → New Delhi
# - Zenathlete Foods:    789420 → Bengaluru, 789421 → New Delhi, 789422 → Hyderabad
# - Primefuel Nutrition: 789521 → Hyderabad, 789522 → New Delhi
# - Recovery Lane:       789603 → Hyderabad
# ============================================================================

# Show every row of the customers that need corrections
null_customer_names = [
    "Sprintx Nutrition",
    "Zenathlete Foods",
    "Primefuel Nutrition",
    "Recovery Lane",
]
needs_correction = F.col("customer_name").isin(null_customer_names)
df_silver.filter(needs_correction).show(truncate=False)

# customer_id → corrected city (insertion order kept as originally entered)
customer_city_fix = {
    789403: "New Delhi",   # Sprintx Nutrition
    789420: "Bengaluru",   # Zenathlete Foods
    789521: "Hyderabad",   # Primefuel Nutrition
    789603: "Hyderabad",   # Recovery Lane
    789421: "New Delhi",   # Zenathlete Foods (additional location)
    789422: "Hyderabad",   # Zenathlete Foods (additional location)
    789522: "New Delhi",   # Primefuel Nutrition (additional location)
}

# Turn the lookup into a two-column DataFrame so it can be joined
df_fix = spark.createDataFrame(
    list(customer_city_fix.items()),
    ["customer_id", "fixed_city"],
)

display(df_fix)

+----------+
|      city|
+----------+
| Bengaluru|
| Hyderabad|
| New Delhi|
| Bengalore|
|Hyderabadd|
|      NULL|
|  Hyderbad|
| NewDelhee|
|  NewDelhi|
|Bengaluruu|
|  NewDheli|
+----------+



In [0]:
# ============================================================================
# DATA QUALITY STEP 8: APPLY BUSINESS CORRECTIONS
# ============================================================================
# Left-join the corrections on customer_id and fill only the gaps:
# - a populated city is kept as is
# - a NULL city takes fixed_city from the correction table (if one exists)
# - the helper column fixed_city is dropped afterwards
# ============================================================================

# Every silver row survives the left join; fixed_city is NULL where no
# correction exists
df_with_fix = df_silver.join(df_fix, on="customer_id", how="left")

# coalesce returns the first non-NULL value, so the existing city wins
resolved_city = F.coalesce(F.col("city"), F.col("fixed_city"))

df_silver = df_with_fix.withColumn("city", resolved_city).drop("fixed_city")

display(df_silver)

+---------+
|     city|
+---------+
|Bengaluru|
|Hyderabad|
|New Delhi|
|     NULL|
+---------+



In [0]:
# ============================================================================
# VALIDATION: CONFIRM ALL NULL CITIES HAVE BEEN RESOLVED
# ============================================================================
# Lists customer names that still have no city; an empty result means every
# gap was filled.

still_missing_city = df_silver.filter(F.col("city").isNull())
still_missing_city.select("customer_name").distinct().show(truncate=False)

+--------------------+
|       customer_name|
+--------------------+
|      FitFuel Market|
|Athlete's Choice ...|
|     Endurance Foods|
|HydroBoost Nutrition|
|MacroBite Superfoods|
|MacroBite superfoods|
|      PowerSnack Hub|
|      PowerSnack hub|
|   SprintX nutrition|
|   SprintX Nutrition|
|    ZenAthlete foods|
|    ZenAthlete Foods|
|Peak performance ...|
|Peak Performance ...|
| PrimeFuel Nutrition|
|       Recovery Lane|
|      StaminaX Store|
|EliteAthlete Nutr...|
|      GamePlan Foods|
|   Champion's choice|
+--------------------+
only showing top 20 rows


In [0]:
# ============================================================================
# DATA QUALITY STEP 9: CAST DATA TYPES
# ============================================================================
# Ensure consistent data types for downstream processing
# customer_id: Cast to string for consistent handling in SQL joins

df_silver = df_silver.withColumn("customer_id", F.col("customer_id").cast("string"))

# printSchema() writes the schema itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
df_silver.printSchema()

+--------------------+
|       customer_name|
+--------------------+
|      Fitfuel Market|
|Athlete's Choice ...|
|     Endurance Foods|
|Hydroboost Nutrition|
|Macrobite Superfoods|
|      Powersnack Hub|
|   Sprintx Nutrition|
|    Zenathlete Foods|
|Peak Performance ...|
| Primefuel Nutrition|
|       Recovery Lane|
|      Staminax Store|
|Eliteathlete Nutr...|
|      Gameplan Foods|
|   Champion's Choice|
+--------------------+



In [0]:
# ============================================================================
# DATA QUALITY STEP 10: CREATE BUSINESS ATTRIBUTES
# ============================================================================
# Adds one derived column and three constant columns:
# - customer: "<customer_name>-<city>", with "Unknown" when city is NULL
# - market:   constant "India"
# - platform: constant platform label
# - channel:  constant channel label
#
# NOTE(review): the literals "Sprorts Bar" and "Acquistion" look like
# misspellings of "Sports Bar" / "Acquisition". They are kept unchanged here
# because downstream tables may already hold these exact values - confirm
# with the owners of the parent dimension before correcting them.
# ============================================================================

city_or_unknown = F.coalesce(F.col("city"), F.lit("Unknown"))
customer_label = F.concat_ws("-", F.col("customer_name"), city_or_unknown)

df_silver = df_silver.withColumn("customer", customer_label)
df_silver = df_silver.withColumn("market", F.lit("India"))
df_silver = df_silver.withColumn("platform", F.lit("Sprorts Bar"))
df_silver = df_silver.withColumn("channel", F.lit("Acquistion"))

display(df_silver.limit(10))

+-----------+-------------------+----+--------------------------+-------------+---------+
|customer_id|customer_name      |city|read_timestamp            |file_name    |file_size|
+-----------+-------------------+----+--------------------------+-------------+---------+
|789403     |Sprintx Nutrition  |NULL|2025-11-30 11:09:11.602043|customers.csv|1404     |
|789420     |Zenathlete Foods   |NULL|2025-11-30 11:09:11.602043|customers.csv|1404     |
|789421     |Zenathlete Foods   |NULL|2025-11-30 11:09:11.602043|customers.csv|1404     |
|789422     |Zenathlete Foods   |NULL|2025-11-30 11:09:11.602043|customers.csv|1404     |
|789521     |Primefuel Nutrition|NULL|2025-11-30 11:09:11.602043|customers.csv|1404     |
|789522     |Primefuel Nutrition|NULL|2025-11-30 11:09:11.602043|customers.csv|1404     |
|789603     |Recovery Lane      |NULL|2025-11-30 11:09:11.602043|customers.csv|1404     |
+-----------+-------------------+----+--------------------------+-------------+---------+



In [0]:
# ============================================================================
# SILVER LAYER: WRITE TRANSFORMED DATA TO DELTA TABLE
# ============================================================================
# Persist the cleaned customers to <catalog>.<silver_schema>.<data_source>.
#
# - format delta
# - delta.enableChangeDataFeed = true
# - mergeSchema = true: new DataFrame columns are added to the table schema
# - mode overwrite: the table content is replaced on every run
# ============================================================================

silver_table = f"{catalog}.{silver_schema}.{data_source}"

(
    df_silver.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable(silver_table)
)

+-------------------+
|customer_name      |
+-------------------+
|Sprintx Nutrition  |
|Zenathlete Foods   |
|Primefuel Nutrition|
|Recovery Lane      |
+-------------------+



In [0]:
# ============================================================================
# GOLD LAYER: ANALYTICS-READY CUSTOMER DIMENSION
# ============================================================================
# Select essential columns and create staging table for final merge
# 
# Purpose: 
# - Prepare data for merge with parent company customer master
# - Include only business-ready columns
# - Maintain referential integrity with composite key
# ============================================================================

+-----------+-------------------+---------+--------------------------+-------------+---------+
|customer_id|customer_name      |city     |read_timestamp            |file_name    |file_size|
+-----------+-------------------+---------+--------------------------+-------------+---------+
|789401     |Sprintx Nutrition  |Bengaluru|2025-11-30 11:09:11.602043|customers.csv|1404     |
|789402     |Sprintx Nutrition  |Hyderabad|2025-11-30 11:09:11.602043|customers.csv|1404     |
|789403     |Sprintx Nutrition  |NULL     |2025-11-30 11:09:11.602043|customers.csv|1404     |
|789420     |Zenathlete Foods   |NULL     |2025-11-30 11:09:11.602043|customers.csv|1404     |
|789421     |Zenathlete Foods   |NULL     |2025-11-30 11:09:11.602043|customers.csv|1404     |
|789422     |Zenathlete Foods   |NULL     |2025-11-30 11:09:11.602043|customers.csv|1404     |
|789520     |Primefuel Nutrition|Bengaluru|2025-11-30 11:09:11.602043|customers.csv|1404     |
|789521     |Primefuel Nutrition|NULL     |2025-11

In [0]:
# ============================================================================
# GOLD LAYER: SELECT AND STRUCTURE DATA FOR ANALYTICS
# ============================================================================
# Read the cleaned Silver table and keep only the columns the Gold layer needs.

silver_query = f"SELECT * FROM {catalog}.{silver_schema}.{data_source};"
df_silver = spark.sql(silver_query)

# The Silver step names the platform column "platform", while earlier table
# versions (and the staging-table reader in the merge step) use the legacy
# misspelling "patform". Accept whichever exists; if both exist (possible
# after a mergeSchema overwrite of an older table), take the first non-NULL.
platform_sources = [
    F.col(name) for name in ("platform", "patform") if name in df_silver.columns
]
if not platform_sources:
    raise ValueError(
        "Silver table has neither a 'platform' nor a 'patform' column"
    )

df_gold = df_silver.select(
    "customer_id",
    "customer_name",
    "city",
    "customer",       # Composite key: "CustomerName-City"
    "market",
    # Emitted under the legacy name because the merge step reads "patform"
    # from the staging table; rename both together in a future version.
    F.coalesce(*platform_sources).alias("patform"),
    "channel",
)

display(df_gold.limit(10))

customer_id,fixed_city
789403,New Delhi
789420,Bengaluru
789521,Hyderabad
789603,Hyderabad
789421,New Delhi
789422,Hyderabad
789522,New Delhi


In [0]:
# ============================================================================
# GOLD LAYER: CREATE STAGING TABLE
# ============================================================================
# Persist the SportsBar customer dimension to an intermediate table,
# <catalog>.<gold_schema>.sb_dim_<data_source>, which is later merged into
# the parent company master data.
# ============================================================================

gold_staging_table = f"{catalog}.{gold_schema}.sb_dim_{data_source}"

(
    df_gold.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable(gold_staging_table)
)

customer_id,customer_name,city,read_timestamp,file_name,file_size
789101,Endurance Foods,Bengaluru,2025-11-30T11:09:11.602Z,customers.csv,1404
789102,Endurance Foods,Hyderabad,2025-11-30T11:09:11.602Z,customers.csv,1404
789103,Endurance Foods,New Delhi,2025-11-30T11:09:11.602Z,customers.csv,1404
789121,Hydroboost Nutrition,Hyderabad,2025-11-30T11:09:11.602Z,customers.csv,1404
789122,Hydroboost Nutrition,New Delhi,2025-11-30T11:09:11.602Z,customers.csv,1404
789201,Fitfuel Market,Bengaluru,2025-11-30T11:09:11.602Z,customers.csv,1404
789202,Fitfuel Market,Hyderabad,2025-11-30T11:09:11.602Z,customers.csv,1404
789203,Fitfuel Market,New Delhi,2025-11-30T11:09:11.602Z,customers.csv,1404
789220,Macrobite Superfoods,Bengaluru,2025-11-30T11:09:11.602Z,customers.csv,1404
789221,Macrobite Superfoods,Hyderabad,2025-11-30T11:09:11.602Z,customers.csv,1404


In [0]:
# ============================================================================
# MERGE WITH PARENT COMPANY MASTER DATA
# ============================================================================
# Integrate SportsBar customers into parent company's dim_customers table
# 
# Purpose:
# - Single source of truth across all brands
# - Unified customer dimension for group-level analytics
# - Track which customers belong to which brands
# ============================================================================

+-------------+
|customer_name|
+-------------+
+-------------+



In [0]:
# ============================================================================
# DELTA MERGE OPERATION: UPSERT INTO PARENT CUSTOMER DIMENSION
# ============================================================================
# Merge Strategy:
# - Target: <catalog>.<gold_schema>.dim_<data_source>    (parent customer master)
# - Source: <catalog>.<gold_schema>.sb_dim_<data_source> (SportsBar customers)
# - Match Condition: customer_code
#
# Actions:
# - whenMatched: update all columns of the existing customer
# - whenNotMatched: insert the new customer
#
# Table names are built from the notebook parameters (same pattern as the
# staging-table write and the final verification query) instead of being
# hard-coded; with the default widget values they resolve to
# fmcg.gold.dim_customers and fmcg.gold.sb_dim_customers.
# ============================================================================

parent_table_name = f"{catalog}.{gold_schema}.dim_{data_source}"
staging_table_name = f"{catalog}.{gold_schema}.sb_dim_{data_source}"

# Load parent company customer dimension
delta_table = DeltaTable.forName(spark, parent_table_name)

df_staging = spark.table(staging_table_name)

# The staging table may carry the platform under "platform" or the legacy
# misspelling "patform" (or both after a schema merge); use whichever has data.
platform_sources = [
    F.col(name) for name in ("platform", "patform") if name in df_staging.columns
]
if not platform_sources:
    raise ValueError(
        f"{staging_table_name} has neither a 'platform' nor a 'patform' column"
    )

# Rename columns to the names used by the parent dimension
df_child_customers = df_staging.select(
    F.col("customer_id").alias("customer_code"),
    "customer",
    "market",
    F.coalesce(*platform_sources).alias("platform"),
    "channel",
)

display(df_child_customers)

# Execute MERGE operation
(
    delta_table.alias("target")
    .merge(
        source=df_child_customers.alias("source"),
        condition="target.customer_code = source.customer_code",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

root
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- read_timestamp: timestamp (nullable = true)
 |-- file_name: string (nullable = true)
 |-- file_size: long (nullable = true)

None


In [0]:
# ============================================================================
# VERIFICATION: DISPLAY FINAL MERGED DATA
# ============================================================================
# Read back the parent customer dimension after the merge; it should hold the
# pre-existing customers plus the SportsBar customers just merged in.

gold_query = "SELECT * FROM {}.{}.dim_{};".format(catalog, gold_schema, data_source)
df_gold = spark.sql(gold_query)
display(df_gold)

customer_id,customer_name,city,read_timestamp,file_name,file_size,customer,market,patform,channel
789503,Peak Performance Store,New Delhi,2025-11-30T11:09:11.602Z,customers.csv,1404,Peak Performance Store-New Delhi,India,Sprorts Bar,Acquistion
789420,Zenathlete Foods,Bengaluru,2025-11-30T11:09:11.602Z,customers.csv,1404,Zenathlete Foods-Bengaluru,India,Sprorts Bar,Acquistion
789703,Staminax Store,New Delhi,2025-11-30T11:09:11.602Z,customers.csv,1404,Staminax Store-New Delhi,India,Sprorts Bar,Acquistion
789621,Eliteathlete Nutrition,Hyderabad,2025-11-30T11:09:11.602Z,customers.csv,1404,Eliteathlete Nutrition-Hyderabad,India,Sprorts Bar,Acquistion
789101,Endurance Foods,Bengaluru,2025-11-30T11:09:11.602Z,customers.csv,1404,Endurance Foods-Bengaluru,India,Sprorts Bar,Acquistion
789220,Macrobite Superfoods,Bengaluru,2025-11-30T11:09:11.602Z,customers.csv,1404,Macrobite Superfoods-Bengaluru,India,Sprorts Bar,Acquistion
789720,Gameplan Foods,Bengaluru,2025-11-30T11:09:11.602Z,customers.csv,1404,Gameplan Foods-Bengaluru,India,Sprorts Bar,Acquistion
789601,Recovery Lane,Bengaluru,2025-11-30T11:09:11.602Z,customers.csv,1404,Recovery Lane-Bengaluru,India,Sprorts Bar,Acquistion
789122,Hydroboost Nutrition,New Delhi,2025-11-30T11:09:11.602Z,customers.csv,1404,Hydroboost Nutrition-New Delhi,India,Sprorts Bar,Acquistion
789402,Sprintx Nutrition,Hyderabad,2025-11-30T11:09:11.602Z,customers.csv,1404,Sprintx Nutrition-Hyderabad,India,Sprorts Bar,Acquistion


In [0]:
# Write the transformed customer data into the Silver layer.
# NOTE(review): this repeats the Silver write performed earlier in the
# notebook - confirm whether this cell is still needed.

silver_table = f"{catalog}.{silver_schema}.{data_source}"

(
    df_silver.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable(silver_table)
)

## GOLD Layer Data Processing

In [0]:
# Read the Silver table and keep only the columns required by the Gold layer.
silver_query = f"SELECT * FROM {catalog}.{silver_schema}.{data_source};"
df_silver = spark.sql(silver_query)

# The platform column may be named "platform" or carry the legacy misspelling
# "patform" (or both after a schema merge); use whichever holds data.
platform_sources = [
    F.col(name) for name in ("platform", "patform") if name in df_silver.columns
]
if not platform_sources:
    raise ValueError(
        "Silver table has neither a 'platform' nor a 'patform' column"
    )

df_gold = df_silver.select(
    "customer_id",
    "customer_name",
    "city",
    "customer",
    "market",
    # Emitted under the legacy name because the merge step reads "patform"
    # from the staging table.
    F.coalesce(*platform_sources).alias("patform"),
    "channel",
)

In [0]:
# Preview the first 10 rows of the Gold selection
display(df_gold.limit(10))

customer_id,customer_name,city,customer,market,patform,channel
789503,Peak Performance Store,New Delhi,Peak Performance Store-New Delhi,India,Sprorts Bar,Acquistion
789420,Zenathlete Foods,Bengaluru,Zenathlete Foods-Bengaluru,India,Sprorts Bar,Acquistion
789703,Staminax Store,New Delhi,Staminax Store-New Delhi,India,Sprorts Bar,Acquistion
789621,Eliteathlete Nutrition,Hyderabad,Eliteathlete Nutrition-Hyderabad,India,Sprorts Bar,Acquistion
789101,Endurance Foods,Bengaluru,Endurance Foods-Bengaluru,India,Sprorts Bar,Acquistion
789220,Macrobite Superfoods,Bengaluru,Macrobite Superfoods-Bengaluru,India,Sprorts Bar,Acquistion
789720,Gameplan Foods,Bengaluru,Gameplan Foods-Bengaluru,India,Sprorts Bar,Acquistion
789601,Recovery Lane,Bengaluru,Recovery Lane-Bengaluru,India,Sprorts Bar,Acquistion
789122,Hydroboost Nutrition,New Delhi,Hydroboost Nutrition-New Delhi,India,Sprorts Bar,Acquistion
789402,Sprintx Nutrition,Hyderabad,Sprintx Nutrition-Hyderabad,India,Sprorts Bar,Acquistion


In [0]:
# Create the staging table <catalog>.<gold_schema>.sb_dim_<data_source> in the
# Gold layer (replaced on every run).

gold_staging_table = f"{catalog}.{gold_schema}.sb_dim_{data_source}"

(
    df_gold.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable(gold_staging_table)
)

### Merging Data source with parent

In [0]:
# Table names follow the notebook parameters (same pattern as the staging
# write and the verification query); with the default widget values they
# resolve to fmcg.gold.dim_customers and fmcg.gold.sb_dim_customers.
parent_table_name = f"{catalog}.{gold_schema}.dim_{data_source}"
staging_table_name = f"{catalog}.{gold_schema}.sb_dim_{data_source}"

# Merge target: parent company customer dimension
delta_table = DeltaTable.forName(spark, parent_table_name)

df_staging = spark.table(staging_table_name)

# The staging table may carry the platform under "platform" or the legacy
# misspelling "patform" (or both after a schema merge); use whichever has data.
platform_sources = [
    F.col(name) for name in ("platform", "patform") if name in df_staging.columns
]
if not platform_sources:
    raise ValueError(
        f"{staging_table_name} has neither a 'platform' nor a 'patform' column"
    )

# Merge source: staging rows renamed to the parent dimension's column names
df_child_customers = df_staging.select(
    F.col("customer_id").alias("customer_code"),
    "customer",
    "market",
    F.coalesce(*platform_sources).alias("platform"),
    "channel",
)

In [0]:
# Inspect the rows that will be merged into the parent dimension
display(df_child_customers)

customer_code,customer,market,platform,channel
789503,Peak Performance Store-New Delhi,India,Sprorts Bar,Acquistion
789420,Zenathlete Foods-Bengaluru,India,Sprorts Bar,Acquistion
789703,Staminax Store-New Delhi,India,Sprorts Bar,Acquistion
789621,Eliteathlete Nutrition-Hyderabad,India,Sprorts Bar,Acquistion
789101,Endurance Foods-Bengaluru,India,Sprorts Bar,Acquistion
789220,Macrobite Superfoods-Bengaluru,India,Sprorts Bar,Acquistion
789720,Gameplan Foods-Bengaluru,India,Sprorts Bar,Acquistion
789601,Recovery Lane-Bengaluru,India,Sprorts Bar,Acquistion
789122,Hydroboost Nutrition-New Delhi,India,Sprorts Bar,Acquistion
789402,Sprintx Nutrition-Hyderabad,India,Sprorts Bar,Acquistion


In [0]:
# Upsert on customer_code: matched rows are fully updated, unmatched source
# rows are inserted.
merge_builder = delta_table.alias("target").merge(
    source=df_child_customers.alias("source"),
    condition="target.customer_code = source.customer_code",
)
merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Read back the parent customer dimension to review the merge result
gold_query = "SELECT * FROM {}.{}.dim_{};".format(catalog, gold_schema, data_source)
df_gold = spark.sql(gold_query)
display(df_gold)

customer_code,customer,market,platform,channel
789503,Peak Performance Store-New Delhi,India,Sprorts Bar,Acquistion
789420,Zenathlete Foods-Bengaluru,India,Sprorts Bar,Acquistion
789703,Staminax Store-New Delhi,India,Sprorts Bar,Acquistion
789621,Eliteathlete Nutrition-Hyderabad,India,Sprorts Bar,Acquistion
789101,Endurance Foods-Bengaluru,India,Sprorts Bar,Acquistion
789220,Macrobite Superfoods-Bengaluru,India,Sprorts Bar,Acquistion
789720,Gameplan Foods-Bengaluru,India,Sprorts Bar,Acquistion
789601,Recovery Lane-Bengaluru,India,Sprorts Bar,Acquistion
789122,Hydroboost Nutrition-New Delhi,India,Sprorts Bar,Acquistion
789402,Sprintx Nutrition-Hyderabad,India,Sprorts Bar,Acquistion
