## 📚 Step 1: Import Required Libraries
We use PySpark SQL functions for date manipulation and column operations.

In [0]:
# ============================================================================
# IMPORT REQUIRED LIBRARIES
# ============================================================================
# Standard library - For validating the configured date range before it reaches Spark
from datetime import datetime

# PySpark SQL Functions - For date manipulation, column operations, and formatting
from pyspark.sql import functions as F

# ============================================================================
# LIBRARY EXPLANATION:
# - F.date_format() : Format dates to specific patterns (e.g., 'MMMM' for month name)
# - F.year(), F.quarter() : Extract date components
# - F.concat() : Combine columns into composite values
# - F.lit() : Create literal constant columns
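# - F.col() : Reference an existing column by name
# - sequence() : Spark SQL function (called through spark.sql, not F) that generates the array of month dates
# - explode() : Spark SQL function (called through spark.sql, not F) that transforms the array into individual rows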
# ============================================================================

## ⚙️ Step 2: Define Date Range Configuration

Define the start and end dates for the dimension table. Adjust these values based on:
- Historical data requirements
- Future projection needs
- Business reporting calendar

In [0]:
# ============================================================================
# DATE RANGE CONFIGURATION
# ============================================================================
# Define the time period for the dimension table
# Adjust start_date and end_date based on your data requirements

start_date = "2024-01-01"  # First month to include (inclusive)
end_date   = "2025-12-01"  # Last month to include (inclusive)

# Fail fast on a bad configuration: strptime raises ValueError for a string that
# is not a valid yyyy-MM-dd date, and the comparison rejects an inverted range.
# Without this check the problem would only surface later inside Spark, possibly
# as a NULL date that yields an empty dimension table
# (NOTE(review): depends on the cluster's ANSI setting - confirm).
if datetime.strptime(start_date, "%Y-%m-%d") > datetime.strptime(end_date, "%Y-%m-%d"):
    raise ValueError(
        f"start_date ({start_date}) must not be later than end_date ({end_date})"
    )

# ============================================================================
# CONFIGURATION GUIDANCE:
# - Start Date: Typically matches earliest transaction data
# - End Date: Set to cover full planning horizon (1-3 years ahead recommended)
# - Grain: Monthly (first day of each month)
# - To extend: Simply update these dates and re-run the notebook
# ============================================================================

In [0]:
# ============================================================================
# STEP 1: GENERATE MONTHLY DATE SEQUENCE
# ============================================================================
# Creates one row per month from start_date to end_date
# 
# Process Breakdown:
# 1. sequence() - Generates a sequence of dates from start to end (1 month intervals)
# 2. explode() - Transforms array of dates into individual rows (one per date)
# 3. select() - Selects and renames the result as 'month_start_date'
# ============================================================================

# Build the month spine: one row per month between the configured bounds.
# Both bounds are truncated to the first day of their month so that the
# month_start_date column really holds month starts even when start_date or
# end_date is configured as a mid-month day; for bounds that already fall on
# the 1st (the current configuration) the result is unchanged.
df = (
    spark.sql(f"""
        SELECT explode(
            sequence(
                trunc(to_date('{start_date}'), 'MM'),   -- first day of the start month
                trunc(to_date('{end_date}'), 'MM'),     -- first day of the end month (inclusive)
                interval 1 month                        -- Generate one date per month
            )
        ) AS month_start_date
    """)
)

# ============================================================================
# EXPLANATION:
# - sequence() creates: [2024-01-01, 2024-02-01, 2024-03-01, ...]
# - explode() transforms: [{2024-01-01}, {2024-02-01}, {2024-03-01}, ...]
# - Result: DataFrame with one row per month in date range
# ============================================================================

In [0]:
# ============================================================================
# STEP 3: DISPLAY SAMPLE DATA FOR VERIFICATION
# ============================================================================
# Review the generated date dimension data to ensure correctness
# Look for:
# - Correct date sequence (increments by 1 month)
# - Proper date formatting
# - Expected year/quarter/month values

# Render the current contents of df in the notebook's rich table view.
# NOTE(review): in top-to-bottom order this cell runs BEFORE the cell that adds
# date_key / year / month_name / quarter columns, so on a fresh "Run All" df
# holds only month_start_date at this point. The saved output of this cell shows
# the derived columns, which suggests it was executed after that cell -
# consider moving this cell below the derived-column cell.
display(df)

month_start_date,date_key,year,month_name,month_short_name,quarter,year_quarter
2024-01-01,202401,2024,January,Jan,Q1,2024-Q1
2024-02-01,202402,2024,February,Feb,Q1,2024-Q1
2024-03-01,202403,2024,March,Mar,Q1,2024-Q1
2024-04-01,202404,2024,April,Apr,Q2,2024-Q2
2024-05-01,202405,2024,May,May,Q2,2024-Q2
2024-06-01,202406,2024,June,Jun,Q2,2024-Q2
2024-07-01,202407,2024,July,Jul,Q3,2024-Q3
2024-08-01,202408,2024,August,Aug,Q3,2024-Q3
2024-09-01,202409,2024,September,Sep,Q3,2024-Q3
2024-10-01,202410,2024,October,Oct,Q4,2024-Q4


In [0]:
# ============================================================================
# STEP 2: CREATE ANALYTICAL COLUMNS FOR TIME-BASED REPORTING
# ============================================================================
# Add derived columns for common time-based analyses
# These columns enable efficient filtering and grouping in SQL queries
#
# Column Definitions:
# - date_key: Integer surrogate key (yyyyMM format) for efficient joins
# - year: Calendar year for year-over-year comparisons
# - month_name: Full month name for readable reports
# - month_short_name: 3-letter month abbreviation for compact displays
# - quarter: Calendar quarter label (Q1, Q2, Q3, Q4) for quarterly reports (derived with F.quarter(), so not fiscal-calendar aware)
# - year_quarter: Combined year and quarter for sorting/filtering
# ============================================================================

df = (
    df
    # Month-grain key stored as an integer, e.g. 202401 for January 2024
    .withColumn(
        "date_key",
        F.date_format(F.col("month_start_date"), "yyyyMM").cast("int"),
    )
    # Calendar year of the month
    .withColumn("year", F.year(F.col("month_start_date")))
    # Full month name (January, February, ...)
    .withColumn("month_name", F.date_format(F.col("month_start_date"), "MMMM"))
    # Three-letter month abbreviation (Jan, Feb, ...)
    .withColumn("month_short_name", F.date_format(F.col("month_start_date"), "MMM"))
    # Calendar quarter label (Q1..Q4); F.quarter() is based on the calendar month
    .withColumn("quarter", F.concat(F.lit("Q"), F.quarter(F.col("month_start_date"))))
    # Year plus quarter label (2024-Q1, 2024-Q2, ...), built from the two columns above
    .withColumn("year_quarter", F.concat(F.col("year"), F.lit("-"), F.col("quarter")))
)

# ============================================================================
# EXAMPLES OF DERIVED COLUMNS:
# month_start_date | date_key | year | month_name | quarter | year_quarter
# 2024-01-01       | 202401   | 2024 | January    | Q1      | 2024-Q1
# 2024-04-01       | 202404   | 2024 | April      | Q2      | 2024-Q2
# ============================================================================

In [0]:
# ============================================================================
# STEP 4: SAVE DATE DIMENSION TO DELTA TABLE (GOLD LAYER)
# ============================================================================
# Persist the dimension table for use in downstream fact table joins
#
# Configuration:
# - Format: Delta Lake (ACID-compliant, supports time travel)
# - Mode: overwrite (replace existing table on re-runs)
# - Table Name: fmcg.gold.dim_date
# - Catalog: fmcg (FMCG analytics catalog)
# - Schema: gold (Gold layer = business-ready analytics data)
# ============================================================================

# Persist the dimension as a Delta table; "overwrite" mode replaces the table
# contents on every run, so re-running the notebook refreshes fmcg.gold.dim_date.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("fmcg.gold.dim_date")
)

# ============================================================================
# POST-EXECUTION VERIFICATION:
# Run these SQL commands to verify the table was created successfully:
#
# SELECT * FROM fmcg.gold.dim_date LIMIT 10;
# SELECT COUNT(*) as total_months FROM fmcg.gold.dim_date;
# SELECT DISTINCT year FROM fmcg.gold.dim_date ORDER BY year;
# ============================================================================