In [0]:
import dlt
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# -------------------------------
# Config
# -------------------------------
# Catalog and schema names used to build fully-qualified table names.
CATALOG = "dev_aoc_catalog"
SILVER_SCHEMA = "silver_google_analytics"
# NOTE(review): GOLD_SCHEMA is not referenced in the visible part of this
# file; presumably the pipeline's target schema supplies it - confirm.
GOLD_SCHEMA = "gold_google_analytics"
# "<catalog>.<schema>" prefix for every silver source table read below.
SILVER_TABLES = ".".join((CATALOG, SILVER_SCHEMA))

# -------------------------------
# Gold Table - Dimension Users (Streaming)
# -------------------------------
@dlt.table(
    name="ga_dim_users_stream",
    comment="Gold table for GA users dimension - Streaming",
    partition_cols=["home_country"],
    table_properties={
        "delta.autoOptimize.optimizeWrite": "true",
        "delta.autoOptimize.autoCompact": "true",
        "quality": "gold"
    }
)
def ga_dim_users():
    """
    Build the streaming gold users dimension.

    Reads the silver web-users stream, projects it onto the gold column
    layout and attaches a generated ``user_sk`` surrogate key.

    The dataset is registered as ``ga_dim_users_stream`` (the ``name``
    argument of the decorator), not under this function's name.

    No deduplication is performed here: every incoming silver row becomes
    one gold row. Because the key mixes in ``uuid()`` and the current
    timestamp, ``user_sk`` is not reproducible across re-computations.

    Returns:
        Streaming DataFrame with the gold users columns, ``user_sk`` first.
    """
    # Step 1: Read Silver Table as stream
    silver_df = dlt.read_stream(f"{SILVER_TABLES}.ga_silver_web_users_stream")

    # Step 2: Convert timestamps and map silver column names to gold names
    df = silver_df.select(
        F.col("user_id"),
        F.col("pseudo_id"),
        F.to_date("first_seen_time").alias("first_seen_date"),
        F.col("identified_user_flg").alias("known_user_flag"),
        F.col("home_country"),
        F.col("home_city"),
        F.col("preferred_device"),
        F.col("first_touch_medium"),
        F.col("first_touch_source"),
        F.col("first_touch_campaign"),
        F.col("user_property_name"),
        F.col("user_property_value"),
        F.current_timestamp().alias("upsert_dttm")
    )

    # Step 3: Generate surrogate key using hash + UUID for uniqueness.
    # F.hash() yields a 32-bit int, so the value is widened to bigint
    # BEFORE abs(): abs(Int.MIN_VALUE) does not fit in an int and would
    # stay negative (or raise under ANSI mode) if abs() ran first.
    hashed_key = F.hash(
        F.col("user_id"),
        F.col("pseudo_id"),
        F.col("first_seen_date"),
        F.col("upsert_dttm"),
        F.col("unique_uuid")
    ).cast("bigint")

    final_df = (
        df
        .withColumn("unique_uuid", F.expr("uuid()"))
        .withColumn("user_sk", F.abs(hashed_key))
        .drop("unique_uuid")
    )

    # Step 4: Select final columns in gold order
    return final_df.select(
        "user_sk",
        "user_id",
        "pseudo_id",
        "first_seen_date",
        "known_user_flag",
        "home_country",
        "home_city",
        "preferred_device",
        "first_touch_medium",
        "first_touch_source",
        "first_touch_campaign",
        "user_property_name",
        "user_property_value",
        "upsert_dttm"
    )


# -------------------------------
# Gold Table - Fact Sessions (Streaming)
# -------------------------------
@dlt.table(
    name="ga_fct_sessions_stream",
    comment="Gold fact table for GA sessions - Streaming",
    partition_cols=["session_start_dt"],
    table_properties={
        "delta.autoOptimize.optimizeWrite": "true",
        "delta.autoOptimize.autoCompact": "true",
        "quality": "gold"
    }
)
def ga_fct_sessions():
    """
    Build the streaming gold sessions fact table.

    Reads the silver sessions stream, derives the session date, date key
    and duration, left-joins the users dimension (stream-static join) to
    pick up ``user_sk``, and attaches a generated session surrogate key
    that is published as ``session_sk``.

    The dataset is registered as ``ga_fct_sessions_stream`` (the ``name``
    argument of the decorator), not under this function's name.

    Returns:
        Streaming DataFrame with the gold session fact columns.
    """
    # Step 1: Read Silver Sessions as stream
    silver_sessions_df = dlt.read_stream(f"{SILVER_TABLES}.ga_silver_web_sessions_stream")

    # Step 2: Read Gold Users as static (for stream-static join).
    # The users dimension is registered as "ga_dim_users_stream" via the
    # decorator's name= argument, so that is the dataset name to read;
    # the Python function name is not a dataset name once name= is set.
    users_df = dlt.read("ga_dim_users_stream").select("user_sk", "user_id")

    # Step 3: Derive session measures.
    # NOTE(review): session_sequence is assumed to be in milliseconds
    # (hence / 1000) - confirm against the silver schema.
    session_prep_df = (
        silver_sessions_df
        .withColumn("session_duration_sec", (F.col("session_sequence") / 1000).cast("int"))
        .withColumn("session_start_dt", F.col("session_start_time").cast("date"))
        .withColumn("date_sk", F.date_format("session_start_dt", "yyyyMMdd").cast("int"))
        .withColumn("session_count", F.lit(1))
    )

    # Step 4: Stream-static join with users. Left join keeps sessions whose
    # user_id has no match in the dimension (user_sk is then NULL).
    session_with_user_df = session_prep_df.join(users_df, on="user_id", how="left")

    # Step 5: Generate unique session key using hash + UUID.
    # F.hash() yields a 32-bit int, so the value is widened to bigint
    # BEFORE abs(): abs(Int.MIN_VALUE) does not fit in an int and would
    # stay negative (or raise under ANSI mode) if abs() ran first.
    hashed_key = F.hash(
        F.col("session_sk"),
        F.col("session_id"),
        F.col("user_sk"),
        F.col("session_start_dt"),
        F.col("unique_uuid")
    ).cast("bigint")

    final_df = (
        session_with_user_df
        .withColumn("unique_uuid", F.expr("uuid()"))
        .withColumn("fact_session_sk", F.abs(hashed_key))
        .withColumn("insert_dttm", F.current_timestamp())
        .drop("unique_uuid")
    )

    # Step 6: Select final columns in gold order. The generated key
    # replaces the incoming session_sk in the output.
    return final_df.select(
        F.col("fact_session_sk").alias("session_sk"),
        "session_id",
        "user_sk",
        "date_sk",
        "session_start_dt",
        "session_duration_sec",
        "is_engaged_session",
        F.col("traffic_source").alias("utm_source_last"),
        F.col("traffic_medium").alias("utm_medium_last"),
        F.col("traffic_campaign").alias("utm_campaign_last"),
        "device_category",
        "os",
        "browser",
        "session_count",
        "insert_dttm"
    )


# -------------------------------
# Gold Table - Fact Form Submissions (Streaming)
# -------------------------------
@dlt.table(
    name="ga_fct_submissions_stream",
    comment="Gold fact table for GA form submissions - Streaming",
    partition_cols=["form_submit_dt"],
    table_properties={
        "delta.autoOptimize.optimizeWrite": "true",
        "delta.autoOptimize.autoCompact": "true",
        "quality": "gold"
    }
)
def ga_fct_submissions():
    """
    Build the streaming gold form-submissions fact table.

    Reads the silver form-fill stream, left-joins the users dimension on
    ``user_id`` and the sessions fact on ``(session_id, user_sk)`` (both
    stream-static joins), derives the submit date and date key, and
    attaches a generated ``fill_id``.

    Returns:
        Streaming DataFrame with the gold form-submission fact columns.
    """
    # Step 1: Read Silver Form Fills as stream
    form_fills_df = dlt.read_stream(f"{SILVER_TABLES}.ga_silver_form_fill_stream")

    # Steps 2-3: Read the gold users and sessions tables as static inputs.
    # Both are registered with a "_stream" suffix via their decorator's
    # name= argument, so those are the dataset names to read; the Python
    # function names are not dataset names once name= is set.
    users_df = dlt.read("ga_dim_users_stream").select("user_sk", "user_id")
    sessions_df = dlt.read("ga_fct_sessions_stream").select("session_sk", "session_id", "user_sk")

    # Step 4: Stream-static join with users (adds user_sk)
    df_with_user = form_fills_df.join(users_df, on="user_id", how="left")

    # Step 5: Stream-static join with sessions (adds session_sk)
    df_with_session = df_with_user.join(
        sessions_df,
        on=["session_id", "user_sk"],
        how="left"
    )

    # Step 6: Create date fields
    df_with_date = (
        df_with_session
        .withColumn("form_submit_dt", F.col("form_submit_time").cast("date"))
        .withColumn("date_sk", F.date_format(F.col("form_submit_dt"), "yyyyMMdd").cast("int"))
        .withColumn("form_submit_count", F.lit(1))
    )

    # Step 7: Generate unique fill_id using hash + UUID.
    # F.hash() yields a 32-bit int, so the value is widened to bigint
    # BEFORE abs(): abs(Int.MIN_VALUE) does not fit in an int and would
    # stay negative (or raise under ANSI mode) if abs() ran first.
    hashed_key = F.hash(
        F.col("form_fill_sk"),
        F.col("session_sk"),
        F.col("user_sk"),
        F.col("form_submit_dt"),
        F.col("unique_uuid")
    ).cast("bigint")

    final_df = (
        df_with_date
        .withColumn("unique_uuid", F.expr("uuid()"))
        .withColumn("fill_id", F.abs(hashed_key))
        .withColumn("insert_dttm", F.current_timestamp())
        .drop("unique_uuid")
    )

    # Step 8: Select final columns
    return final_df.select(
        "fill_id",
        "session_sk",
        "user_sk",
        "date_sk",
        "form_submit_dt",
        "form_id",
        "form_name",
        "form_type",
        "form_page_url",
        "form_submit_count",
        "insert_dttm"
    )


# -------------------------------
# Gold Table - Fact Purchases (Streaming)
# -------------------------------
@dlt.table(
    name="ga_fct_purchases_stream",
    comment="Gold fact table for GA purchases - Streaming",
    partition_cols=["purchase_date"],
    table_properties={
        "delta.autoOptimize.optimizeWrite": "true",
        "delta.autoOptimize.autoCompact": "true",
        "quality": "gold"
    }
)
def ga_fct_purchases():
    """
    Build the streaming gold purchases fact table.

    Reads the silver purchase stream, left-joins the users dimension on
    ``user_id`` and the sessions fact on ``(session_id, user_sk)`` (both
    stream-static joins), normalises ``purchase_date`` to a date, derives
    the date key, and attaches a generated ``purchase_id``.

    Returns:
        Streaming DataFrame with the gold purchase fact columns.
    """
    # Step 1: Read Silver Purchases as stream
    purchases_df = dlt.read_stream(f"{SILVER_TABLES}.ga_silver_purchase_stream")

    # Steps 2-3: Read the gold users and sessions tables as static inputs.
    # Both are registered with a "_stream" suffix via their decorator's
    # name= argument, so those are the dataset names to read; the Python
    # function names are not dataset names once name= is set.
    users_df = dlt.read("ga_dim_users_stream").select("user_sk", "user_id")
    sessions_df = dlt.read("ga_fct_sessions_stream").select("session_sk", "session_id", "user_sk")

    # Step 4: Stream-static join with users (adds user_sk)
    df_with_user = purchases_df.join(users_df, on="user_id", how="left")

    # Step 5: Stream-static join with sessions (adds session_sk)
    df_with_session = df_with_user.join(
        sessions_df,
        on=["session_id", "user_sk"],
        how="left"
    )

    # Step 6: Create date fields (purchase_date is overwritten in place
    # with its date-typed value)
    df_with_date = (
        df_with_session
        .withColumn("purchase_date", F.col("purchase_date").cast("date"))
        .withColumn("date_sk", F.date_format(F.col("purchase_date"), "yyyyMMdd").cast("int"))
    )

    # Step 7: Generate unique purchase_id using hash + UUID.
    # F.hash() yields a 32-bit int, so the value is widened to bigint
    # BEFORE abs(): abs(Int.MIN_VALUE) does not fit in an int and would
    # stay negative (or raise under ANSI mode) if abs() ran first.
    hashed_key = F.hash(
        F.col("purchase_sk"),
        F.col("transaction_id"),
        F.col("session_sk"),
        F.col("user_sk"),
        F.col("purchase_date"),
        F.col("unique_uuid")
    ).cast("bigint")

    final_df = (
        df_with_date
        .withColumn("unique_uuid", F.expr("uuid()"))
        .withColumn("purchase_id", F.abs(hashed_key))
        .withColumn("insert_dttm", F.current_timestamp())
        .drop("unique_uuid")
    )

    # Step 8: Select final columns
    return final_df.select(
        "purchase_id",
        "transaction_id",
        "session_sk",
        "user_sk",
        "date_sk",
        "purchase_date",
        "product_id",
        "product_name",
        "item_variant",
        "quantity",
        "price",
        "insert_dttm"
    )


# -------------------------------
# Gold Table - Fact Page Views (Streaming)
# -------------------------------
@dlt.table(
    name="ga_fct_pageviews_stream",
    comment="Gold fact table for GA pageviews - Streaming",
    partition_cols=["page_view_dt"],
    table_properties={
        "delta.autoOptimize.optimizeWrite": "true",
        "delta.autoOptimize.autoCompact": "true",
        "quality": "gold"
    }
)
def ga_fct_pageviews():
    """
    Build the streaming gold page-views fact table.

    Reads the silver page-views stream, left-joins the users dimension on
    ``user_id`` and the sessions fact on ``(session_id, user_sk)`` (both
    stream-static joins), derives the view date and date key, and
    attaches a generated ``page_view_id``. ``page_category`` is emitted
    as a NULL string placeholder.

    Returns:
        Streaming DataFrame with the gold page-view fact columns.
    """
    # Step 1: Read Silver Page Views as stream
    pageviews_df = dlt.read_stream(f"{SILVER_TABLES}.ga_silver_page_views_stream")

    # Steps 2-3: Read the gold users and sessions tables as static inputs.
    # Both are registered with a "_stream" suffix via their decorator's
    # name= argument, so those are the dataset names to read; the Python
    # function names are not dataset names once name= is set.
    users_df = dlt.read("ga_dim_users_stream").select("user_sk", "user_id")
    sessions_df = dlt.read("ga_fct_sessions_stream").select("session_sk", "session_id", "user_sk")

    # Step 4: Stream-static join with users (adds user_sk)
    pageviews_with_user_df = pageviews_df.join(users_df, on="user_id", how="left")

    # Step 5: Stream-static join with sessions (adds session_sk)
    pageviews_with_session_df = pageviews_with_user_df.join(
        sessions_df,
        on=["session_id", "user_sk"],
        how="left"
    )

    # Step 6: Create date fields and constant measures
    pageviews_with_date_df = (
        pageviews_with_session_df
        .withColumn("page_view_dt", F.col("view_timestamp").cast("date"))
        .withColumn("date_sk", F.date_format(F.col("page_view_dt"), "yyyyMMdd").cast("int"))
        .withColumn("view_count", F.lit(1))
        .withColumn("page_category", F.lit(None).cast("string"))
    )

    # Step 7: Generate unique page_view_id using hash + UUID.
    # F.hash() yields a 32-bit int, so the value is widened to bigint
    # BEFORE abs(): abs(Int.MIN_VALUE) does not fit in an int and would
    # stay negative (or raise under ANSI mode) if abs() ran first.
    hashed_key = F.hash(
        F.col("page_view_sk"),
        F.col("session_sk"),
        F.col("user_sk"),
        F.col("page_view_dt"),
        F.col("unique_uuid")
    ).cast("bigint")

    final_df = (
        pageviews_with_date_df
        .withColumn("unique_uuid", F.expr("uuid()"))
        .withColumn("page_view_id", F.abs(hashed_key))
        .withColumn("insert_dttm", F.current_timestamp())
        .drop("unique_uuid")
    )

    # Step 8: Select final columns
    return final_df.select(
        "page_view_id",
        "session_sk",
        "user_sk",
        "date_sk",
        "page_view_dt",
        "page_url",
        "page_title",
        "page_category",
        "engagement_time",
        "view_count",
        "insert_dttm"
    )