In [0]:
%sql
-- Return every raw GA event row whose `event_params` array holds at least one entry.
--
-- The earlier draft expressed this as
--   WHERE EXISTS (SELECT 1 FROM LATERAL VIEW EXPLODE(`event_params`) AS ep)
-- which cannot be parsed: LATERAL VIEW has to follow a relation in the FROM clause,
-- and the subquery had no predicate. "EXPLODE yields at least one row" is the same
-- condition as "the array is non-empty", so it is written directly with size().
-- Rows with a NULL `event_params` are excluded as well (size() is then -1 or NULL).
--
-- To filter on a specific parameter instead, use the higher-order form, e.g.
--   WHERE exists(`event_params`, ep -> ep.key = 'ga_session_id')
SELECT *
FROM `dev_aoc_catalog`.`bronze_google_analytics`.`events_raw`
WHERE size(`event_params`) > 0

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType, DateType
from pyspark.sql import functions as F
from datetime import datetime, timedelta
import random

# Reuse the active Spark session, or start one, under this job's application name
spark = (
    SparkSession.builder
    .appName("SilverFormDummyData")
    .getOrCreate()
)

# Collect up to 50 distinct (user_pseudo_id, ga_session_id) pairs from the bronze
# events table so the generated rows can reference identifiers taken from real data.
print("=== Extracting real user IDs and session IDs from bronze ===")
bronze_table = "dev_aoc_catalog.bronze_google_analytics.events_raw"

# The session id is read from the first `event_params` entry whose key is
# 'ga_session_id' (its value.int_value); rows lacking that entry are filtered out.
real_users_sessions = spark.sql(f"""
    SELECT DISTINCT
        user_pseudo_id,
        element_at(filter(event_params, x -> x.key = 'ga_session_id'), 1).value.int_value as ga_session_id
    FROM {bronze_table}
    WHERE user_pseudo_id IS NOT NULL
        AND size(filter(event_params, x -> x.key = 'ga_session_id')) > 0
        AND element_at(filter(event_params, x -> x.key = 'ga_session_id'), 1).value.int_value IS NOT NULL
    LIMIT 50
""").collect()

print(f"✓ Found {len(real_users_sessions)} real user/session combinations")

if real_users_sessions:
    # Show the first few pairs so the operator can eyeball what was picked up
    print("\n=== Sample of real user/session data ===")
    for i, row in enumerate(real_users_sessions[:5]):
        print(f"  {i+1}. user_pseudo_id: {row.user_pseudo_id}, ga_session_id: {row.ga_session_id}")
else:
    print("⚠ No real user/session data found. Using synthetic data instead.")
    # Fabricate 50 stand-in objects exposing the same two attributes that the
    # collected rows provide (`user_pseudo_id` and `ga_session_id`).
    real_users_sessions = []
    for _ in range(50):
        synthetic_pair = type('obj', (object,), {
            'user_pseudo_id': f"user_{random.randint(1000, 9999)}.{random.randint(1000000000, 9999999999)}",
            'ga_session_id': random.randint(1000000000, 9999999999)
        })()
        real_users_sessions.append(synthetic_pair)

# Schema of the silver form-fill table, declared as (column, type, nullable) triples.
# Only the surrogate key `form_fill_sk` is non-nullable.
schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("form_fill_sk", LongType(), False),
        ("form_date", DateType(), True),
        ("user_id", StringType(), True),
        ("session_id", StringType(), True),
        ("form_submit_time", TimestampType(), True),
        ("event_name", StringType(), True),
        ("form_id", StringType(), True),
        ("form_name", StringType(), True),
        ("form_page_url", StringType(), True),
        ("form_type", StringType(), True),
        ("insert_dttm", TimestampType(), True),
        ("ingestion_id", StringType(), True),
    )
])

# Sample form catalogue. Each tuple below describes one form as
# (form id, display name, page URL, form type); zip() then transposes the rows
# into four parallel lists that share the same index per form.
form_ids, form_names, page_urls, form_types = map(list, zip(
    ("contact_form_001", "Contact Us", "https://example.com/contact", "contact"),
    ("newsletter_signup_002", "Newsletter Signup", "https://example.com/", "subscription"),
    ("quote_request_003", "Get a Quote", "https://example.com/services", "quote"),
    ("support_ticket_004", "Support Request", "https://example.com/support", "support"),
    ("registration_form_005", "Account Registration", "https://example.com/register", "registration"),
))

# Build 100 synthetic form_submit records, each tied to one of the user/session
# pairs gathered above.
print("\n=== Generating dummy form submissions ===")
dummy_data = []
start_date = datetime(2025, 10, 1)
current_time = datetime.now()

for _ in range(100):
    # Submission moment: the fixed start_date plus 0-23 whole days and a random
    # time of day (so every timestamp falls in the 24 days starting 2025-10-01).
    submitted_at = start_date + timedelta(
        days=random.randint(0, 23),
        hours=random.randint(0, 23),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59),
    )

    # A single index (0-4) selects matching entries from the four parallel form lists
    form_idx = random.randint(0, 4)

    # Borrow the user and session identifiers from a randomly chosen pair
    picked_pair = random.choice(real_users_sessions)

    # Random 16-digit surrogate key
    surrogate_key = random.randint(1000000000000000, 9999999999999999)

    # Ingestion id shaped like a UUID: decimal digit groups of 8-4-4-4-12
    id_groups = [
        random.randint(10000000, 99999999),
        random.randint(1000, 9999),
        random.randint(1000, 9999),
        random.randint(1000, 9999),
        random.randint(100000000000, 999999999999),
    ]

    dummy_data.append(dict(
        form_fill_sk=surrogate_key,
        form_date=submitted_at.date(),
        user_id=picked_pair.user_pseudo_id,
        session_id=str(picked_pair.ga_session_id),
        form_submit_time=submitted_at,
        event_name="form_submit",
        form_id=form_ids[form_idx],
        form_name=form_names[form_idx],
        form_page_url=page_urls[form_idx],
        form_type=form_types[form_idx],
        insert_dttm=current_time,
        ingestion_id="-".join(str(group) for group in id_groups),
    ))

# Turn the generated records into a Spark DataFrame that follows the silver schema
df = spark.createDataFrame(dummy_data, schema=schema)

# Preview the first ten rows without truncating wide columns
print("\n=== Sample of dummy form fill data ===")
df.show(10, truncate=False)

# Number of generated rows per (form_type, form_name), ordered by form type
print("\n=== Summary Statistics ===")
rows_per_form = df.groupBy("form_type", "form_name").count()
rows_per_form.orderBy("form_type").show()

# Distinct identifier counts, reported one column at a time
print("\n=== Unique Users and Sessions ===")
for label, id_column in (("users", "user_id"), ("sessions", "session_id")):
    print(f"Unique {label}: {df.select(id_column).distinct().count()}")

# Destination silver table for the generated rows
silver_table_name = "dev_aoc_catalog.silver.ga_silver_form_fill"

# Remove any previous table of that name before writing (the original note says
# this is meant to clear out an existing non-Delta table). The confirmation line
# is printed whether or not a table actually existed.
spark.sql(f"DROP TABLE IF EXISTS {silver_table_name}")
print(f"\n✓ Dropped existing table: {silver_table_name}")

# Write the DataFrame as a Delta table, replacing data and schema if present
delta_writer = (
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
delta_writer.saveAsTable(silver_table_name)

print(f"✓ Created {df.count()} dummy form_submit events")
print(f"✓ Data written to: {silver_table_name}")

# Read the written table back and show its ten most recent submissions
print("\n=== Recent Form Submissions ===")
latest_submissions = spark.sql(f"""
    SELECT 
        form_date,
        form_name,
        form_type,
        user_id,
        session_id,
        form_submit_time
    FROM {silver_table_name}
    ORDER BY form_submit_time DESC
    LIMIT 10
""")
latest_submissions.show(truncate=False)

In [0]:

# Peek at the first rows of the silver form-fill table
form_fills_df = spark.table("dev_aoc_catalog.silver.ga_silver_form_fill")
form_fills_preview = form_fills_df.limit(5)
display(form_fills_preview)

# Do the same for the users dimension to confirm it holds data
users_df = spark.table("dev_aoc_catalog.gold.ga_dim_users")
users_preview = users_df.limit(5)
display(users_preview)

In [0]:
%sql
-- Show the columns plus the detailed metadata section for the submissions fact object
DESCRIBE TABLE EXTENDED dev_aoc_catalog.gold.ga_fct_submissions;

In [0]:
%sql
-- Rebuild gold.ga_fct_submissions so that it keeps a single row per user_sk.
--
-- Step 1: snapshot the de-duplicated rows into a backing table.
-- QUALIFY filters on the window function directly, so no helper row-number column
-- is added to the output. (The previous version selected `*` from a subquery that
-- carried `rn`, which leaked `rn` into the table and the rebuilt view and made a
-- second run fail on a duplicate `rn` column.)
-- NOTE(review): ordering by the partition key itself leaves the surviving row per
-- user_sk arbitrary; replace the ORDER BY with a real tie-breaker (e.g. a
-- timestamp column) if a specific row should win.
CREATE OR REPLACE TABLE dev_aoc_catalog.gold.ga_fct_submissions_dedup AS
SELECT *
FROM dev_aoc_catalog.gold.ga_fct_submissions
QUALIFY ROW_NUMBER() OVER (PARTITION BY user_sk ORDER BY user_sk) = 1;

-- Step 2: replace the materialized view with one defined over the backing table.
DROP MATERIALIZED VIEW IF EXISTS dev_aoc_catalog.gold.ga_fct_submissions;

CREATE MATERIALIZED VIEW dev_aoc_catalog.gold.ga_fct_submissions AS
SELECT * FROM dev_aoc_catalog.gold.ga_fct_submissions_dedup;

-- The backing table ga_fct_submissions_dedup is intentionally NOT dropped: the
-- materialized view above is defined as a query over it, so removing it would
-- leave the view without a source to refresh from.
