In [None]:
import concurrent.futures
from delta import *
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, LongType, DoubleType, BooleanType, MapType,IntegerType
from pyspark.sql.functions import *
from functools import reduce
from pyspark.sql.dataframe import DataFrame
import pyspark.sql.functions as F
import json
import base64
from datetime import datetime,timedelta
from time import sleep
spark.conf.set("spark.sql.sources.partitionOverwriteMode","DYNAMIC")
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled","true")
from azure.storage.blob import BlobServiceClient
from pyspark.sql.functions import max as spark_max

# Run Common Functions

In [None]:
%run /utils/common_functions

# Define Variables

In [None]:
# Storage account name: the text between '@' and the first '.' in raw_adls_path.
# raw_adls_path is not assigned in this cell; it must already exist in the session.
account_name = raw_adls_path.split("@")[1].split(".")[0]

# Location of the raw GA4 export folders (appended to raw_adls_path by the processing functions).
sb_raw_base_folder = "GA4_SweatyBetty/analytics_292120381/events"

# Containers that hold the Delta outputs.
gold_container = "gold"
bronze_container = "bronze"

# Folders of the Delta tables inside those containers.
gold_sb_sessions_folder = "GA4/Sessions_SweatyBetty"
bronze_sb_unique_sessions_folder = "GA4/Sessions_SweatyBetty_unique_sessions"

# Fully qualified abfss:// locations of the gold and bronze Delta tables.
gold_delta_table_path_sb = (
    f"abfss://{gold_container}@{account_name}.dfs.core.windows.net/{gold_sb_sessions_folder}"
)
bronze_delta_table_path_unique_sessions_sb = (
    f"abfss://{bronze_container}@{account_name}.dfs.core.windows.net/{bronze_sb_unique_sessions_folder}"
)



In [None]:
# Echo the resolved configuration; one joined print emits the same seven lines.
print("\n".join((
    f"account_name: {account_name}",
    f"sb_raw_base_folder: {sb_raw_base_folder}",
    f"gold_container: {gold_container}",
    f"gold_sb_sessions_folder: {gold_sb_sessions_folder}",
    f"bronze_sb_unique_sessions_folder: {bronze_sb_unique_sessions_folder}",
    f"gold_delta_table_path_sb: {gold_delta_table_path_sb}",
    f"bronze_delta_table_path_unique_sessions_sb: {bronze_delta_table_path_unique_sessions_sb}",
)))


In [None]:
# Name of the secret to fetch (the variable's spelling is kept because it is a notebook-level name).
keyvult_key = "storage-key"

# Fetch the secret through mssparkutils; kv_name is not assigned in this notebook's visible cells.
account_key = mssparkutils.credentials.getSecret(kv_name, keyvult_key, "ls_kv_adap")

# Same derivation as account_name: the text between '@' and the first '.' in raw_adls_path.
storage_account_name = raw_adls_path.split("@")[1].split(".")[0]

container_name = "raw"

# Read raw data and get unique sessions 

In [None]:
#New Logic - But Separate from DBT

from pyspark.sql.functions import (
    col, from_unixtime, from_utc_timestamp,
    expr, lower, concat_ws, when, count, min, lit
)

def ga4_events_get_unique_sessions(df):
    """Project GA4 event rows to session-level columns and keep one row per session.

    Args:
        df: DataFrame of raw GA4 events. The code reads ``event_date``,
            ``event_timestamp``, ``user_pseudo_id``, ``event_params``,
            ``device.category``, ``geo.country``, ``traffic_source.source``
            and ``traffic_source.medium``.

    Returns:
        DataFrame with the columns BrandCountryKey, calday, est_date, gmt_date,
        user_pseudo_id, session_id, device_type, source, medium, country and
        CountryCode. Rows with a null session_id are dropped and one row is
        kept per (user_pseudo_id, session_id, calday); which event row
        survives is whatever ``dropDuplicates`` keeps. No aggregation is done
        here.
    """
    # Brand is a fixed literal for this notebook.
    brand = lit("sweatybetty")

    # event_timestamp is divided by 1_000_000 before from_unixtime, i.e. it is
    # treated as microseconds since the epoch.
    gmt_datetime = from_unixtime(col("event_timestamp") / 1_000_000)
    est_datetime = from_utc_timestamp(gmt_datetime, "America/New_York")

    geo_country = col("geo.country")

    # Null or empty country becomes the literal "none".
    country = when(geo_country.isNull() | (geo_country == ""), "none") \
        .otherwise(geo_country) \
        .alias("country")

    # Country code expression, deliberately left un-aliased so it can be used
    # both as its own column and inside BrandCountryKey. Referencing the
    # "CountryCode" alias from within the same select only works where Spark
    # resolves lateral column aliases implicitly.
    country_code = (
        when(geo_country == "Canada", "CA")
        .when(geo_country == "Ireland", "IE")
        .when(geo_country == "United States", "US")
        .when(geo_country == "United Kingdom", "UK")
        .otherwise("ROW")
    )

    df_sessions = df.select(
        col("event_date").alias("calday"),
        col("user_pseudo_id"),
        # NOTE(review): assumes event_params supports lookup by the key
        # "ga_session_id" (e.g. a map column) - confirm against the raw schema.
        col("event_params")["ga_session_id"]["int_value"].alias("session_id"),
        col("device.category").alias("device_type"),
        country,
        country_code.alias("CountryCode"),
        est_datetime.alias("est_datetime"),
        gmt_datetime.alias("gmt_datetime"),
        concat_ws("-", brand, country_code).alias("BrandCountryKey"),
        col("traffic_source.source").alias("source"),
        col("traffic_source.medium").alias("medium"),
    )

    # Events without a session id cannot be attributed to a session.
    df_unique_sessions = df_sessions.dropna(subset=["session_id"]) \
        .dropDuplicates(["user_pseudo_id", "session_id", "calday"])

    # Final column order, with the timestamp columns renamed to *_date.
    df_result = df_unique_sessions.select(
        "BrandCountryKey",
        "calday",
        col("est_datetime").alias("est_date"),
        col("gmt_datetime").alias("gmt_date"),
        "user_pseudo_id",
        "session_id",
        "device_type",
        "source",
        "medium",
        "country",
        "CountryCode",
    )

    return df_result



# Define function to Save to Delta Lake

In [None]:
def write_to_delta(df, delta_table_path: str):
    """Append ``df`` to the Delta table at ``delta_table_path``.

    Args:
        df: DataFrame to append.
        delta_table_path: Location of the target Delta table.

    Raises:
        Exception: Any error raised by the write is logged and then re-raised.
            Callers in this notebook wrap the call in ``try/except`` and stop
            on failure; swallowing the error here would let
            ``process_SB_tables_incremental`` advance its watermark past a
            folder that was never written.
    """
    try:
        df.write.format("delta").mode("append").save(delta_table_path)
    except Exception as e:
        print(f"Failed to write to Delta table at {delta_table_path}: {e}")
        raise
    print(f"Successfully wrote data to Delta table at: {delta_table_path}")


# Function to Iterate over Sweaty Betty Raw Data
Saves the unique sessions from each raw events folder to the Bronze Delta table.

## Full Processing

In [None]:
# Show the bronze Delta table location that the processing functions below write to.
print(f"bronze_delta_table_path_unique_sessions_sb: {bronze_delta_table_path_unique_sessions_sb}")

In [None]:
import re
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame

def process_SB_tables(raw_adls_path: str, base_folder: str, spark: SparkSession, bronze_delta_table_path_unique_sessions_sb: str):
    """Load every raw ``events_<8 digits>`` folder and append its unique sessions to bronze.

    Args:
        raw_adls_path: Prefix of the raw storage location; ``base_folder`` is
            appended to it directly, without a separator.
        base_folder: Folder under ``raw_adls_path`` that holds the event folders.
        spark: Spark session used to read the parquet data.
        bronze_delta_table_path_unique_sessions_sb: Target Delta table location.

    Returns:
        None. Listing errors are printed and end the function; the first
        exception raised while processing a folder is printed and stops the loop.
    """
    full_path = f"{raw_adls_path}{base_folder}".rstrip("/")

    try:
        entries = mssparkutils.fs.ls(full_path)
    except Exception as e:
        print(f"Error listing files in path: {full_path}\n{e}")
        return

    for entry in entries:
        folder_name = entry.name.rstrip("/")

        # Skip anything that does not start with events_ followed by 8 digits.
        if not re.match(r"events_\d{8}", folder_name):
            continue

        try:
            raw_events = spark.read.format("parquet").load(f"{full_path}/{folder_name}")
            sessions = ga4_events_get_unique_sessions(raw_events)
            print(f"Processing table to bronze: {folder_name}")
            write_to_delta(sessions, bronze_delta_table_path_unique_sessions_sb)
        except Exception as read_err:
            print(f"Failed to process table {folder_name}: {read_err}")
            break


In [None]:
# Full (non-incremental) load: processes every raw events_* folder found under the base folder
# and appends the result to the bronze table.
# The call is intentionally disabled by wrapping it in a string literal; remove the triple quotes
# to run it. Because the write mode is append, running it against a populated table adds the rows again.
'''
process_SB_tables(
    raw_adls_path=raw_adls_path,
    base_folder=sb_raw_base_folder,
    spark=spark,
    bronze_delta_table_path_unique_sessions_sb = bronze_delta_table_path_unique_sessions_sb
)
'''


# Incremental Function for Sweaty Betty

In [None]:
import re
import json
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame
from azure.storage.blob import BlobServiceClient

def process_SB_tables_incremental(raw_adls_path: str, base_folder: str, spark: SparkSession,
                                   bronze_delta_table_path_unique_sessions_sb: str,
                                   storage_account_name: str, account_key: str):
    """Append unique sessions for raw event folders newer than the stored watermark.

    The watermark is a JSON blob (``GA4/sb_sessions_watermark.json`` in the
    ``raw`` container) whose ``last_processed_folder`` value is compared as a
    string against the folder names.

    Args:
        raw_adls_path: Prefix of the raw storage location; ``base_folder`` is
            appended to it directly, without a separator.
        base_folder: Folder under ``raw_adls_path`` that holds the event folders.
        spark: Spark session used to read the parquet data.
        bronze_delta_table_path_unique_sessions_sb: Target Delta table location.
        storage_account_name: Storage account that holds the watermark blob.
        account_key: Credential passed to ``BlobServiceClient``.

    Returns:
        None. If the watermark cannot be read, every matching folder is
        processed. The first exception raised while processing a folder is
        printed and stops the loop.

    NOTE(review): the watermark is uploaded as soon as ``write_to_delta``
    returns, so it only guards against failed writes if ``write_to_delta``
    raises on failure - confirm.
    """
    # Client for the watermark blob.
    service = BlobServiceClient(
        account_url=f"https://{storage_account_name}.blob.core.windows.net",
        credential=account_key,
    )
    blob_client = service.get_container_client("raw").get_blob_client("GA4/sb_sessions_watermark.json")

    # An empty watermark sorts before every folder name, so everything is processed.
    last_processed = ""
    try:
        stored = json.loads(blob_client.download_blob().readall())
        last_processed = stored.get("last_processed_folder", "")
    except Exception:
        print("Could not read watermark. Defaulting to no previous folder.")

    full_path = f"{raw_adls_path}{base_folder}".rstrip("/")

    try:
        entries = mssparkutils.fs.ls(full_path)
    except Exception as e:
        print(f"Error listing files in path: {full_path}\n{e}")
        return

    # Folder names that start with events_ followed by 8 digits, trailing slash removed.
    candidate_names = [
        entry.name.rstrip("/") for entry in entries if re.match(r"events_\d{8}", entry.name)
    ]

    # Keep only names strictly after the watermark, in ascending order.
    folders_to_process = sorted(name for name in candidate_names if name > last_processed)

    if not folders_to_process:
        print("No new folders to process.")
        return

    for folder_name in folders_to_process:
        try:
            raw_events = spark.read.format("parquet").load(f"{full_path}/{folder_name}")
            sessions = ga4_events_get_unique_sessions(raw_events)
            print(f"Processing table to bronze: {folder_name}")
            write_to_delta(sessions, bronze_delta_table_path_unique_sessions_sb)

            # Persist progress after each folder so a later failure resumes from here.
            blob_client.upload_blob(json.dumps({"last_processed_folder": folder_name}), overwrite=True)
            print(f"Watermark updated to: {folder_name}")

        except Exception as read_err:
            print(f"Failed to process table {folder_name}: {read_err}")
            break


In [None]:
# Echo the configuration again right before the incremental run; same seven lines of output.
print("\n".join((
    f"account_name: {account_name}",
    f"sb_raw_base_folder: {sb_raw_base_folder}",
    f"gold_container: {gold_container}",
    f"gold_sb_sessions_folder: {gold_sb_sessions_folder}",
    f"bronze_sb_unique_sessions_folder: {bronze_sb_unique_sessions_folder}",
    f"gold_delta_table_path_sb: {gold_delta_table_path_sb}",
    f"bronze_delta_table_path_unique_sessions_sb: {bronze_delta_table_path_unique_sessions_sb}",
)))


# Call Sweaty Betty Incremental function

In [None]:
# Run the incremental load: only raw folders whose name sorts after the stored watermark are
# read and appended to the bronze table.
process_SB_tables_incremental(raw_adls_path = raw_adls_path,
                              base_folder=sb_raw_base_folder,
                              bronze_delta_table_path_unique_sessions_sb = bronze_delta_table_path_unique_sessions_sb,
                              storage_account_name = storage_account_name,
                              account_key=account_key,
                              spark=spark
                              )

# Deduplicate Bronze data

In [None]:
def read_from_delta(delta_table_path: str):
    """Load the Delta table at ``delta_table_path``.

    Uses the notebook-level ``spark`` session.

    Returns:
        The loaded DataFrame, or ``None`` if loading raised; the error is
        printed rather than propagated.
    """
    try:
        delta_reader = spark.read.format("delta")
        loaded = delta_reader.load(delta_table_path)
    except Exception as e:
        print(f"Failed to read from Delta table at {delta_table_path}: {e}")
        return None
    else:
        print(f"Successfully read data from Delta table at: {delta_table_path}")
        return loaded

def deduplicate_and_overwrite_all_columns(delta_table_path: str):
    """Rewrite the Delta table at ``delta_table_path`` without fully identical rows.

    Rows count as duplicates only when every column matches. Nothing is
    written if the table cannot be read; a failed overwrite is printed, not
    raised.
    """
    current = read_from_delta(delta_table_path)
    if current is None:
        return

    # Exact-duplicate removal across all columns.
    distinct_rows = current.dropDuplicates()

    try:
        distinct_rows.write.format("delta").mode("overwrite").save(delta_table_path)
    except Exception as e:
        print(f"Failed to overwrite Delta table at {delta_table_path}: {e}")
    else:
        print(f"Successfully overwrote Delta table at: {delta_table_path} with deduplicated data")

In [None]:
# Show which bronze table the deduplication cells below operate on.
print(bronze_delta_table_path_unique_sessions_sb)

In [None]:
# Load the bronze table to inspect its columns before deduplicating.
# Note: this binds the notebook-level name `df`.
df = spark.read.format("delta").load(bronze_delta_table_path_unique_sessions_sb)

# Print the schema
df.printSchema()

In [None]:
# Remove rows that are identical in every column and overwrite the bronze table in place.
deduplicate_and_overwrite_all_columns(bronze_delta_table_path_unique_sessions_sb)

In [None]:
# Deduplicate bronze rows with a window function (calday is one of the partition keys)
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

def deduplicate_on_specific_columns(delta_table_path: str):
    """Keep one row per session key and overwrite the Delta table in place.

    A session is identified by ``user_pseudo_id`` together with
    ``session_id``; the raw-to-bronze step already treats
    (user_pseudo_id, session_id, calday) as the session key. Partitioning
    without ``user_pseudo_id`` would merge different users that happen to
    share a ``session_id`` on the same day, device type and source.

    Within each group the row with the earliest ``est_date`` (then
    ``gmt_date``) is kept, so the surviving row does not depend on an
    arbitrary ordering.

    Args:
        delta_table_path: Location of the Delta table to deduplicate.

    Returns:
        None. Nothing is written if the table cannot be read; a failed
        overwrite is printed, not raised.
    """
    df = read_from_delta(delta_table_path)
    if df is None:
        return

    # calday is a partition key, so ordering by it would be constant within
    # every group; order by the event timestamps instead.
    window_spec = Window.partitionBy(
        "calday", "user_pseudo_id", "session_id", "device_type", "source"
    ).orderBy(col("est_date").asc(), col("gmt_date").asc())

    # Rank rows inside each group and keep only the first one.
    df_ranked = df.withColumn("row_num", row_number().over(window_spec))
    df_deduped = df_ranked.filter(col("row_num") == 1).drop("row_num")

    try:
        df_deduped.write.format("delta").mode("overwrite").save(delta_table_path)
        print(f"Successfully overwrote Delta table at: {delta_table_path} with deduplicated data")
    except Exception as e:
        print(f"Failed to overwrite Delta table at {delta_table_path}: {e}")


In [None]:
# Apply the window-based deduplication to the bronze table and overwrite it in place.
deduplicate_on_specific_columns(bronze_delta_table_path_unique_sessions_sb)


# Process Gold Aggregate from Bronze

In [None]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import count, min

def read_from_delta(delta_table_path: str) -> DataFrame:
    """Load the Delta table at ``delta_table_path``.

    This re-declares a function that an earlier cell of the notebook already
    defines under the same name; once this cell runs, this definition is the
    one in effect. Uses the notebook-level ``spark`` session.

    Returns:
        The loaded DataFrame, or ``None`` if loading raised; the error is
        printed rather than propagated.
    """
    loaded = None
    try:
        loaded = spark.read.format("delta").load(delta_table_path)
        print(f"Successfully read data from Delta table at: {delta_table_path}")
    except Exception as e:
        print(f"Failed to read from Delta table at {delta_table_path}: {e}")
        loaded = None
    return loaded

def write_to_delta_overwrite(df: DataFrame, delta_table_path: str):
    """Replace the contents of the Delta table at ``delta_table_path`` with ``df``.

    Errors raised by the write are printed, not propagated; the function
    returns ``None`` either way.
    """
    try:
        overwrite_writer = df.write.format("delta").mode("overwrite")
        overwrite_writer.save(delta_table_path)
    except Exception as e:
        print(f"Failed to write to Delta table at {delta_table_path}: {e}")
    else:
        print(f"Successfully wrote data to Delta table at: {delta_table_path}")

def process_sessions_and_write_to_gold(bronze_path: str, gold_path: str):
    """Aggregate bronze session rows into session counts and overwrite the gold table.

    Rows are grouped by BrandCountryKey, calday, device_type, source, medium,
    country and CountryCode. Each group yields the row count as ``sessions``
    and the minimum ``est_date`` / ``gmt_date``.

    Args:
        bronze_path: Location of the bronze Delta table to read.
        gold_path: Location of the gold Delta table to overwrite.

    Returns:
        None. A failed read aborts with a message; errors raised during
        aggregation or writing are printed, not propagated.
    """
    df = read_from_delta(bronze_path)
    if df is None:
        print("Aborting: Failed to read from bronze path.")
        return

    group_keys = [
        "BrandCountryKey", "calday", "device_type", "source", "medium", "country", "CountryCode",
    ]

    try:
        # count and min are the pyspark.sql.functions versions imported in this cell.
        session_metrics = [
            count("*").alias("sessions"),
            min("est_date").alias("est_date"),
            min("gmt_date").alias("gmt_date"),
        ]
        aggregated = df.groupBy(*group_keys).agg(*session_metrics)
        write_to_delta_overwrite(aggregated, gold_path)
    except Exception as e:
        print(f"Failed during processing or writing: {e}")


In [None]:
# Show the source (bronze) and target (gold) table locations used by the aggregation below.
print(bronze_delta_table_path_unique_sessions_sb)
print(gold_delta_table_path_sb)

In [None]:
# Build the gold aggregate from the bronze table; the gold table is overwritten on each run.
process_sessions_and_write_to_gold(
    bronze_delta_table_path_unique_sessions_sb,
    gold_delta_table_path_sb
)
