# GA4 Sweaty Betty Process from Raw to Gold<br>
## Table 1 & Table 2


**Revision History**<br>
Created 2/27/2025 Vish<br>
Adding Gold layer 03/25/2025 Vish

This notebook processes two tables from the Raw layer to the Gold layer:
Session Hits and Sessions.




In [1]:
import concurrent.futures
from delta import *
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, LongType, DoubleType, BooleanType, MapType,IntegerType
from pyspark.sql.functions import *
from functools import reduce
from pyspark.sql.dataframe import DataFrame
import pyspark.sql.functions as F
import json
import base64
from datetime import datetime,timedelta
from time import sleep
spark.conf.set("spark.sql.sources.partitionOverwriteMode","DYNAMIC")
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled","true")
from azure.storage.blob import BlobServiceClient
from pyspark.sql.functions import max as spark_max

In [2]:
!pip install google-cloud-bigquery
!pip install google-auth

In [3]:
from google.cloud import bigquery
from google.oauth2 import service_account

# Run the common functions

In [4]:
%run /utils/common_functions

# Define Variables

In [5]:
# Storage account name, taken from the host part of raw_adls_path (the text
# between '@' and the first '.').
# raw_adls_path is not defined in this notebook — presumably it is provided by
# the %run of /utils/common_functions; TODO confirm.
account_name = raw_adls_path.split('@')[1].split('.')[0]
json_blob_path =f"{raw_adls_path}/GA4_SweatyBetty/bigquery_datasets_tables.json"
#base_folder = "GA4_SweatyBetty"
# Raw-layer folders, one per GA4 export table family (daily, fresh, intraday)
base_folder = "GA4_SweatyBetty/analytics_292120381/events"
events_fresh_base_folder = "GA4_SweatyBetty/analytics_292120381/events_fresh"
events_intraday_base_folder = "GA4_SweatyBetty/analytics_292120381/events_intraday"
# Container that holds the Delta tables written by this notebook
gold_container = 'gold'

#Table Specific Variables
# NOTE(review): every target folder below starts with '/', so the resulting
# abfss URI contains '//' right after the host name — confirm the storage
# layer treats that the same as a single '/'.
# Session Hits
session_hits_target_folder = '/GA4-SweatyBetty/session_hits/'
session_hits_delta_table_path = f"abfss://{gold_container}@{account_name}.dfs.core.windows.net/{session_hits_target_folder}"

# Session

session_target_folder = '/GA4-SweatyBetty/session/'
session_delta_table_path = f"abfss://{gold_container}@{account_name}.dfs.core.windows.net/{session_target_folder}"

# Product Revenue
# (no cell in the visible part of this notebook writes to this path)

product_revenue_target_folder = '/GA4-SweatyBetty/product_revenue/'
product_revenue_delta_table_path = f"abfss://{gold_container}@{account_name}.dfs.core.windows.net/{product_revenue_target_folder}"

# Print Variable Values

In [6]:
# Echo the configuration (general settings first, then one pair of
# folder/path values per target table) so the run output records what was used.
print("\n".join(
    f"{label}: {value}"
    for label, value in (
        ("Account Name", account_name),
        ("JSON Blob Path", json_blob_path),
        ("Base Folder", base_folder),
        ("Gold Container", gold_container),
        ("Session Hits Target Folder", session_hits_target_folder),
        ("Session Hits Delta Table Path", session_hits_delta_table_path),
        ("Session Target Folder", session_target_folder),
        ("Session Delta Table Path", session_delta_table_path),
        ("Product Revenue Target Folder", product_revenue_target_folder),
        ("Product Revenue Delta Table Path", product_revenue_delta_table_path),
    )
))


# Event Session Hits Processing

# Define Function to process events for Session Hits<br>
This function reads raw GA4 event data from Azure Data Lake (stored as Parquet), enriches it with session-level metadata, and transforms it into a structured format suitable for downstream analytics. It extracts key fields from nested structures, calculates session-specific metrics such as hit_number and visit_start_time, and generates a unique_key hash used for identifying each event uniquely across systems. The timestamp formatting is carefully adjusted to ensure compatibility with existing systems like Snowflake by preserving millisecond precision.<br>
Reads GA4 event data from a specified Parquet file in Azure Data Lake using a dynamic table path.<br>

Converts event_timestamp from microseconds to a Spark timestamp with millisecond precision for accurate time-based analytics and hash generation.<br>

Extracts specific fields from the nested event_params map, including both string and numeric types like page_path, event_value, and engagement_time_msec.<br>

Generates a session_hkey as an MD5 hash based on user_pseudo_id, ga_session_id, and ga_session_number.<br>

Calculates session-level metadata using window functions:<br>     • visit_start_time: earliest event timestamp per session<br>     • hit_number: sequential event number within a session<br>     • is_exit: flag for last event in a session<br>

Truncates visit_start_time to milliseconds (3 digits) to match Snowflake formatting.<br>

Builds a consistent unique_key by hashing session_hkey, visitor_key, hit_number, and visit_start_time — matching Snowflake's MD5 logic exactly.<br>

Returns a DataFrame with selected output columns ready for merge or analytics.<br>


In [7]:
#trying to match unique key 2
from pyspark.sql.functions import (
    col, md5, concat_ws, coalesce, row_number, min, max,
    lpad, hour, minute, lit, date_format
)
from pyspark.sql.window import Window

def process_event_table_session_hits(table_name: str, raw_adls_path: str, base_folder: str):
    """Transform one raw GA4 events table into session-hit rows.

    Reads the Parquet data at ``<raw_adls_path>/<base_folder>/<table_name>``,
    derives the session key and per-session metrics with window functions and
    returns one row per input event.

    Args:
        table_name: Folder name of the table to read, e.g. ``events_20250301``.
        raw_adls_path: Root URI of the raw container, with or without a
            trailing ``/``.
        base_folder: Folder below the root that contains the table.

    Returns:
        A DataFrame restricted to the columns listed in ``final_columns``.

    Notes:
        ``event_params`` is read with ``getItem(key).getField(...)``, i.e. as
        a map from parameter name to a typed value struct — presumably the raw
        layer already stores it in that shape; TODO confirm.
    """
    # Join the segments with exactly one "/" each, whether or not the root
    # path passed in ends with a slash.
    full_path = f"{raw_adls_path.rstrip('/')}/{base_folder.strip('/')}/{table_name}"
    df_events = spark.read.parquet(full_path)

    def event_param(key: str, value_field: str):
        """Column holding one typed value of an event_params entry."""
        return col("event_params").getItem(key).getField(value_field)

    # Step 1: event_timestamp is in microseconds; dividing by 1e6 gives epoch
    # seconds, which Spark casts to a timestamp.
    df_events = df_events.withColumn(
        "event_timestamp_ts",
        (col("event_timestamp") / 1000000).cast("timestamp")
    )

    # Step 2: string-valued parameters
    string_fields = [
        "event_category", "event_action", "event_label",
        "page_title", "page_path", "page_location"
    ]
    for field in string_fields:
        df_events = df_events.withColumn(f"ep_{field}", event_param(field, "string_value"))

    # Step 3: numeric parameters (read from their typed fields only)
    df_events = df_events \
        .withColumn("ep_engagement_time_msec", event_param("engagement_time_msec", "int_value")) \
        .withColumn("ep_entrances", event_param("entrances", "int_value")) \
        .withColumn("ep_value", event_param("value", "double_value"))

    # Step 4: session_hkey = md5(user_pseudo_id - ga_session_id - ga_session_number),
    # with NULL parts replaced by "" so the hash is always defined.
    df_events = df_events.withColumn("session_hkey", md5(concat_ws("-",
        coalesce(col("user_pseudo_id").cast("string"), lit("")),
        coalesce(event_param("ga_session_id", "int_value").cast("string"), lit("")),
        coalesce(event_param("ga_session_number", "int_value").cast("string"), lit(""))
    )))

    # Step 5: visit_key is the GA session id as text
    df_events = df_events.withColumn("visit_key", event_param("ga_session_id", "int_value").cast("string"))

    # Step 6: windows. event_name is a final tie-breaker so that events sharing
    # a timestamp and engagement time get the same hit_number on every run;
    # hit_number feeds unique_key, which is the merge key of the Gold table.
    # Events that also share event_name can still swap places.
    session_window = Window.partitionBy("session_hkey").orderBy(
        col("event_timestamp_ts").asc(),
        col("ep_engagement_time_msec").asc(),
        col("event_name").asc()
    )
    session_group = Window.partitionBy("session_hkey")

    # Step 7: per-event and per-session fields
    # NOTE(review): is_exit flags every event that shares the session's latest
    # timestamp, so a session can have more than one exit row.
    # NOTE(review): appinfo_landingscreenname is the alphabetically smallest
    # page_location of the session, not the chronologically first one —
    # confirm this is the intended definition.
    df_events = df_events.withColumn("visitor_key", col("user_id")) \
        .withColumn("raw_visit_start_time", min("event_timestamp_ts").over(session_group)) \
        .withColumn("hit_number", row_number().over(session_window)) \
        .withColumn("hour_minute", concat_ws(":", hour("event_timestamp_ts"), lpad(minute("event_timestamp_ts"), 2, '0'))) \
        .withColumn("country_iso2", col("geo.country")) \
        .withColumn("is_entrance", col("ep_entrances")) \
        .withColumn("is_exit", (col("event_timestamp_ts") == max("event_timestamp_ts").over(session_group)).cast("int")) \
        .withColumn("page_pagetitle", col("ep_page_title")) \
        .withColumn("page_pagepath", col("ep_page_path")) \
        .withColumn("appinfo_landingscreenname", min("ep_page_location").over(session_group)) \
        .withColumn("eventinfo_eventaction", col("ep_event_action")) \
        .withColumn("eventinfo_eventcategory", col("ep_event_category")) \
        .withColumn("eventinfo_eventlabel", col("ep_event_label")) \
        .withColumn("eventinfo_eventvalue", col("ep_value"))

    # Step 8: drop sub-millisecond digits by formatting to .SSS and casting back
    df_events = df_events.withColumn(
        "visit_start_time",
        date_format("raw_visit_start_time", "yyyy-MM-dd HH:mm:ss.SSS").cast("timestamp")
    )

    # Step 9: effective_from, load_datetime and the unique_key hash
    df_events = df_events.withColumn("effective_from", col("visit_start_time")) \
        .withColumn("load_datetime", col("event_timestamp_ts")) \
        .withColumn("unique_key", md5(concat_ws("-",
            coalesce(col("session_hkey").cast("string"), lit("")),
            coalesce(col("visitor_key").cast("string"), lit("")),
            coalesce(col("hit_number").cast("string"), lit("")),
            coalesce(col("visit_start_time").cast("string"), lit(""))
        )))

    # Step 10: output columns
    final_columns = [
        "event_timestamp", "event_name",
        "session_hkey", "visitor_key", "visit_key", "visit_start_time", "hit_number", "hour_minute",
        "country_iso2", "is_entrance", "is_exit", "page_pagetitle", "page_pagepath",
        "appinfo_landingscreenname", "eventinfo_eventaction", "eventinfo_eventcategory",
        "eventinfo_eventlabel", "eventinfo_eventvalue", "effective_from", "load_datetime", "unique_key"
    ]

    return df_events.select(*final_columns)


# Save function with merge

In [8]:
#Merge 2
from delta.tables import DeltaTable
from pyspark.sql.functions import col

def save_to_delta_merge(df, spark, raw_adls_path: str, delta_table_path: str):
    """Upsert ``df`` into the Delta table at ``delta_table_path``.

    Rows are matched on ``unique_key``: matching target rows are overwritten
    with the source row, all other source rows are inserted. When no Delta
    table exists at the path yet, the data is written as a new table.

    Args:
        df: Source DataFrame; must contain a ``unique_key`` column.
        spark: Active Spark session.
        raw_adls_path: Not used; kept so existing callers keep working.
        delta_table_path: Location of the target Delta table.
    """
    # A MERGE fails when several source rows match the same target row, and a
    # first load would store them as duplicates, so keep one row per key.
    # Which of several rows with the same key survives is not defined.
    source_df = df.dropDuplicates(["unique_key"])

    if not DeltaTable.isDeltaTable(spark, delta_table_path):
        # First load: nothing to merge into yet.
        source_df.write.format("delta").mode("overwrite").save(delta_table_path)
        return

    merge_condition = col("target.unique_key") == col("source.unique_key")

    DeltaTable.forPath(spark, delta_table_path).alias("target").merge(
        source=source_df.alias("source"),
        condition=merge_condition
    ).whenMatchedUpdateAll() \
     .whenNotMatchedInsertAll() \
     .execute()


# Define process_and_save_date_range_session_hits

In [9]:
from datetime import datetime, timedelta

def process_and_save_date_range_session_hits(
    raw_adls_path: str,
    base_folder: str,
    delta_base_path: str,
    start_date: str = None,
    end_date: str = None,
    file_prefix: str = "events_"  # Default prefix
):
    """Process one events table per day and merge each into the session-hits table.

    Args:
        raw_adls_path: Root URI of the raw container.
        base_folder: Raw folder that contains the per-day tables.
        delta_base_path: Location of the target Delta table.
        start_date: First day to process, ``YYYYMMDD``.
        end_date: Last day to process (inclusive), ``YYYYMMDD``.
        file_prefix: Table-name prefix placed in front of the date.

    When neither date is given, yesterday (UTC) is processed. When only one
    date is given, that single day is processed.

    Returns:
        The names of the tables that could not be processed (empty when all
        succeeded). A failing table is reported and skipped so the remaining
        days still run.
    """
    from datetime import timezone  # only datetime/timedelta are imported at cell level

    if not start_date and not end_date:
        yesterday = (datetime.now(timezone.utc) - timedelta(days=1)).strftime("%Y%m%d")
        start_date = end_date = yesterday
    else:
        # A single supplied bound means "just that day" rather than being dropped.
        start_date = start_date or end_date
        end_date = end_date or start_date

    start = datetime.strptime(start_date, "%Y%m%d")
    end = datetime.strptime(end_date, "%Y%m%d")
    if start > end:
        print(f"Nothing to process: start_date {start_date} is after end_date {end_date}")

    failed_tables = []
    for day_offset in range((end - start).days + 1):
        date_str = (start + timedelta(days=day_offset)).strftime("%Y%m%d")
        table_name = f"{file_prefix}{date_str}"
        print(f"Processing table: {table_name}")

        try:
            df_processed = process_event_table_session_hits(table_name, raw_adls_path, base_folder)
            save_to_delta_merge(df_processed, spark, raw_adls_path, delta_base_path)
            print(f"Saved to {delta_base_path}")
        except Exception as e:
            # Best effort: a table for a given day may not exist (yet).
            print(f"Error processing {table_name}: {e}")
            failed_tables.append(table_name)

    return failed_tables


# Define Function to save data

In [10]:
#Save with Append
def save_to_delta(df, raw_adls_path: str, delta_table_path: str):
    """Append ``df`` to the Delta table at ``delta_table_path``.

    Args:
        df: DataFrame to write.
        raw_adls_path: Not used; kept so existing callers keep working.
        delta_table_path: Location of the target Delta table.
    """
    df.write.format("delta").mode("append").save(delta_table_path)


# Process Session Hits

In [11]:
# Show the inputs of the session-hits run, one value per line
print(raw_adls_path, base_folder, session_hits_delta_table_path, sep="\n")

## Process Intraday Tables - Session Hits
The tables with prefix events_fresh_

In [12]:
from datetime import datetime

# Date stamp for today (local time) in the YYYYMMDD form used in table names
today = f"{datetime.today():%Y%m%d}"

# Load today's events_fresh_ table into the session-hits Delta table;
# an exception is reported here rather than stopping the notebook
try:
    process_and_save_date_range_session_hits(
        raw_adls_path,
        events_fresh_base_folder,
        session_hits_delta_table_path,
        today,
        today,
        "events_fresh_",
    )
except Exception as err:
    print(f"No table found or error occurred for date {today}: {err}")



## Event files with prefix events_intraday
for example analytics_292120381.events_intraday_20250418

In [13]:
from datetime import datetime

# Date stamp for today (local time) in the YYYYMMDD form used in table names
today = f"{datetime.today():%Y%m%d}"

# Load today's events_intraday_ table into the session-hits Delta table;
# an exception is reported and then re-raised so the run is marked as failed
try:
    process_and_save_date_range_session_hits(
        raw_adls_path,
        events_intraday_base_folder,
        session_hits_delta_table_path,
        today,
        today,
        "events_intraday_",
    )
except Exception as err:
    print(f"No table found or error occurred for date {today}: {err}")
    raise

# Process Intraday tables in a specific Date range

In [14]:
# Backfill template (disabled): run the call below, without the surrounding
# triple quotes, to process a fixed date range of events_fresh_ tables.
# The folder must be the events_fresh one, matching the file prefix.
'''
#Intraday tables are processed several times during the day
process_and_save_date_range_session_hits(
    raw_adls_path=raw_adls_path,
    base_folder=events_fresh_base_folder,
    delta_base_path=session_hits_delta_table_path,
    start_date="20250301",
    end_date="20250331",
    file_prefix = "events_fresh_"
)
'''

# Table 2 Sessions

## Define Currency exchange dataframe
We will use this as a stub until real exchange-rate data is available.

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType
from datetime import datetime

# Reuse the active Spark session, or start one when none exists
spark = SparkSession.builder.getOrCreate()

# Column layout of the exchange-rate stub; every column is nullable
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("base_currency_code", StringType()),
        ("target_currency_code", StringType()),
        ("exchange_rate_amount", DoubleType()),
        ("valid_from", TimestampType()),
        ("valid_to", TimestampType()),
    )
])

# Static rates into GBP; all rows share one validity window.
# The GBP -> GBP row is an optional identity entry.
data = [
    (currency, "GBP", rate, datetime(2024, 1, 1), datetime(2025, 12, 31))
    for currency, rate in (
        ("USD", 0.79),
        ("EUR", 0.85),
        ("CAD", 0.59),
        ("AUD", 0.52),
        ("GBP", 1.00),
    )
]

# Materialise the stub and show it
exchange_rate_df = spark.createDataFrame(data, schema)
display(exchange_rate_df)


## Define Table Name Variable

# Define Function to Process Sessions

In [16]:
from pyspark.sql.functions import (
    col, from_unixtime, md5, concat_ws, coalesce, row_number,
    min, max, first, countDistinct, lit, when
)
from pyspark.sql.window import Window

In [17]:
#Code to be tested
from pyspark.sql.functions import *
from pyspark.sql.window import Window

def process_session_table(table_name: str, raw_adls_path: str, base_folder: str, exchange_rate_df):
    """Derive session-level attributes from one raw GA4 events table.

    Reads the Parquet data at ``<raw_adls_path>/<base_folder>/<table_name>``
    and attaches session-level values to every event row with window
    functions. All output columns carry the same value on every row of a
    session, so the caller is expected to de-duplicate the result to obtain
    one row per session.

    Args:
        table_name: Folder name of the table to read.
        raw_adls_path: Root URI of the raw container, with or without a
            trailing ``/``.
        base_folder: Folder below the root that contains the table.
        exchange_rate_df: DataFrame with ``base_currency_code``,
            ``target_currency_code``, ``exchange_rate_amount``, ``valid_from``
            and ``valid_to`` columns, used to convert revenue to GBP.

    Returns:
        A DataFrame restricted to the columns listed in ``final_cols``.

    Notes:
        ``event_params`` and ``user_properties`` are read with
        ``getItem(key).getField(...)``, i.e. as maps from name to a typed
        value struct — presumably the raw layer stores them in that shape;
        TODO confirm.
    """
    # Join the segments with exactly one "/" each, whether or not the root
    # path passed in ends with a slash.
    full_path = f"{raw_adls_path.rstrip('/')}/{base_folder.strip('/')}/{table_name}"
    df_events = spark.read.parquet(full_path)

    def param_string(key: str):
        """string_value of an event parameter."""
        return col("event_params").getItem(key).getField("string_value")

    def param_text(key: str):
        """string_value of an event parameter, else its int_value as text."""
        return coalesce(
            param_string(key),
            col("event_params").getItem(key).getField("int_value").cast("string")
        )

    def user_property(key: str):
        """string_value of a user property."""
        return col("user_properties").getItem(key).getField("string_value")

    # session_hkey = md5(user_pseudo_id - ga_session_id - ga_session_number),
    # with NULL parts replaced by "" so the hash is always defined.
    df_events = df_events.withColumn("session_hkey", md5(concat_ws("-",
        coalesce(col("user_pseudo_id").cast("string"), lit("")),
        coalesce(param_text("ga_session_id"), lit("")),
        coalesce(param_text("ga_session_number"), lit(""))
    )))

    # Windows. The ordered window spans the WHOLE session: with the default
    # frame of an ordered window (start of partition up to the current row),
    # first(..., ignorenulls=True) returns NULL on every row that precedes the
    # first non-null value, which gives rows of one session different values
    # and therefore different unique_keys.
    session_partition = Window.partitionBy("session_hkey")
    session_group = session_partition.orderBy("event_timestamp") \
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

    def first_in_session(column):
        """Earliest non-null value of ``column`` within the session."""
        return first(column, ignorenulls=True).over(session_group)

    def purchase_max(column):
        """Largest value of ``column`` over the session's purchase events."""
        return max(when(col("event_name") == "purchase", column)).over(session_partition)

    # Session-level fields.
    # NOTE(review): revenue and units take the largest single purchase of the
    # session, not the sum over its purchases — confirm that is intended.
    # NOTE(review): load_datetime is derived from event_server_timestamp_offset,
    # whose name suggests an offset rather than an epoch time — confirm the
    # source column. The latest value of the session is used so that every row
    # of a session carries the same load_datetime.
    df_sessions_base = df_events \
        .withColumn("session_start_time_raw", min("event_timestamp").over(session_partition)) \
        .withColumn("visitor_key", first_in_session(when(col("user_id") != "undefined", col("user_id")))) \
        .withColumn("visit_key", first_in_session(param_text("ga_session_id"))) \
        .withColumn("client_id", first_in_session("user_pseudo_id")) \
        .withColumn("total_visits", first_in_session(param_text("session_engaged"))) \
        .withColumn("campaign", first_in_session(param_string("campaign"))) \
        .withColumn("source", first_in_session(param_string("source"))) \
        .withColumn("medium", first_in_session(param_string("medium"))) \
        .withColumn("referrer_path", first_in_session(param_string("page_referrer"))) \
        .withColumn("transaction_currency", purchase_max(param_text("currency"))) \
        .withColumn("total_transaction_revenue_local_currency", purchase_max(col("ecommerce.purchase_revenue"))) \
        .withColumn("total_transaction_revenue_usd", purchase_max(col("ecommerce.purchase_revenue_in_usd"))) \
        .withColumn("units", purchase_max(col("ecommerce.total_item_quantity"))) \
        .withColumn("load_datetime", max(from_unixtime(col("event_server_timestamp_offset") / 1000000)).over(session_partition)) \
        .withColumn("global_customer_hkey", first_in_session(user_property("global_customer_id"))) \
        .withColumn("country_iso2", first_in_session(param_string("country_iso_2"))) \
        .withColumn("new_returning", first_in_session(user_property("customer_existing"))) \
        .withColumn("device_category", first_in_session(col("device.category"))) \
        .withColumn("device_language", first_in_session(col("device.language"))) \
        .withColumn("customer_channel_grouping", first_in_session(param_string("custom_channel_grouping"))) \
        .withColumn("trafficsource_adwordsclickinfo_customerid", first_in_session(col("collected_traffic_source.gclid"))) \
        .withColumn("country", first_in_session(col("geo.country"))) \
        .withColumn("state", first_in_session(col("geo.region"))) \
        .withColumn("city", first_in_session(col("geo.city")))

    # Format timestamps: microseconds -> timestamp, rendered with exactly three
    # fractional digits (kept both as timestamp and as text).
    millis_format = "yyyy-MM-dd HH:mm:ss.SSS"
    session_start_ts = (col("session_start_time_raw") / 1000000).cast("timestamp")
    df_sessions_base = df_sessions_base \
        .withColumn("session_start_time", date_format(session_start_ts, millis_format).cast("timestamp")) \
        .withColumn("session_start_time_str", date_format(session_start_ts, millis_format)) \
        .withColumn("load_datetime", date_format(col("load_datetime"), millis_format).cast("timestamp"))

    # Distinct transaction ids of purchase events, per session
    transaction_counts = df_events \
        .filter(col("event_name") == "purchase") \
        .withColumn("transaction_id", param_string("transaction_id")) \
        .filter(col("transaction_id").isNotNull()) \
        .groupBy("session_hkey").agg(countDistinct("transaction_id").alias("transactions"))

    # Distinct page locations of page_view events, per session
    pageview_counts = df_events \
        .filter(col("event_name") == "page_view") \
        .withColumn("page_location", param_string("page_location")) \
        .filter(col("page_location").isNotNull()) \
        .groupBy("session_hkey").agg(countDistinct("page_location").alias("session_pageviews"))

    # Join counts (sessions without purchases / page views keep NULL counts)
    df_sessions = df_sessions_base \
        .join(transaction_counts, on="session_hkey", how="left") \
        .join(pageview_counts, on="session_hkey", how="left")

    # Join with the GBP rate valid at session start.
    # NOTE(review): when no rate matches, the local amount is multiplied by 1,
    # i.e. reported as GBP unchanged — confirm this fallback is wanted.
    df_sessions = df_sessions.alias("s").join(
        exchange_rate_df.alias("er"),
        (
            (col("s.transaction_currency") == col("er.base_currency_code")) &
            (col("s.session_start_time").between(col("er.valid_from"), col("er.valid_to"))) &
            (col("er.target_currency_code") == lit("GBP"))
        ),
        how="left"
    ).withColumn(
        "total_transaction_revenue_gbp",
        when(col("transaction_currency") == "GBP", col("total_transaction_revenue_local_currency"))
        .otherwise(col("total_transaction_revenue_local_currency") * coalesce(col("er.exchange_rate_amount"), lit(1)))
    )

    # Unique key; uses the text form of the start time so the hashed value
    # always has three fractional digits.
    df_sessions = df_sessions.withColumn(
        "unique_key",
        md5(concat_ws("-",
            coalesce(col("session_hkey").cast("string"), lit("")),
            coalesce(col("visitor_key").cast("string"), lit("")),
            coalesce(col("client_id").cast("string"), lit("")),
            coalesce(col("session_start_time_str"), lit("")),
            coalesce(col("transaction_currency").cast("string"), lit(""))
        ))
    )

    # Final columns
    final_cols = [
        "session_hkey", "session_start_time", "session_start_time_str", "visitor_key", "visit_key", "client_id",
        "total_visits", "global_customer_hkey", "country_iso2", "new_returning",
        "referrer_path", "device_category", "device_language", "customer_channel_grouping",
        "campaign", "source", "medium", "trafficsource_adwordsclickinfo_customerid",
        "country", "state", "city", "session_pageviews", "transactions", "units",
        "transaction_currency", "total_transaction_revenue_local_currency",
        "total_transaction_revenue_usd", "total_transaction_revenue_gbp",
        "load_datetime", "unique_key"
    ]

    return df_sessions.select(*final_cols)


# Define Date range function 
## Intraday Session Processing

In [18]:
from datetime import datetime, timedelta

def process_and_save_date_range_session(
    raw_adls_path: str,
    base_folder: str,
    delta_base_path: str,
    exchange_rate_df: "DataFrame",
    start_date: str = None,
    end_date: str = None

):
    """Process the events_fresh_ and events_intraday_ table of every day in a range.

    For each day, both tables are turned into session rows, de-duplicated and
    merged into the Delta table at ``delta_base_path``.

    Args:
        raw_adls_path: Root URI of the raw container.
        base_folder: Not used; the tables are read from the notebook-level
            ``events_fresh_base_folder`` and ``events_intraday_base_folder``.
            Kept so existing callers keep working.
        delta_base_path: Location of the target Delta table.
        exchange_rate_df: Exchange rates passed on to ``process_session_table``.
        start_date: First day to process, ``YYYYMMDD``.
        end_date: Last day to process (inclusive), ``YYYYMMDD``.

    When neither date is given, yesterday (UTC) is processed. When only one
    date is given, that single day is processed.

    Returns:
        The names of the events_fresh_ tables that could not be processed.

    Raises:
        Exception: whatever processing an events_intraday_ table raised; that
            failure is reported and then re-raised.
    """
    from datetime import timezone  # only datetime/timedelta are imported at cell level

    if not start_date and not end_date:
        yesterday = (datetime.now(timezone.utc) - timedelta(days=1)).strftime("%Y%m%d")
        start_date = end_date = yesterday
    else:
        # A single supplied bound means "just that day" rather than being dropped.
        start_date = start_date or end_date
        end_date = end_date or start_date

    start = datetime.strptime(start_date, "%Y%m%d")
    end = datetime.strptime(end_date, "%Y%m%d")

    # (table prefix, raw folder, whether a failure aborts the run)
    sources = (
        ("events_fresh_", events_fresh_base_folder, False),
        ("events_intraday_", events_intraday_base_folder, True),
    )

    failed_tables = []
    # The date advances exactly once per iteration, so no day is skipped.
    for day_offset in range((end - start).days + 1):
        date_str = (start + timedelta(days=day_offset)).strftime("%Y%m%d")

        for prefix, source_folder, abort_on_failure in sources:
            table_name = f"{prefix}{date_str}"
            print(f"Processing table: {table_name}")

            try:
                df_processed = process_session_table(table_name, raw_adls_path, source_folder, exchange_rate_df)
                df_processed = df_processed.dropDuplicates()
                save_to_delta_merge(df_processed, spark, raw_adls_path, delta_base_path)
                print(f"Saved to {delta_base_path}")
            except Exception as e:
                print(f"Error processing {table_name}: {e}")
                if abort_on_failure:
                    raise
                failed_tables.append(table_name)

    return failed_tables
        
        
        

In [19]:
# Show the inputs of the sessions run, one value per line
print(raw_adls_path, base_folder, session_delta_table_path, sep="\n")

# Process Session Intraday for Today

In [1]:
from datetime import datetime

# Date stamp for today (local time) in the YYYYMMDD form used in table names
today = f"{datetime.today():%Y%m%d}"

# Build today's session rows and merge them into the sessions Delta table;
# an exception is reported and then re-raised so the run is marked as failed
try:
    process_and_save_date_range_session(
        raw_adls_path,
        base_folder,
        session_delta_table_path,
        exchange_rate_df,
        today,
        today,
    )
except Exception as err:
    print(f"No table found or error occurred for date {today}: {err}")
    raise


# Process Session for a Date Range

In [21]:
# Backfill template (disabled): the call is wrapped in a triple-quoted string,
# so this cell only evaluates a string and processes nothing. Run the call
# without the quotes to process sessions for the fixed date range below.
'''
process_and_save_date_range_session(
    raw_adls_path=raw_adls_path,
    base_folder=base_folder,
    delta_base_path=session_delta_table_path,
    exchange_rate_df=exchange_rate_df,
    start_date="20250325",
    end_date="20250331"    
)
'''

# Define function to empty the Delta table <br>
We can delete all rows from a Delta table for a clean run; the table itself is kept.

In [22]:
from delta.tables import DeltaTable

def truncate_delta_table(spark, delta_table_path: str):
    """Delete every row of the Delta table at ``delta_table_path``.

    The table itself is kept. Failures are reported on stdout instead of
    being raised, so a missing table does not stop the notebook.

    Args:
        spark: Active Spark session.
        delta_table_path: Location of the Delta table to empty.

    Returns:
        True when the rows were deleted, False when the operation failed.
    """
    try:
        # delete() without a condition removes all rows
        DeltaTable.forPath(spark, delta_table_path).delete()
    except Exception as e:
        print(f"Failed to truncate Delta table at {delta_table_path}: {e}")
        return False

    print(f"All data deleted from Delta table at {delta_table_path}")
    return True
