In [1]:
pip install google-analytics-data

In [27]:
from google.oauth2.credentials import Credentials
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import *
from google.oauth2 import service_account
import json
from pyspark.sql.functions import * 
import concurrent.futures
from time import sleep 
from pyspark.sql.types import StructType, StructField, StringType, LongType,IntegerType, DateType,DoubleType, MapType, ArrayType

In [28]:
%run /utils/common_functions

# Define the variables

In [29]:
# Storage account name = the text between '@' and the first '.' of raw_adls_path.
# raw_adls_path is not defined in this notebook (presumably it comes from the
# %run of /utils/common_functions - TODO confirm).
account_name = raw_adls_path.split('@')[1].split('.')[0]
gold_container = 'gold'
gold_wolverine_sessions_folder_API = 'GA4/Sessions_wolverine_summary_API_Final'
# Full abfss:// location of the Delta table this notebook appends to and deduplicates.
gold_delta_table_path_wolverine_API = f"abfss://{gold_container}@{account_name}.dfs.core.windows.net/{gold_wolverine_sessions_folder_API}"

# Valid values for load_type are "DateRange" and "Incremental".
# If you choose "DateRange", also provide start_date and end_date below.
load_type = "Incremental"
#load_type = "DateRange"

# Example start_date / end_date values (YYYY-MM-DD strings):
start_date = "2024-01-01"
end_date = "2025-05-10"


In [30]:
# Echo the resolved Delta table path so the run output records the write target.
print(gold_delta_table_path_wolverine_API)

## Get the start and end of the date range

In [31]:
#Logic based on Incremental and date range Switch
from datetime import datetime, timedelta
from pyspark.sql import SparkSession

def get_fiscal_weeks(start_date: str, end_date: str, jdbcDriver: str, jdbcUrl: str, jdbcUsername: str, jdbcPassword: str):
    """Return the distinct fiscal weeks in report.DateDim for an inclusive date range.

    Args:
        start_date: Lower bound of ``daydate`` in YYYY-MM-DD format (inclusive).
        end_date: Upper bound of ``daydate`` in YYYY-MM-DD format (inclusive).
        jdbcDriver: JDBC driver class name handed to the Spark JDBC reader.
        jdbcUrl: JDBC connection URL.
        jdbcUsername: Database user.
        jdbcPassword: Database password.

    Returns:
        A Spark DataFrame with the columns ``fiscalweek``, ``weekbegindate``
        and ``weekenddate``. The frame is also rendered with ``display``.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not a valid
            YYYY-MM-DD date.
    """
    def _iso_date(label, value):
        # The bounds are spliced into the SQL text below (the JDBC "query"
        # option takes a plain string), so only a parsed, re-formatted date is
        # ever allowed to reach the statement.
        try:
            return datetime.strptime(str(value), "%Y-%m-%d").date().isoformat()
        except ValueError as exc:
            raise ValueError(
                f"{label} must be a date in YYYY-MM-DD format, got {value!r}"
            ) from exc

    start_date = _iso_date("start_date", start_date)
    end_date = _iso_date("end_date", end_date)

    query = f"""
        SELECT DISTINCT dd.fiscalweek, dd.weekbegindate, dd.weekenddate
        FROM report.DateDim dd
        WHERE daydate BETWEEN '{start_date}' AND '{end_date}'
    """

    df_date = spark.read.format("jdbc")\
        .option("driver", jdbcDriver)\
        .option("url", jdbcUrl)\
        .option("query", query)\
        .option("user", jdbcUsername)\
        .option("password", jdbcPassword)\
        .load()

    # Only the first returned row is echoed; the query has no ORDER BY, so
    # which week that is depends on the database.
    gua_dates_range = df_date.collect()
    if gua_dates_range:
        fiscal_week = gua_dates_range[0][0]
        week_begin_date = gua_dates_range[0][1]
        week_end_date = gua_dates_range[0][2]
        print("fiscal_week::", fiscal_week, "week_begin_date::", week_begin_date, "week_end_date::", week_end_date)
    else:
        print("No records found for given date range.")

    display(df_date)
    return df_date


def get_date_range_based_on_loadtype(LoadType: str, jdbcDriver: str, jdbcUrl: str, jdbcUsername: str, jdbcPassword: str,start_date: str, end_date: str):
    """Resolve the fiscal weeks to load for the requested load type.

    Args:
        LoadType: "DateRange" to use the supplied bounds as-is, or
            "Incremental" to derive the bounds from today's date.
        jdbcDriver, jdbcUrl, jdbcUsername, jdbcPassword: Forwarded unchanged
            to get_fiscal_weeks.
        start_date: Lower bound (YYYY-MM-DD); ignored for "Incremental".
        end_date: Upper bound (YYYY-MM-DD); ignored for "Incremental".

    Returns:
        Whatever get_fiscal_weeks returns for the resolved bounds.

    Raises:
        ValueError: If LoadType is neither "DateRange" nor "Incremental".
    """
    if LoadType not in ("DateRange", "Incremental"):
        raise ValueError("Invalid LoadType. Choose either 'DateRange' or 'Incremental'.")

    if LoadType == "Incremental":
        now = datetime.today()
        # weekday() is 0 for Monday ... 6 for Sunday, so this steps back to the
        # most recent Sunday strictly before today.
        window_end = now - timedelta(days=now.weekday() + 1)
        # 13 days earlier is the Monday that opens the two Monday-to-Sunday
        # weeks ending on window_end.
        window_start = window_end - timedelta(days=13)

        start_date = window_start.strftime('%Y-%m-%d')
        end_date = window_end.strftime('%Y-%m-%d')
        print(f"Computed Incremental Date Range: {start_date} to {end_date}")

    # "DateRange" falls through with the caller-supplied bounds untouched.
    return get_fiscal_weeks(start_date, end_date, jdbcDriver, jdbcUrl, jdbcUsername, jdbcPassword)


# Incremental Load or Date Range
Normally we will run the incremental load. In case we need to backfill historical data we can run this with a specific date range.

In [32]:

# Resolve the fiscal weeks to load. The mode is taken from `load_type` in the
# configuration cell ("Incremental" or "DateRange") instead of a literal, so
# switching to a backfill only requires editing that cell. start_date and
# end_date are used for "DateRange"; for "Incremental" the helper computes its
# own bounds and ignores them.
df_date = get_date_range_based_on_loadtype(load_type, jdbcDriver, jdbcUrl, jdbcUsername, jdbcPassword, start_date, end_date)

# Bring the weeks to the driver: one Row (fiscalweek, weekbegindate, weekenddate) per week.
gua_dates_range = df_date.collect()


History Run Log<br>
2024-12-29 - 2025-05-10<br>
2023-12-31 - 2024-12-28

# Defining Schema for GA4 Data

In [33]:
# Schema without financial fields: one row per brand, device category and week.
ga4_schema = StructType([
    StructField('brand', StringType(), True),
    StructField('device_category', StringType(), True),
    StructField('sessions', IntegerType(), True),
    StructField('week_end', StringType(), True),
])

# Same layout without a metric column; the quantity field is currently disabled.
ga4_qty_schema = StructType([
    StructField('brand', StringType(), True),
    StructField('device_category', StringType(), True),
    # StructField('quantity', IntegerType(), True),
    StructField('week_end', StringType(), True),
])

# Module-level row accumulators, both starting empty.
qty_data, non_qty_data = [], []

# List All Properties

In [34]:
#V3 only sessions

from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import RunReportRequest, DateRange, Metric, Dimension, Filter, FilterExpression
from google.oauth2 import service_account
import json
from time import sleep

# Full property_brand mapping: GA4 property ID -> label (mostly "<brand> <market>").
# extract_data() writes the label into the `brand` column, and
# process_date_range() derives BrandCountryKey from it.
# NOTE(review): the roll-up, 'Dev' and 'WWW Meta Shopping' properties are listed
# here and are not part of exclusion_list, so they are queried too - confirm
# that this is intended.
property_brand = {
    297479542: 'Saucony US', 309617502: 'Merrell US', 309589213: 'Chacos US', 309592771: 'Cat Footwear US',
    309607030: 'Harley-Davidson Footwear US', 309561793: 'Grasshoppers US', 309609458: 'Keds US',
    309650146: 'Hush Puppies US', 309592118: 'OnlineShoes US', 309626252: 'Prokeds US',
    309606225: 'Merrell BE', 309619975: 'Hytest US', 309616895: 'Hush Puppies Canada',
    309620743: 'Merrell ES', 309599233: 'Cat Footwear EMEA Emerging', 309591384: 'Wolverine US',
    309561205: 'Cat Footwear Canada', 309607124: 'Keds Canada', 309621431: 'Cat Footwear DE',
    309591477: 'Merrell SE', 309615440: 'Merrell DE', 309613069: 'Cat Footwear UK',
    309607810: 'Merrell Canada', 309628914: 'Merrell EMEA Emerging', 309607948: 'Saucony FR',
    309643444: 'Saucony EMEA Emerging', 309621834: 'Merrell NL', 309596198: 'Saucony Canada',
    309599329: 'Merrell UK', 309579645: 'Merrell FR', 309599656: 'Wolverine Canada',
    309624452: 'Saucony DE', 309600562: 'Saucony BE', 309629089: 'Saucony ES',
    309537329: 'Saucony IT', 309587387: 'Saucony UK', 309603632: 'Saucony NL',
    309713727: 'Bates US', 310709711: 'Saucony AT', 312016432: 'Dev',
    312419831: 'EMEA Roll Up', 312454140: 'Canada Roll Up', 312455238: 'Global Roll up',
    315639705: 'Merrell EMEA Roll Up', 315628392: 'Saucony EMEA Roll Up',
    315625200: 'Cat Footwear EMEA Roll Up', 428511278: 'Server-side GTM',
    433745652: 'Wolverine4Work', 440572276: 'WWW Meta Shopping',
    309632059: 'Sperry US', 309650574: 'Sperry Canada'
}

# Exclusion list: property IDs that are never queried. Labels are taken from
# property_brand.
exclusion_list = {
    309713727,  # Bates US
    309561793,  # Grasshoppers US
    309607030,  # Harley-Davidson Footwear US
    309650146,  # Hush Puppies US
    309609458,  # Keds US
    309607124,  # Keds Canada
    309626252,  # Prokeds US
    428511278,  # Server-side GTM
    309650574,  # Sperry Canada
    309632059,  # Sperry US
    309599656,  # Wolverine Canada
    309616895,  # Hush Puppies Canada
}

# Every known property, in mapping order, followed by the subset to query.
all_properties = [*property_brand]
property_list = [pid for pid in all_properties if pid not in exclusion_list]

# Define all Functions

In [35]:
# NOTE(review): non_qty_data is reset here but is not read by any code visible
# in this notebook.
non_qty_data = []

# Request shape shared by every report call: the "sessions" metric broken down
# by the "deviceCategory" dimension.
metrics = [Metric(name="sessions")]
dimensions = [Dimension(name="deviceCategory")]

def initialize_analyticsreporting():
    """Create a GA4 Data API client from the service-account key stored in ADLS.

    Reads ``GA4/credentials.json`` under ``raw_adls_path`` as a single text
    value, parses it as JSON and builds service-account credentials from it.

    Returns:
        A BetaAnalyticsDataClient bound to those credentials.
    """
    # wholetext=True yields the entire file as one row / one column.
    key_file_df = spark.read.text(f"{raw_adls_path}GA4/credentials.json", wholetext=True)
    first_row = key_file_df.collect()[0]
    service_account_info = json.loads(first_row[0])
    return BetaAnalyticsDataClient(
        credentials=service_account.Credentials.from_service_account_info(service_account_info)
    )

# Single client instance, created once here and used by get_report() for every request.
client = initialize_analyticsreporting()

def get_report(property_id, p_date_range, metric_list, max_retries=3):
    """Run a GA4 report for one property and date range, retrying on failure.

    The report uses the module-level ``dimensions`` and is restricted to the
    device categories "tablet", "desktop" and "mobile".

    Args:
        property_id: GA4 property ID (without the "properties/" prefix).
        p_date_range: Date range object passed as the only entry of
            ``date_ranges``.
        metric_list: Metrics to request.
        max_retries: Maximum number of attempts.

    Returns:
        The response returned by ``client.run_report``.

    Raises:
        RuntimeError: If every attempt failed. The last underlying exception
            is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            request = RunReportRequest(
                property=f"properties/{property_id}",
                date_ranges=[p_date_range],
                dimensions=dimensions,
                metrics=metric_list,
                dimension_filter=FilterExpression(
                    filter=Filter(
                        field_name="deviceCategory",
                        in_list_filter=Filter.InListFilter(values=["tablet", "desktop", "mobile"])
                    )
                ),
            )
            response = client.run_report(request)
        except Exception as e:
            last_error = e
            print(f"Attempt {attempt + 1} failed for property {property_id}: {e}")
            # Linear back-off between attempts; there is nothing left to wait
            # for once the final attempt has failed.
            if attempt + 1 < max_retries:
                sleep(2 * (attempt + 1))
        else:
            # Fixed pause after every successful call to space out requests.
            sleep(5)
            return response
    # Chain the last failure so the root cause is visible to the caller.
    raise RuntimeError(f"All {max_retries} attempts failed for property {property_id}") from last_error


def write_to_delta(df, delta_table_path: str):
    """Append ``df`` to the Delta table at ``delta_table_path``.

    Best effort: any exception raised while writing is caught and reported
    with ``print``; nothing is re-raised and nothing is returned.
    """
    try:
        appender = df.write.format("delta").mode("append")
        appender.save(delta_table_path)
    except Exception as write_error:
        print(f"Failed to write to Delta table at {delta_table_path}: {write_error}")
    else:
        print(f"Successfully wrote data to Delta table at: {delta_table_path}")

def get_ga4_data(p_date_range, property_id):
    """Fetch one property's report for a date range and flatten it to row tuples.

    Logs the date range and property, runs the report with the module-level
    ``metrics`` and returns the result of ``extract_data``, tagging each row
    with the range's end date.
    """
    print(f"Date Range: {p_date_range.start_date} to {p_date_range.end_date}")
    print(f"Property ID: {property_id}")
    return extract_data(
        get_report(property_id, p_date_range, metrics),
        property_id,
        p_date_range.end_date,
    )

def extract_data(response, property_id, p_end_date):
    """Flatten a report response into ``(brand, device, sessions, week_end)`` tuples.

    Args:
        response: Object whose ``rows`` each expose ``dimension_values`` and
            ``metric_values``; the first entry of each is read.
        property_id: Key looked up in ``property_brand``; unknown IDs are
            labelled ``Unknown-<id>``.
        p_end_date: Value stored unchanged as the last tuple element.

    Returns:
        A list with one tuple per response row (empty if there are no rows).
    """
    return [
        (
            property_brand.get(property_id, f"Unknown-{property_id}"),
            report_row.dimension_values[0].value,
            int(report_row.metric_values[0].value),  # metric values arrive as text
            p_end_date,
        )
        for report_row in response.rows
    ]



# Call and process

In [36]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from pyspark.sql.functions import col, lit, regexp_replace, to_timestamp, date_format, to_utc_timestamp
from builtins import min  # Ensure Python's min is used

# Lock shared by all worker threads so that Delta appends run one at a time.
delta_write_lock = Lock()

def safe_write(df):
    """Append ``df`` to the gold Delta table while holding ``delta_write_lock``."""
    delta_write_lock.acquire()
    try:
        write_to_delta(df, gold_delta_table_path_wolverine_API)
    finally:
        # Always hand the lock back, even if the write raises.
        delta_write_lock.release()

#New Logic replace Canada as CA
from pyspark.sql.functions import col, regexp_replace, date_format, to_timestamp, to_utc_timestamp, lit

def process_date_range(k):
    """Fetch GA4 sessions for every property for one fiscal week and append them to Delta.

    Each property in ``property_list`` is queried for the week described by
    ``k``. A failure for one property no longer aborts the remaining ones: it
    is logged, the loop continues, and the rows of all successful properties
    are written with a single append for the week.

    Args:
        k: Row-like object with ``weekbegindate`` and ``weekenddate``
            attributes that provide ``isoformat()``.

    Raises:
        RuntimeError: After all properties were attempted (and the successful
            rows were written), if one or more properties failed.
    """
    date_range = DateRange(
        start_date=k.weekbegindate.isoformat(),
        end_date=k.weekenddate.isoformat()
    )
    print(f"\nProcessing Date Range: {date_range.start_date} to {date_range.end_date}")

    week_rows = []
    failed_properties = []
    for property_index, property_id in enumerate(property_list, start=1):
        print(f"  Property {property_index} of {len(property_list)}: {property_id} - {property_brand.get(property_id, 'Unknown')}")

        # Thread-safe: get data as return value. Isolate failures so one bad
        # property does not drop every property that follows it.
        try:
            data = get_ga4_data(date_range, property_id)
        except Exception as e:
            failed_properties.append(property_id)
            print(f"    Failed to fetch property {property_id} on {date_range.start_date}: {e}")
            continue

        if data:
            week_rows.extend(data)
        else:
            print(f"    No data returned for property {property_id} on {date_range.start_date}")

    if week_rows:
        # One DataFrame and one append per week instead of one per property.
        df = spark.createDataFrame(data=week_rows, schema=ga4_schema)

        final_df = (
            df
            .withColumnRenamed("device_category", "device_type")
            # Brand label -> key: spaces become '-', and "-Canada" becomes "-CA".
            .withColumn("BrandCountryKey", regexp_replace(col("brand"), " ", "-"))
            .withColumn("BrandCountryKey", regexp_replace(col("BrandCountryKey"), "-Canada", "-CA"))
            .withColumn("calday", date_format(to_timestamp("week_end"), "yyyyMMdd"))
            .withColumn("est_date", to_timestamp("week_end"))
            # est_date is interpreted as America/New_York wall time and converted to UTC.
            .withColumn("gmt_date", to_utc_timestamp(col("est_date"), "America/New_York"))
            .withColumn("source", lit(None).cast("string"))
            .withColumn("medium", lit(None).cast("string"))
            .select("BrandCountryKey", "calday", "est_date", "gmt_date", "sessions", "device_type", "source", "medium")
        )

        safe_write(final_df)

    if failed_properties:
        raise RuntimeError(
            f"{len(failed_properties)} of {len(property_list)} properties failed for "
            f"{date_range.start_date} to {date_range.end_date}: {failed_properties}"
        )


# Multi thread Execution

In [37]:
# Thread pool config - safe for medium Spark driver: one worker per fiscal
# week, capped at 8.
if gua_dates_range:
    num_threads = min(8, len(gua_dates_range))

    # Execute threads
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(process_date_range, k) for k in gua_dates_range]

        for future in as_completed(futures):
            try:
                future.result()
            except Exception as e:
                # Best effort: a failed week is reported and the others continue.
                print(f"Error during threaded processing: {e}")
else:
    # ThreadPoolExecutor raises ValueError for max_workers=0, so skip the pool
    # entirely when the date query returned no fiscal weeks.
    num_threads = 0
    print("No fiscal weeks to process; skipping GA4 extraction.")

# Read Back Data and Deduplicate
If the API extraction above is run more than once for the same weeks, the table will contain duplicate rows. The following code reads the table, removes rows that are identical in every column, and writes the result back.

In [38]:
def read_from_delta(delta_table_path: str):
    """Load the Delta table at ``delta_table_path``.

    Returns:
        The loaded DataFrame, or ``None`` if loading raised an exception (the
        error is reported with ``print`` rather than re-raised).
    """
    try:
        table_df = spark.read.format("delta").load(delta_table_path)
        print(f"Successfully read data from Delta table at: {delta_table_path}")
    except Exception as read_error:
        print(f"Failed to read from Delta table at {delta_table_path}: {read_error}")
        table_df = None
    return table_df

def deduplicate_and_overwrite_all_columns(delta_table_path: str):
    """Rewrite the Delta table without rows that are identical in every column.

    Does nothing when the table cannot be read (``read_from_delta`` returned
    ``None``). A failing overwrite is reported with ``print`` and not re-raised.
    """
    source_df = read_from_delta(delta_table_path)
    if source_df is None:
        return

    # dropDuplicates() without arguments compares whole rows.
    unique_rows = source_df.dropDuplicates()
    try:
        unique_rows.write.format("delta").mode("overwrite").save(delta_table_path)
    except Exception as write_error:
        print(f"Failed to overwrite Delta table at {delta_table_path}: {write_error}")
    else:
        print(f"Successfully overwrote Delta table at: {delta_table_path} with deduplicated data")




# Deduplicate with Total Sessions Rank
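Removing exact duplicates is not enough when the session count for the same week changes between runs, because those rows differ in `sessions`. For each `BrandCountryKey`, `calday` and `device_type` we therefore keep only the row with the highest number of sessions.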


In [39]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

def deduplicate_with_top_rank(delta_table_path: str):
    """Keep one row per BrandCountryKey / calday / device_type: the one with the most sessions.

    The table is read, rows are ranked by ``sessions`` (descending) within
    each key, the top-ranked row per key is kept and the table is overwritten
    with the result. If several rows of a key share the highest session count,
    which one is kept is not determined by this code.

    Does nothing (apart from a log line) when the table cannot be read. A
    failing overwrite is reported with ``print`` and not re-raised.
    """
    df_wolverine_api = read_from_delta(delta_table_path)
    # read_from_delta returns None on failure; without this guard the
    # withColumn call below would fail with AttributeError.
    if df_wolverine_api is None:
        print(f"Skipping top-rank deduplication; could not read Delta table at {delta_table_path}")
        return

    # Define the window specification
    window_spec = Window.partitionBy("BrandCountryKey", "calday", "device_type") \
                        .orderBy(col("sessions").desc())

    # Assign row numbers within each partition
    ranked_df = df_wolverine_api.withColumn("row_num", row_number().over(window_spec))

    # Filter to keep only the top row per group
    df_top_sessions = ranked_df.filter(col("row_num") == 1).drop("row_num")
    try:
        df_top_sessions.write.format("delta").mode("overwrite").save(delta_table_path)
        print(f"Successfully overwrote Delta table at: {delta_table_path} with deduplicated data")
    except Exception as e:
        print(f"Failed to overwrite Delta table at {delta_table_path}: {e}")



In [40]:
# Pass 1 drops rows that are identical in every column; pass 2 then keeps a
# single row (highest sessions) per BrandCountryKey / calday / device_type.
deduplicate_and_overwrite_all_columns(gold_delta_table_path_wolverine_API)
deduplicate_with_top_rank(gold_delta_table_path_wolverine_API)

# Duplicate Verification

In [41]:
from pyspark.sql.functions import col

def verify_no_duplicates(delta_table_path: str):
    """Print whether the Delta table contains rows that are identical in every column.

    Compares the total row count with the distinct row count and prints both
    plus a verdict. Any exception is reported with ``print`` and not re-raised.
    """
    try:
        table_df = spark.read.format("delta").load(delta_table_path)
        n_total = table_df.count()
        n_distinct = table_df.distinct().count()

        print(f"Total rows: {n_total}")
        print(f"Distinct rows: {n_distinct}")

        # Equal counts mean no row appears more than once.
        verdict = (
            "No duplicates found in the Delta table."
            if n_total == n_distinct
            else "Duplicates still exist in the Delta table."
        )
        print(verdict)
    except Exception as e:
        print(f"Error verifying duplicates in Delta table at {delta_table_path}: {e}")

In [42]:
# Print total vs. distinct row counts for the gold table.
verify_no_duplicates(gold_delta_table_path_wolverine_API)