# GA4 Sweaty Betty Load Intraday - Raw Data


**Revision History**<br>
Created 2/27/2025 Vish<br>
This notebook ingests raw Google Analytics 4 (GA4) data for Sweaty Betty from its BigQuery export and saves it as Parquet files in ADLS.


In [1]:
import concurrent.futures
from delta import *
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, LongType, DoubleType, BooleanType, MapType,IntegerType
from pyspark.sql.functions import *
from functools import reduce
from pyspark.sql.dataframe import DataFrame
import pyspark.sql.functions as F
import json
import base64
from datetime import datetime,timedelta
from time import sleep
# Session-wide Spark settings.
# NOTE(review): the only write visible in this notebook is an unpartitioned Parquet overwrite,
# so neither setting appears to affect it — presumably kept for consistency with sibling notebooks; confirm.
# Partition overwrite mode: DYNAMIC.
spark.conf.set("spark.sql.sources.partitionOverwriteMode","DYNAMIC")
# Delta schema auto-merge: enabled.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled","true")
from azure.storage.blob import BlobServiceClient
from pyspark.sql.functions import max as spark_max

In [2]:
# Shell out to pip for the Google client libraries imported in the next cell.
# NOTE(review): versions are unpinned, so different runs may resolve different releases — consider pinning.
!pip install google-cloud-bigquery
!pip install google-auth

In [3]:
from google.cloud import bigquery
from google.oauth2 import service_account

# Run the common functions

In [4]:
%run /utils/common_functions

# Retrieve Google BigQuery Credentials

In [5]:
# BigQuery project name for the Sweaty Betty GA4 link.
project = "sb-dw-ga-bigquery-link"

# Fetch the service-account key from Key Vault through the Synapse token library.
# The secret is base64 text (it is base64-decoded in the next cell).
# kv_name is not defined in this notebook — presumably it comes from /utils/common_functions; verify.
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary
ga4_credentials = token_library.getSecret(kv_name, "GA4-SweatyBetty", "ls_kv_adap")

# Never print the secret itself: cell output is stored with the notebook/run
# and is readable by anyone who can open it.
print("Retrieved GA4 credentials from Key Vault")

In [6]:
# The Key Vault secret is base64 text wrapping the service-account JSON key:
# base64-decode it to bytes, read the bytes as UTF-8, then parse the JSON.
raw_key_bytes = base64.b64decode(ga4_credentials)
decoded_key_file_content = raw_key_bytes.decode('utf-8')
key_file_dict = json.loads(decoded_key_file_content)

# Build service-account credentials from the parsed key and create a BigQuery
# client that is bound to the project named inside that key.
credentials = service_account.Credentials.from_service_account_info(key_file_dict)
print("Trying to authenticate")
client = bigquery.Client(project=credentials.project_id, credentials=credentials)
print("Authenticated")
print("Project ID:", client.project)

In [7]:
# Map of dataset id -> list of table ids found in that dataset.
# NOTE(review): the following cell rebuilds bq_data with the same calls (plus a table
# count), so this cell repeats the BigQuery listing requests.
bq_data = {}

print("Trying to list datasets")
datasets = list(client.list_datasets())

if not datasets:
    print("No datasets found in the project.")
else:
    print("Datasets in the project:")
    for dataset in datasets:
        dataset_id = dataset.dataset_id
        print(f"- {dataset_id}")

        # An empty dataset simply yields an empty list here.
        tables = list(client.list_tables(dataset_id))
        table_names = [entry.table_id for entry in tables]

        bq_data[dataset_id] = table_names

# Get all tables in every dataset

In [8]:
# Map of dataset id -> list of table ids, plus a running total of tables seen.
bq_data = {}
table_count = 0

print("Trying to list datasets")
datasets = list(client.list_datasets())

if not datasets:
    print("No datasets found in the project.")
else:
    print("Datasets in the project:")
    for dataset in datasets:
        dataset_id = dataset.dataset_id
        print(f"- {dataset_id}")

        # An empty dataset simply yields an empty list here.
        tables = list(client.list_tables(dataset_id))
        table_names = [entry.table_id for entry in tables]
        bq_data[dataset_id] = table_names

        # Echo every table as <dataset>.<table>, then add this dataset's tables to the total.
        for table_name in table_names:
            print(f"{dataset_id}.{table_name}")
        table_count += len(table_names)

    print(f"Total number of tables: {table_count}")

# Filter Tables for today

In [9]:
from datetime import datetime, timezone

# Today's date as YYYYMMDD, taken from an explicit UTC-aware clock
# (datetime.utcnow() is deprecated and returns a naive datetime).
# NOTE(review): the date suffix on the export tables may follow the GA4 property's
# reporting time zone rather than UTC — confirm the two agree around midnight.
today_date = datetime.now(timezone.utc).strftime("%Y%m%d")

# dataset id -> tables whose name contains today's date; datasets with no match are omitted.
filtered_bq_data = {}

for dataset_id, table_list in bq_data.items():
    filtered_tables = [table for table in table_list if today_date in table]

    if filtered_tables:
        filtered_bq_data[dataset_id] = filtered_tables

# Print filtered results
print("Filtered datasets and tables with today's date:")
for dataset, tables in filtered_bq_data.items():
    for table in tables:
        print(f"{dataset}.{table}")

# Make an empty selection visible: otherwise the load step below runs with nothing to do.
if not filtered_bq_data:
    print(f"No tables found for {today_date}.")


# Filter Tables for a Date Range

In [10]:
# Disabled alternative to the "today" filter above: the code below is wrapped in a
# string literal, so it does not execute. With the quotes removed it rebuilds
# filtered_bq_data from every table whose name contains an 8-digit YYYYMMDD date
# between start_date and end_date (inclusive), replacing today's selection.
'''
#This section can be uncommented and used to pull data for a specific range.
from datetime import datetime

# Define your date range (in YYYYMMDD format)
start_date = "20250401"
end_date = "20250417"

# Convert strings to datetime objects for comparison
start_dt = datetime.strptime(start_date, "%Y%m%d")
end_dt = datetime.strptime(end_date, "%Y%m%d")

# Dictionary to store filtered datasets
filtered_bq_data = {}

# Iterate through the original bq_data
for dataset_id, table_list in bq_data.items():
    filtered_tables = []

    for table in table_list:
        # Extract date from table name using 8-digit sequence
        for i in range(len(table) - 7):
            substr = table[i:i+8]
            if substr.isdigit():
                try:
                    table_date = datetime.strptime(substr, "%Y%m%d")
                    if start_dt <= table_date <= end_dt:
                        filtered_tables.append(table)
                        break  # Only add once per match
                except ValueError:
                    continue

    if filtered_tables:
        filtered_bq_data[dataset_id] = filtered_tables

# Print filtered results
print(f"Filtered datasets and tables between {start_date} and {end_date}:")
for dataset, tables in filtered_bq_data.items():
    for table in tables:
        print(f"{dataset}.{table}")
'''

In [11]:
# Path of a JSON file under the GA4_SweatyBetty folder; not referenced by any later cell in this notebook.
# NOTE(review): this path inserts "/" after raw_adls_path, while process_bigquery_tables joins
# raw_adls_path and base_folder with no separator — confirm whether raw_adls_path ends with "/".
# raw_adls_path is not defined in this notebook — presumably it comes from /utils/common_functions; verify.
json_blob_path =f"{raw_adls_path}/GA4_SweatyBetty/bigquery_datasets_tables.json"
# Folder name passed to process_bigquery_tables as the root for the saved tables.
base_folder = "GA4_SweatyBetty"

In [12]:
from pyspark.sql import SparkSession

def process_bigquery_tables(filtered_bq_data, raw_adls_path, base_folder, ga4_credentials,
                            parent_project="sb-dw-ga-bigquery-link", fail_on_error=False):
    """
    Copy every listed BigQuery table to ADLS as Parquet and report the outcome.

    Each table is read through the Spark "bigquery" data source and written, in
    overwrite mode, to
    ``{raw_adls_path}{base_folder}/{dataset}/{table name without its last "_" segment}/{table}``.
    A failure on one table is logged and does not stop the remaining tables.

    Parameters:
    filtered_bq_data (dict): Maps a BigQuery dataset id to the list of table names to copy.
    raw_adls_path (str): Base Azure Data Lake Storage (ADLS) path. It is concatenated
        directly with base_folder, so it must already end with a path separator.
    base_folder (str): Folder under raw_adls_path where the tables are saved.
    ga4_credentials (str): Credentials value handed to the connector's "credentials" option.
    parent_project (str): Value for the connector's "parentProject" option.
        Defaults to the project this notebook was written for.
    fail_on_error (bool): When True, raise RuntimeError after all tables have been
        attempted if any of them failed. Defaults to False (log only).

    Returns:
    None

    Raises:
    RuntimeError: Only when fail_on_error is True and at least one table failed.
    """
    # getOrCreate() returns the already-running session when there is one.
    spark = SparkSession.builder \
        .appName("BigQueryToParquet") \
        .getOrCreate()

    succeeded = 0
    failed_tables = []  # (dataset.table, error text) for every table that raised

    for dataset, tables in filtered_bq_data.items():
        for table_name in tables:
            qualified_name = f"{dataset}.{table_name}"
            try:
                print(f"Reading table: {qualified_name}")

                # Read from BigQuery
                df = (
                    spark.read.format("bigquery")
                    .option("credentials", ga4_credentials)
                    .option("parentProject", parent_project)
                    .option("dataset", dataset)
                    .option("table", table_name)
                    .load()
                )

                # Group tables by their name minus the last "_" segment,
                # e.g. events_20250325 -> events.
                dataset_name = "_".join(table_name.split("_")[:-1])

                # NOTE(review): no separator is inserted between raw_adls_path and
                # base_folder — confirm raw_adls_path ends with "/".
                output_folder = f"{raw_adls_path}{base_folder}/{dataset}/{dataset_name}/{table_name}"
                print(f"Saving table {qualified_name} to {output_folder}")

                df.write.format("parquet").mode("overwrite").save(output_folder)

                print(f"Successfully saved {qualified_name} to {output_folder}")
                succeeded += 1

            except Exception as e:
                # Best effort: remember the failure and carry on with the next table.
                failed_tables.append((qualified_name, str(e)))
                print(f"Error processing {qualified_name}: {str(e)}")

    # Report what actually happened instead of always claiming success.
    if failed_tables:
        print(f"Processed {succeeded} table(s) successfully; {len(failed_tables)} table(s) failed:")
        for failed_name, _ in failed_tables:
            print(f"  - {failed_name}")
        if fail_on_error:
            failed_names = ", ".join(name for name, _ in failed_tables)
            raise RuntimeError(f"Failed to process {len(failed_tables)} table(s): {failed_names}")
    elif succeeded:
        print("All tables processed successfully!")
    else:
        print("No tables to process.")



In [13]:
# Copy the tables selected in filtered_bq_data from BigQuery to ADLS as Parquet.
process_bigquery_tables(filtered_bq_data, raw_adls_path, base_folder, ga4_credentials)