# GA4 Flat Table Load
This notebook ingests flattened Google Analytics 4 (GA4) event data from a Google BigQuery table and saves it to the raw zone.

**Revision History**
Created 9/12/2024 Vish


In [None]:
import concurrent.futures
from delta import *
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, LongType, DoubleType, BooleanType, MapType,IntegerType
from pyspark.sql.functions import *
from functools import reduce
from pyspark.sql.dataframe import DataFrame
import pyspark.sql.functions as F
import json
import base64
from datetime import datetime,timedelta
from time import sleep
# Session-level Spark settings applied before any read or write in this notebook.
# DYNAMIC partition overwrite: an overwrite replaces only the partitions present in
# the data being written instead of the whole partitioned table.
spark.conf.set("spark.sql.sources.partitionOverwriteMode","DYNAMIC")
# Delta Lake automatic schema merge (schema evolution) setting.
# NOTE(review): no partitioned or Delta write is visible in this part of the
# notebook - confirm both settings are still required.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled","true")
from azure.storage.blob import BlobServiceClient
from pyspark.sql.functions import max as spark_max

# Run the common functions

In [None]:
%run /utils/common_functions

# Retrieve Google Big Query Credentials

In [None]:
# Google Cloud project used as the parent project for the BigQuery reads.
project = "ga360-connection-267115"

# Fetch the "GA4-credentials" secret from Key Vault through the Synapse token library.
# NOTE(review): kv_name is presumably defined by /utils/common_functions and
# "ls_kv_adap" is presumably the Key Vault linked service name - confirm.
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary
ga4_credentials = token_library.getSecret(kv_name, "GA4-credentials", "ls_kv_adap")

# Never print the secret itself: notebook output is stored with the notebook and is
# visible to anyone who can open it. Only report whether a value came back.
print(f"GA4 credentials retrieved: {bool(ga4_credentials)}")

# Set Load Type: Full or Incremental
If `load_type` is set to `Full`, we query all records from BigQuery, write them to the raw zone in Parquet format, and overwrite the existing data. A full load takes around 20 minutes.


If `load_type` is set to `Incremental`, we query only the records whose `event_timestamp` is newer than the watermark and append them to the existing data.

In [None]:
# Load mode for this run. Supported values are 'Full' and 'Incremental';
# assign 'Full' here instead to reload everything.
load_type = "Incremental"

# Initial watermark (an event_timestamp value). It is replaced further down by the
# value read from the watermark file.
water_mark_timestamp = 1_724_817_599_998_701

# Set folder location to save data to Raw

In [None]:
# Destination in the raw zone: the raw ADLS root path followed by the GA4 sub-folder.
# NOTE(review): raw_adls_path is presumably defined by /utils/common_functions - confirm.
base_folder = "GA4/www_prod"
output_folder = f"{raw_adls_path}{base_folder}"

# Show the resolved path so it can be checked before anything is written.
print(output_folder)

# Set the water_mark variable
<u>**We read the maximum event_timestamp recorded in the watermark file (`GA4/watermark.json`) and then use it to query only newer events from Google BigQuery.**</u>
### Get the storage account Key
The storage account key is stored in the Key Vault associated with the Synapse workspace. The secret name in the Key Vault is 'storage-key'.

In [None]:
# Name of the Key Vault secret that holds the storage account key.
keyvult_key = "storage-key"

# Read the account key from Key Vault.
# NOTE(review): kv_name is presumably defined by /utils/common_functions - confirm.
account_key = mssparkutils.credentials.getSecret(kv_name, keyvult_key, "ls_kv_adap")

# The account name is the text between '@' and the first '.' of the raw ADLS path.
storage_account_name = raw_adls_path.split("@")[1].partition(".")[0]

# Container that holds the raw-zone data.
container_name = "raw"

### Create Blob Client

In [None]:
# Client for the storage account's blob endpoint, authenticated with the account key.
blob_service_client = BlobServiceClient(
    account_url=f"https://{storage_account_name}.blob.core.windows.net",
    credential=account_key,
)

# Narrow the client down to the 'raw' container ...
container_name = "raw"
container_client = blob_service_client.get_container_client(container_name)

# ... and then to the JSON blob that stores the watermark.
blob_name = "GA4/watermark.json"
blob_client = container_client.get_blob_client(blob_name)

### Read Blob Data

In [None]:
# Download the whole watermark blob and decode it as JSON.
blob_data = blob_client.download_blob().readall()
blob_content = json.loads(blob_data)

# The stored max_event_timestamp becomes the watermark for this run;
# 0 is used when the key is absent from the file.
water_mark_timestamp = blob_content.get("max_event_timestamp", 0)

# Echo the watermark so the starting point of the load can be verified.
print(water_mark_timestamp)

# Load data in Dataframe

In [None]:
    # Reject anything other than the two supported modes up front. Without this
    # check no branch would run, and later cells would either fail on an undefined
    # df_WWW_Prod or silently reuse one left over from an earlier run in the session.
    if load_type not in ('Full', 'Incremental'):
        raise ValueError(
            f"Unsupported load_type {load_type!r}; expected 'Full' or 'Incremental'"
        )

    # Both modes read the same BigQuery table, so the reader is built once.
    # `project` holds the same project id that was previously repeated here as a literal.
    df_WWW_Prod = (
        spark.read.format("bigquery")
        .option("credentials", ga4_credentials)
        .option("parentProject", project)
        .option("dataset", "WWW_PROD")
        .option("table", "ga4_raw_events")
        .load()
    )

    if load_type == 'Incremental':
        # A null watermark would be rendered as "event_timestamp > None" in the
        # filter expression below, so fail with a clear message instead.
        if water_mark_timestamp is None:
            raise ValueError(
                "water_mark_timestamp is not set; cannot run an incremental load"
            )
        # Keep only the events that are newer than the stored watermark.
        df_WWW_Prod = df_WWW_Prod.filter(f"event_timestamp > {water_mark_timestamp}")

In [None]:
# Highest event_timestamp among the loaded rows; it is written back to the
# watermark file at the end of the notebook.
new_max_event_timestamp = df_WWW_Prod.agg(spark_max("event_timestamp")).first()[0]

# Show the value that will become the next watermark.
print(f"new_max_event_timestamp: {new_max_event_timestamp}")

In [None]:
# Number of rows in the DataFrame loaded above (displayed as the cell output).
df_WWW_Prod.count()
# Reference counts, presumably noted from earlier runs:
# Incremental count   - 108679151
# Full count          - 2010847393
# Incremental count 2 - 118607626
# Incremental count   - 258527943 (Oct-11-2024)

# Set folder location to save data to Raw

In [None]:
# Recompute the raw-zone destination right before the write (same values as earlier
# in the notebook): raw ADLS root path followed by the GA4 sub-folder.
base_folder = "GA4/www_prod"
output_folder = f"{raw_adls_path}{base_folder}"

# Show the path the data is about to be saved to.
print(output_folder)

# Save Data in Parquet format

In [None]:
# Spark save mode for each supported load type.
save_modes = {'Full': 'overwrite', 'Incremental': 'append'}
if load_type not in save_modes:
    # Previously an unknown load type silently wrote nothing.
    raise ValueError(
        f"Unsupported load_type {load_type!r}; expected 'Full' or 'Incremental'"
    )

if load_type == 'Incremental' and new_max_event_timestamp is None:
    # No new events existed when the watermark was computed, so there is nothing to
    # append. Writing anyway could store rows that the unchanged watermark would
    # cause the next incremental run to load again.
    print("No new events found; nothing was appended.")
else:
    df_to_write = df_WWW_Prod
    if new_max_event_timestamp is not None:
        # df_WWW_Prod is not cached, so this write re-executes the BigQuery read.
        # Cap the rows at the timestamp that will be stored as the watermark;
        # otherwise events that arrived after it was computed would be written now
        # and loaded a second time by the next incremental run. Rows without a
        # timestamp are kept so a full load does not drop them.
        df_to_write = df_WWW_Prod.filter(
            (F.col("event_timestamp") <= F.lit(new_max_event_timestamp))
            | F.col("event_timestamp").isNull()
        )
    df_to_write.write.format("parquet").mode(save_modes[load_type]).save(output_folder)


# Update the Watermark file
We write the new maximum event_timestamp to the watermark file so that the next incremental run pulls only the events that arrived after it.

In [None]:
if new_max_event_timestamp is None:
    # The load returned no rows, so the aggregated maximum is None. Storing it would
    # replace the watermark with null and break the next incremental run, so the
    # existing watermark file is left untouched.
    print("No rows were loaded; the watermark file was not updated.")
else:
    # Re-read the current watermark file so any other keys it holds are preserved.
    blob_data = blob_client.download_blob().readall()
    blob_content = json.loads(blob_data)

    # Record the newest event_timestamp that was loaded in this run.
    blob_content['max_event_timestamp'] = new_max_event_timestamp

    # Serialise the updated content and replace the existing blob with it.
    updated_blob_data = json.dumps(blob_content)
    blob_client.upload_blob(updated_blob_data, overwrite=True)