## Revision History

In [None]:
# Change_date         revision_number     change_description                           author
# 02/08/2024          1                   initial check-in                             Kranthi
# 05/03/2024          2                    made it dynamic to load for GUA and GA4      Kranthi
## Approach: first load a single date, generate the schema using printSchema, and create its StructType for all the fields;
## then start loading all partitions. Create the static schema first and then load all partitions in parallel.
## The following approaches were tried; the first three gave errors:
## 1. without a schema definition and without an empty table - load all partitions in parallel - protocol version error
## 2. first load 20230205 and then load all partitions in parallel - metadata changed by concurrent update error
## 3. create empty table with schema defined - then load all partitions in parallel - metadata changed by concurrent update error
## 4. create empty table with schema defined - load single partition 20240501 - then load all other partitions - worked successfully
## Similarly for GUA: create the schema and load the single partition 20230204 - then load all other partitions.

In [None]:
import concurrent.futures
from delta import *
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, LongType, DoubleType, BooleanType, MapType,IntegerType
from pyspark.sql.functions import *
from functools import reduce
from pyspark.sql.dataframe import DataFrame
import pyspark.sql.functions as F
import json
import base64
from datetime import datetime,timedelta
from time import sleep
# Session-level Spark settings for this notebook.
# Dynamic partition overwrite only matters for overwrite-mode writes; the writes visible in this
# notebook use mode("append").
spark.conf.set("spark.sql.sources.partitionOverwriteMode","DYNAMIC")
# Enables Delta schema auto-merge for the session (the ingest write also passes mergeSchema=true).
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled","true")

In [None]:
%run /utils/common_functions

## define GA4 Events table schema and create empty table

In [None]:
# Static schema for the GA4 events Delta table (raw.ga4_events).
# It is used below only to create an empty table before any partition is loaded.
# All fields are declared nullable.
ga4_schema = StructType([
    StructField("event_date", StringType(), True),
    StructField("event_timestamp", LongType(), True),
    StructField("event_name", StringType(), True),
    # Map of string key -> struct with one slot per value type.
    StructField("event_params", MapType(StringType(), StructType([
        StructField("string_value", StringType(), True),
        StructField("int_value", LongType(), True),
        StructField("float_value", DoubleType(), True),
        StructField("double_value", DoubleType(), True)
    ])), True),
    StructField("event_previous_timestamp", LongType(), True),
    StructField("event_value_in_usd", DoubleType(), True),
    StructField("event_bundle_sequence_id", LongType(), True),
    StructField("event_server_timestamp_offset", LongType(), True),
    StructField("user_id", StringType(), True),
    StructField("user_pseudo_id", StringType(), True),
    StructField("privacy_info", StructType([
        StructField("analytics_storage", StringType(), True),
        StructField("ads_storage", StringType(), True),
        StructField("uses_transient_token", StringType(), True)
    ]), True),
    # Same value struct as event_params plus set_timestamp_micros.
    StructField("user_properties", MapType(StringType(), StructType([
        StructField("string_value", StringType(), True),
        StructField("int_value", LongType(), True),
        StructField("float_value", DoubleType(), True),
        StructField("double_value", DoubleType(), True),
        StructField("set_timestamp_micros", LongType(), True)
    ])), True),
    StructField("user_first_touch_timestamp", LongType(), True),
    StructField("user_ltv", StructType([
        StructField("revenue", DoubleType(), True),
        StructField("currency", StringType(), True)
    ]), True),
    StructField("device", StructType([
        StructField("category", StringType(), True),
        StructField("mobile_brand_name", StringType(), True),
        StructField("mobile_model_name", StringType(), True),
        StructField("mobile_marketing_name", StringType(), True),
        StructField("mobile_os_hardware_model", StringType(), True),
        StructField("operating_system", StringType(), True),
        StructField("operating_system_version", StringType(), True),
        StructField("vendor_id", StringType(), True),
        StructField("advertising_id", StringType(), True),
        StructField("language", StringType(), True),
        # String here; gua_schema declares this field as BooleanType.
        StructField("is_limited_ad_tracking", StringType(), True),
        StructField("time_zone_offset_seconds", LongType(), True),
        StructField("browser", StringType(), True),
        StructField("browser_version", StringType(), True),
        StructField("web_info", StructType([
            StructField("browser", StringType(), True),
            StructField("browser_version", StringType(), True),
            StructField("hostname", StringType(), True)
        ]), True)
    ]), True),
    StructField("geo", StructType([
        StructField("city", StringType(), True),
        StructField("country", StringType(), True),        
        StructField("continent", StringType(), True),
        StructField("region", StringType(), True),        
        StructField("sub_continent", StringType(), True),
        StructField("metro", StringType(), True)
    ]), True),
    StructField("app_info", StructType([
        StructField("id", StringType(), True),
        StructField("version", StringType(), True),
        StructField("install_store", StringType(), True),
        StructField("firebase_app_id", StringType(), True),
        StructField("install_source", StringType(), True)
    ]), True),
    StructField("traffic_source", StructType([
        StructField("name", StringType(), True),
        StructField("medium", StringType(), True),
        StructField("source", StringType(), True)
    ]), True),
    StructField("stream_id", StringType(), True),
    StructField("platform", StringType(), True),
    StructField("event_dimensions", StructType([
        StructField("hostname", StringType(), True)
    ]), True),
    StructField("ecommerce", StructType([
        StructField("total_item_quantity", LongType(), True),
        StructField("purchase_revenue_in_usd", DoubleType(), True),
        StructField("purchase_revenue", DoubleType(), True),
        StructField("refund_value_in_usd", DoubleType(), True),
        StructField("refund_value", DoubleType(), True),
        StructField("shipping_value_in_usd", DoubleType(), True),
        StructField("shipping_value", DoubleType(), True),
        StructField("tax_value_in_usd", DoubleType(), True),
        StructField("tax_value", DoubleType(), True),
        StructField("unique_items", LongType(), True),
        StructField("transaction_id", StringType(), True),
        # NOTE(review): `items` is nested inside `ecommerce` here, while gua_schema declares
        # `items` as a top-level column - confirm against printSchema() of the loaded data.
        StructField("items", ArrayType(StructType([
            StructField("item_id", StringType(), True),
            StructField("item_name", StringType(), True),
            StructField("item_brand", StringType(), True),
            StructField("item_variant", StringType(), True),
            StructField("item_category", StringType(), True),
            StructField("item_category2", StringType(), True),
            StructField("item_category3", StringType(), True),
            StructField("item_category4", StringType(), True),
            StructField("item_category5", StringType(), True),
            StructField("price_in_usd", DoubleType(), True),
            StructField("price", DoubleType(), True),
            StructField("quantity", LongType(), True),
            StructField("item_revenue_in_usd", DoubleType(), True),
            StructField("item_revenue", DoubleType(), True),
            StructField("item_refund_in_usd", DoubleType(), True),
            StructField("item_refund", DoubleType(), True),
            StructField("coupon", StringType(), True),
            StructField("affiliation", StringType(), True),
            StructField("location_id", StringType(), True),
            StructField("item_list_id", StringType(), True),
            StructField("item_list_name", StringType(), True),
            StructField("item_list_index", StringType(), True),
            StructField("promotion_id", StringType(), True),
            StructField("promotion_name", StringType(), True),
            StructField("creative_name", StringType(), True),
            StructField("creative_slot", StringType(), True),
            StructField("item_params", MapType(StringType(), StructType([
                StructField("string_value", StringType(), True),
                StructField("int_value", LongType(), True),
                StructField("float_value", DoubleType(), True),
                StructField("double_value", DoubleType(), True)
            ])), True)
        ])), True)
    ]), True),
    StructField("collected_traffic_source", StructType([
        StructField("manual_campaign_id", StringType(), True),
        StructField("manual_campaign_name", StringType(), True),
        StructField("manual_source", StringType(), True),
        StructField("manual_medium", StringType(), True),
        StructField("manual_term", StringType(), True),
        StructField("manual_content", StringType(), True),
        StructField("gclid", StringType(), True),
        StructField("dclid", StringType(), True),
        StructField("srsltid", StringType(), True)
    ]), True),
    StructField("is_active_user", BooleanType(), True),
    # The last three columns are not read from BigQuery: ingest_bq_data adds them with withColumn().
    # property_brand and brand_country are also partition columns of the table.
    StructField("property_brand", StringType(), True),
    StructField('brand_country', StringType(), True),
    StructField('lineage_source',StringType(), True)
   # 
    
])

# Build a zero-row DataFrame that carries ga4_schema and expose it to Spark SQL as a temp view.
empty_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), ga4_schema)
empty_df.createOrReplaceTempView("ga4_events_temp")

# CTAS from the empty view: creates the partitioned Delta table at the raw GA4 location
# only when raw.ga4_events is not registered yet.
create_ga4_events_sql = f"""
CREATE TABLE if not exists raw.ga4_events
USING DELTA
PARTITIONED BY (event_date,property_brand,brand_country)
LOCATION '{raw_adls_path}GA4/events/'
as select * from ga4_events_temp
"""
spark.sql(create_ga4_events_sql)








## define GUA schema

In [None]:
# Static schema for the GUA (events_from_ua_*) Delta table (raw.gua_events).
# It is used below only to create an empty table before any partition is loaded.
# Several field types differ from ga4_schema; the differences are called out inline.
gua_schema = StructType([
    StructField("event_date", StringType(), nullable=True),
    StructField("event_timestamp", LongType(), nullable=True),
    StructField("event_name", StringType(), nullable=True),
    # No explicit nullable argument on this field, so StructField's default applies.
    StructField("event_params", MapType(StringType(), 
        StructType([
            StructField("string_value", StringType(), nullable=True),
            StructField("int_value", LongType(), nullable=True),
            StructField("double_value", DoubleType(), nullable=True),
            StructField("float_value", DoubleType(), nullable=True)
        ]))),
    StructField("event_previous_timestamp", LongType(), nullable=True),
    # Long here; ga4_schema declares this field as DoubleType.
    StructField("event_value_in_usd",  LongType(), nullable=True),
    StructField("event_bundle_sequence_id", LongType(), nullable=True),
    StructField("event_server_timestamp_offset", LongType(), nullable=True),
    StructField("user_id", StringType(), nullable=True),
    StructField("user_pseudo_id", StringType(), nullable=True),
    StructField("privacy_info", StructType([
        StructField("ads_storage", StringType(), nullable=True),
        StructField("analytics_storage", StringType(), nullable=True),
        StructField("uses_transient_token", StringType(), nullable=True)
    ]), nullable=True),
    # Unlike ga4_schema, the value struct has no set_timestamp_micros field.
    StructField("user_properties", MapType(StringType(), 
        StructType([
            StructField("string_value", StringType(), nullable=True),
            StructField("int_value", LongType(), nullable=True),
            StructField("double_value", DoubleType(), nullable=True),
            StructField("float_value", DoubleType(), nullable=True)
        ]))),
    StructField("user_first_touch_timestamp", LongType(), nullable=True),
    StructField("user_ltv", StructType([
        StructField("revenue", DoubleType(), nullable=True),
        StructField("currency", StringType(), nullable=True)
    ]), nullable=True),
    StructField("device", StructType([
        StructField("category", StringType(), nullable=True),
        StructField("mobile_brand_name", StringType(), nullable=True),
        StructField("mobile_model_name", StringType(), nullable=True),
        StructField("mobile_marketing_name", StringType(), nullable=True),
        StructField("mobile_os_hardware_model", StringType(), nullable=True),
        StructField("operating_system", StringType(), nullable=True),
        StructField("operating_system_version", StringType(), nullable=True),
        StructField("vendor_id", StringType(), nullable=True),
        StructField("advertising_id", StringType(), nullable=True),
        StructField("language", StringType(), nullable=True),
        StructField("time_zone_offset_seconds", LongType(), nullable=True),
        # Boolean here; ga4_schema declares this field as StringType.
        StructField("is_limited_ad_tracking", BooleanType(), nullable=True),
        StructField("web_info", StructType([
            StructField("browser", StringType(), nullable=True),
            StructField("browser_version", StringType(), nullable=True),
            StructField("hostname", StringType(), nullable=True)
        ]), nullable=True)
    ]), nullable=True),
    StructField("geo", StructType([
        StructField("continent", StringType(), nullable=True),
        StructField("sub_continent", StringType(), nullable=True),
        StructField("country", StringType(), nullable=True),
        StructField("region", StringType(), nullable=True),
        StructField("metro", StringType(), nullable=True),
        StructField("city", StringType(), nullable=True)
    ]), nullable=True),
    StructField("app_info", StructType([
        StructField("id", StringType(), nullable=True),
        StructField("version", StringType(), nullable=True),
        StructField("install_store", StringType(), nullable=True),
        StructField("firebase_app_id", StringType(), nullable=True),
        StructField("install_source", StringType(), nullable=True)
    ]), nullable=True),
    StructField("traffic_source", StructType([
        StructField("name", StringType(), nullable=True),
        StructField("medium", StringType(), nullable=True),
        StructField("source", StringType(), nullable=True)
    ]), nullable=True),
    StructField("stream_id", StringType(), nullable=True),
    StructField("platform", StringType(), nullable=True),
    StructField("event_dimensions", StructType([
        StructField("hostname", StringType(), nullable=True)
    ]), nullable=True),
    StructField("ecommerce", StructType([
        StructField("total_item_quantity", LongType(), nullable=True),
        StructField("purchase_revenue_in_usd", DoubleType(), nullable=True),
        StructField("purchase_revenue", DoubleType(), nullable=True),
        StructField("refund_value_in_usd", DoubleType(), nullable=True),
        StructField("refund_value", DoubleType(), nullable=True),
        StructField("shipping_value_in_usd", DoubleType(), nullable=True),
        StructField("shipping_value", DoubleType(), nullable=True),
        StructField("tax_value_in_usd", DoubleType(), nullable=True),
        StructField("tax_value", DoubleType(), nullable=True),
        StructField("transaction_id", StringType(), nullable=True),
        StructField("unique_items", LongType(), nullable=True)
    ]), nullable=True),
    # `items` is a top-level column here (ga4_schema nests it inside `ecommerce`) and has no item_params.
    # Several members are LongType where ga4_schema uses StringType or DoubleType
    # (item_category2-5, the price/revenue/refund fields, location_id, item_list_index, creative_slot).
    StructField("items", ArrayType(
        StructType([
            StructField("item_id", StringType(), nullable=True),
            StructField("item_name", StringType(), nullable=True),
            StructField("item_brand", StringType(), nullable=True),
            StructField("item_variant", StringType(), nullable=True),
            StructField("item_category", StringType(), nullable=True),
            StructField("item_category2", LongType(), nullable=True),
            StructField("item_category3", LongType(), nullable=True),
            StructField("item_category4", LongType(), nullable=True),
            StructField("item_category5", LongType(), nullable=True),
            StructField("price_in_usd", LongType(), nullable=True),
            StructField("price", LongType(), nullable=True),
            StructField("quantity", LongType(), nullable=True),
            StructField("item_revenue_in_usd", LongType(), nullable=True),
            StructField("item_revenue", LongType(), nullable=True),
            StructField("item_refund_in_usd", LongType(), nullable=True),
            StructField("item_refund", LongType(), nullable=True),
            StructField("coupon", StringType(), nullable=True),
            StructField("affiliation", StringType(), nullable=True),
            StructField("location_id", LongType(), nullable=True),
            StructField("item_list_id", StringType(), nullable=True),
            StructField("item_list_name", StringType(), nullable=True),
            StructField("item_list_index", LongType(), nullable=True),
            StructField("promotion_id", StringType(), nullable=True),
            StructField("promotion_name", StringType(), nullable=True),
            StructField("creative_name", StringType(), nullable=True),
            StructField("creative_slot", LongType(), nullable=True)
        ])
    ), nullable=True),
    StructField("collected_traffic_source", StructType([
        StructField("manual_campaign_id", StringType(), nullable=True),
        StructField("manual_campaign_name", StringType(), nullable=True),
        StructField("manual_source", StringType(), nullable=True),
        StructField("manual_medium", StringType(), nullable=True),
        StructField("manual_term", StringType(), nullable=True),
        StructField("manual_content", StringType(), nullable=True),
        StructField("gclid", StringType(), nullable=True),
        StructField("dclid", StringType(), nullable=True),
        StructField("srsltid", StringType(), nullable=True)
    ]), nullable=True),
    StructField("is_active_user", BooleanType(), nullable=True),
    # Struct that exists only in this schema (ga4_schema has no ua_details).
    StructField("ua_details", StructType([
        StructField("hit_type", StringType(), nullable=True),
        StructField("hit_id", StringType(), nullable=True),
        StructField("session_id", StringType(), nullable=True),
        StructField("hit_number", LongType(), nullable=True),
        StructField("hit_time", LongType(), nullable=True),
        StructField("ecommerce_action_type", StringType(), nullable=True),
        StructField("is_interaction", BooleanType(), nullable=True),
        StructField("event_category", StringType(), nullable=True),
        StructField("event_action", StringType(), nullable=True),
        StructField("event_label", StringType(), nullable=True)
    ]), nullable=True),
    # The last three columns are not read from BigQuery: ingest_bq_data adds them with withColumn().
    # property_brand and brand_country are also partition columns of the table.
    StructField("property_brand", StringType(), nullable=True),
    StructField("brand_country", StringType(), nullable=True),
    StructField("lineage_source", StringType(), nullable=True)
])




# Build a zero-row DataFrame that carries gua_schema and expose it to Spark SQL as a temp view.
empty_df_gua = spark.createDataFrame(spark.sparkContext.emptyRDD(), gua_schema)
empty_df_gua.createOrReplaceTempView("gua_events_temp")

# CTAS from the empty view: creates the partitioned Delta table at the raw GUA location
# only when raw.gua_events is not registered yet.
create_gua_events_sql = f"""
CREATE TABLE if not exists raw.gua_events
USING DELTA
PARTITIONED BY (event_date,property_brand,brand_country)
LOCATION '{raw_adls_path}GA4/GUA/events/'
as select * from gua_events_temp
"""
spark.sql(create_gua_events_sql)

## define a dict for all brands

In [None]:
# property_brand  = {309713727:'Bates US'
# ,309561205:'CatFootwear Canada'
# ,309621431:'CatFootwear DE'
# ,309599233:'CatFootwear EMEAEmerging'
# ,309613069:'CatFootwear UK'
# ,309592771:'CatFootwear US'
# ,309589213:'Chacos US'
# ,309561793:'Grasshoppers US'
# ,309607030:'Harley-DavidsonFootwear US'
# ,309616895:'HushPuppies Canada'
# ,309650146:'HushPuppies US'
# ,309619975:'Hytest US'
# ,309607124:'Keds Canada'
# ,309609458:'Keds US'
# ,309606225:'Merrell BE'
# ,309607810:'Merrell Canada'
# ,309615440:'Merrell DE'
# ,309628914:'Merrell EMEAEmerging'
# ,309620743:'Merrell ES'
# ,309579645:'Merrell FR'
# ,309621834:'Merrell NL'
# ,309591477:'Merrell SE'
# ,309599329:'Merrell UK'
# ,309592118:'OnlineShoes US'
# ,309626252:'Prokeds US'
# ,310709711:'Saucony AT'
# ,309600562:'Saucony BE'
# ,309596198:'Saucony Canada'
# ,309624452:'Saucony DE'
# ,309643444:'Saucony EMEAEmerging'
# ,309629089:'Saucony ES'
# ,309607948:'Saucony FR'
# ,309537329:'Saucony IT'
# ,309603632:'Saucony NL'
# ,309587387:'Saucony UK'
# #,428511278:'Server-side GTM'
# ,309650574:'Sperry Canada'
# ,309632059:'Sperry US'
# ,309599656:'Wolverine Canada'
# ,309591384:'Wolverine US'
# ,297479542:'Saucony US'
# ,309617502:'Merrell US'}

# Property id -> "<brand> <country>" label.
# The id is the numeric suffix of the BigQuery dataset (analytics_<id>); ingest_bq_data splits the
# label on spaces and uses the first token as property_brand and the last token as brand_country.
# Entries kept as comments are properties flagged as having no GUA (or no GA4/GUA) data.
property_brand = {
    309713727: 'Bates US',
    309561205: 'CatFootwear Canada',
    309621431: 'CatFootwear DE',
    309599233: 'CatFootwear EMEAEmerging',
    309613069: 'CatFootwear UK',
    309592771: 'CatFootwear US',
    309589213: 'Chacos US',
    # 309561793: 'Grasshoppers US' -- GUA  not available
    # 309607030: 'Harley-DavidsonFootwear US' -- GUA  not available
    309616895: 'HushPuppies Canada',
    309650146: 'HushPuppies US',
    309619975: 'Hytest US',
    309607124: 'Keds Canada',
    # 309609458: 'Keds US' -- GUA  not available
    309606225: 'Merrell BE',
    309607810: 'Merrell Canada',
    309615440: 'Merrell DE',
    309628914: 'Merrell EMEAEmerging',
    309620743: 'Merrell ES',
    309579645: 'Merrell FR',
    309621834: 'Merrell NL',
    309591477: 'Merrell SE',
    309599329: 'Merrell UK',
    309592118: 'OnlineShoes US',
    # 309626252: 'Prokeds US' -- GUA  not available
    310709711: 'Saucony AT',
    309600562: 'Saucony BE',
    309596198: 'Saucony Canada',
    309624452: 'Saucony DE',
    309643444: 'Saucony EMEAEmerging',
    309629089: 'Saucony ES',
    309607948: 'Saucony FR',
    309537329: 'Saucony IT',
    309603632: 'Saucony NL',
    309587387: 'Saucony UK',
    # 428511278: 'Server-side GTM' -- GA4 and GUA  not available
    309650574: 'Sperry Canada',
    309632059: 'Sperry US',
    309599656: 'Wolverine Canada',
    309591384: 'Wolverine US',
    297479542: 'Saucony US',
    309617502: 'Merrell US',
}

In [None]:
# l_schemata_gua = ['analytics_297479542'
# ,'analytics_309537329'
# ,'analytics_309561205'
# ,'analytics_309579645'
# ,'analytics_309587387'
# ,'analytics_309589213'
# ,'analytics_309591384'
# ,'analytics_309591477'
# ,'analytics_309592118'
# ,'analytics_309592771'
# ,'analytics_309596198'
# ,'analytics_309599233'
# ,'analytics_309599329'
# ,'analytics_309599656'
# ,'analytics_309600562'
# ,'analytics_309603632'
# ,'analytics_309606225'
# ,'analytics_309607124'
# ,'analytics_309607810'
# ,'analytics_309607948'
# ,'analytics_309613069'
# ,'analytics_309615440'
# ,'analytics_309616895'
# ,'analytics_309617502'
# ,'analytics_309619975'
# ,'analytics_309620743'
# ,'analytics_309621431'
# ,'analytics_309621834'
# ,'analytics_309624452'
# ,'analytics_309628914'
# ,'analytics_309629089'
# ,'analytics_309632059'
# ,'analytics_309643444'
# ,'analytics_309650146'
# ,'analytics_309650574'
# ,'analytics_309713727'
# ,'analytics_310709711'
# ,'analytics_312016432'
# ,'analytics_312455238']

# l2 = property_brand.keys()
# l2 = [297479542, 309537329, 309561205, 309561793, 309579645, 309587387
# , 309589213, 309591384, 309591477, 309592118, 309592771, 309596198, 309599233
# , 309599329, 309599656, 309600562, 309603632, 309606225, 309607030, 309607124
# , 309607810, 309607948, 309609458, 309613069, 309615440, 309616895, 309617502
# , 309619975, 309620743, 309621431, 309621834, 309624452, 309626252, 309628914
# , 309629089, 309632059, 309643444, 309650146, 309650574, 309713727, 310709711]
# l_schemata_gua = [int(x.split('_')[1]) for x in l_schemata_gua]
# #print(sorted(l_schemata))
# l_schemata_gua = [297479542, 309537329, 309561205, 309579645, 309587387, 309589213, 309591384
# , 309591477, 309592118, 309592771, 309596198, 309599233, 309599329, 309599656, 309600562
# , 309603632, 309606225, 309607124, 309607810, 309607948, 309613069, 309615440, 309616895
# , 309617502, 309619975, 309620743, 309621431, 309621834, 309624452, 309628914, 309629089
# , 309632059, 309643444, 309650146, 309650574, 309713727, 310709711, 312016432, 312455238]
# #print('aa',l_schemata)
# #print(sorted(l2))
# print(set(l_schemata_gua)-set(l2))#{312016432, 312455238}
# print(set(l2)-set(l_schemata_gua))# {309561793, 309609458, 309626252, 309607030}

In [None]:
# GCP project that hosts the analytics_<property_id> BigQuery datasets read by this notebook.
project = "ga360-connection-267115"

# Read the secret "GA4-credentials" from Key Vault through the Synapse TokenLibrary.
# Argument order: key-vault name, secret name, linked service.
# kv_name is not defined in this notebook - presumably it comes from /utils/common_functions.
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary
ga4_credentials = token_library.getSecret(kv_name, "GA4-credentials", "ls_kv_adap")
# Deliberately not printed: the value is passed to the BigQuery connector as its `credentials`
# option, and cell output is persisted with the notebook.

In [None]:

# Settings for the BigQuery connector: get_property_partitions passes a SQL query to load(), and
# query-based reads need views enabled plus a dataset where the query result can be materialized.
spark.conf.set("viewsEnabled","true")
#spark.conf.set("materializationDataset","ga360-connection-267115.analytics_297479542.events_20240201")
spark.conf.set("materializationDataset","analytics_297479542")
def get_property_partitions(p_property_id, p_gua_flag):
    """List the BigQuery event tables to ingest for one property.

    Queries INFORMATION_SCHEMA.TABLES of dataset ``analytics_<p_property_id>`` and keeps the
    tables whose date suffix falls in the requested range.

    Args:
        p_property_id: numeric property id (suffix of the BigQuery dataset name).
        p_gua_flag: 'Y' selects ``events_from_ua_*`` tables dated before 20230205;
            any other value selects ``events_2023*`` / ``events_2024*`` tables dated after 20230204.

    Returns:
        list of ``(table_name, p_property_id)`` tuples, one per matching table.
    """
    print('get partitions for property::',p_property_id, 'p_gua_flag::',p_gua_flag)
    if p_gua_flag == 'Y':
        yr = '<20230205'
        table_name = " (table_name like 'events_from_ua%' )"
        suffix_index = 3  # events_from_ua_<yyyymmdd> -> date is the 4th '_' token
    else:
        yr = '>20230204'
        # NOTE(review): only 2023 and 2024 tables are matched; later years are silently skipped.
        table_name = " (table_name like 'events_2023%' or table_name like 'events_2024%')"
        suffix_index = 1  # events_<yyyymmdd> -> date is the 2nd '_' token
    # SAFE_OFFSET / SAFE_CAST yield NULL instead of raising. BigQuery does not guarantee that the
    # LIKE filter runs before this expression, so a name such as events_intraday_<date> would
    # otherwise abort the whole query with a bad-cast or out-of-range error.
    projection1 = f"safe_cast(split(table_name,'_')[SAFE_OFFSET({suffix_index})] as INT64)"
    print('yr::',yr)
    df_info_schema = (
        spark.read.format("bigquery")
        .option("credentials", ga4_credentials)
        .option("parentProject", project)
        .load(f"""select {projection1} as partition_no, *
            from `{project}.analytics_{p_property_id}.INFORMATION_SCHEMA.TABLES`
            where table_schema = 'analytics_{p_property_id}'
            and {table_name}
            and {projection1}{yr}
       """)
    )
    #display(df_info_schema)
    return [(row.table_name, p_property_id) for row in df_info_schema.collect()]


## test the spark read of GA4 data - project_id.dataset_id.table_name

In [None]:
# Smoke test: read a single GA4 daily export table (project_id.dataset_id.table_name) and display it.
table = 'ga360-connection-267115.analytics_297479542.events_20240501'
df = (
    spark.read.format("bigquery")
    .option("parentProject", project)
    .option("credentials", ga4_credentials)
    .option("table", table)
    .load()
)

display(df)
#missing columns for some tables:: collected_traffic_source, is_active_user
#  .load(f"""SELECT 
#   event_date,
#   event_timestamp,
#   event_name,
#   event_params,
#   event_previous_timestamp,
#   event_value_in_usd,
#   event_bundle_sequence_id,
#   event_server_timestamp_offset,
#   user_id,
#   user_pseudo_id,
#   privacy_info,
#   user_properties,
#   user_first_touch_timestamp,
#   user_ltv,
#   device,
#   geo,
#   app_info,
#   traffic_source,
#   stream_id,
#   platform,
#   event_dimensions,
#   ecommerce,
#   items
#   --collected_traffic_source,
#  -- is_active_user
# FROM 
#   `ga360-connection-267115.analytics_297479542.events_20230205`
# """)
#.option("table", table)\  

In [None]:

# Module-level switches that ingest_bq_data reads at call time:
#   ga4_or_gau     - sub-folder under raw_adls_path that receives the Delta files ("<sub-folder>/events")
#   lineage_source - literal written into the lineage_source column of every ingested row
ga4_or_gau = 'GA4'
#ga4_or_gau = 'GA4/history'
lineage_source = 'GA4'
#lineage_source = 'GUA'
def ingest_bq_data(p_table_name,p_property_id):
  """Copy one BigQuery table into the raw Delta location.

  Reads ``<project>.analytics_<p_property_id>.<p_table_name>``, adds the property_brand,
  brand_country and lineage_source columns, and appends the rows to the Delta path
  ``<raw_adls_path><ga4_or_gau>/events`` partitioned by event_date, property_brand and
  brand_country. The target folder and lineage value come from the module-level
  ``ga4_or_gau`` / ``lineage_source`` variables.

  Any exception is logged with the table and property id, then re-raised.
  """
  try:
    print('p_table_name::',p_table_name,'p_property_id::',p_property_id)

    # Label format is "<brand> <country>": first token is the brand, last token the country.
    label_tokens = property_brand[p_property_id].split(' ')
    brand, brand_country = label_tokens[0], label_tokens[-1]

    source_table = f'{project}.analytics_{p_property_id}.{p_table_name}'
    bq_rows = (
        spark.read.format("bigquery")
        .option("parentProject", project)
        .option("table", source_table)
        .option("credentials", ga4_credentials)
        .load()
    )

    tagged_rows = (
        bq_rows
        .withColumn('property_brand', lit(brand))
        .withColumn('brand_country', lit(brand_country))
        .withColumn('lineage_source', lit(lineage_source))
    )

    partition_cols = ['event_date', 'property_brand', 'brand_country']
    target_path = f"{raw_adls_path}{ga4_or_gau}/events"
    (
        tagged_rows.repartition(*partition_cols)
        .write.format("delta")
        .mode("append")
        .option("path", target_path)
        .option("mergeSchema", "true")
        .partitionBy(*partition_cols)
        .save()
    )
  except Exception as e:
    print('Other exception::','p_table_name::',p_table_name,'p_property_id::',p_property_id,'::',str(e))
    raise


## Seed the GA4 table: if the raw location is still empty, load a single partition (events_20240501) first

In [None]:
# ## create Merrell sample dataset
# sample_table = f'{project}.analytics_309617502.events_20231108'
# brand_sample = property_brand[309617502]
# Seed step for GA4: when the raw GA4 Delta location holds no rows, load one partition
# single-threaded before the parallel load is started.
cnt_data = spark.sql(f'select count(*) from delta.`{raw_adls_path}GA4/events/`')
ga4_row_count = cnt_data.collect()[0][0]
if ga4_row_count == 0:
  # Point the shared ingest switches at the GA4 location before calling ingest_bq_data.
  ga4_or_gau = 'GA4'
  lineage_source = 'GA4'
  print('count is 0 at raw location..creating table..')
  ingest_bq_data('events_20240501',297479542)

# sample_table = f'{project}.analytics_297479542.events_from_ua_20200203'
# brand_sample = property_brand[297479542]
# df = spark.read.format("bigquery") \
#     .option("parentProject",project)\
#     .option("table", sample_table) \
#     .option("credentials",ga4_credentials) \
#     .load()\
#     .where("event_name IN ('view_item', 'add_to_cart') ")
# df = df.withColumn('property_brand',lit(brand_sample))\
#        .withColumn('lineage_source',lit('GAU'))

# spark.sql('drop table if exists raw.ga4_events_history')
# df.repartition('event_date','property_brand')\
#             .write.mode("append").format("delta")\
#             .option("path",f"{raw_adls_path}GA4/history/events")\
#             .partitionBy('event_date','property_brand')\
#             .option("mergeSchema", "true")\
#             .saveAsTable('raw.ga4_events_history')







## Seed the GUA table: load a single events_from_ua partition (events_from_ua_20230204) first

In [None]:
# Point the shared ingest switches at the GUA location. This is done unconditionally because
# ingest_bq_data reads these module-level names for every later GUA load.
ga4_or_gau = 'GA4/GUA'
lineage_source = 'GUA'

gua_events_path = f'{raw_adls_path}GA4/GUA/events/'

# Seed only when the location has no rows yet. The write is an append, so re-running this cell
# without the guard would duplicate the 20230204 partition.
gua_has_rows = (
    DeltaTable.isDeltaTable(spark, gua_events_path)
    and spark.read.format('delta').load(gua_events_path).limit(1).count() > 0
)
if not gua_has_rows:
    print('GUA count is 0 at raw location..creating table..')
    ingest_bq_data('events_from_ua_20230204',297479542) ##usd is in long

# Show the schema that the GUA Delta location ended up with.
df_gua = spark.read.format('delta').load(gua_events_path)
df_gua.printSchema()
#display(df_gua)
#events_from_ua_20230108 p_property_id:: 309713727
# cnt_data =spark.sql('select count(*) from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/GUA/events/`')#if cnt_data.collect()[0][0] ==0:
# if cnt_data.collect()[0][0] ==0:
#     ga4_or_gau = 'GA4/GUA'
#     lineage_source = 'GUA'
#     print('GUA count is 0 at raw location..creating table..')
#     ingest_bq_data('events_from_ua_20230204',297479542)

# MAIN Call - get data from BQ and write to ADLS

In [None]:
def load_all_data(p_gua_flag):
  """Ingest every remaining BigQuery event table for all properties in ``property_brand``.

  For each property the table list comes from ``get_property_partitions``; the seed table that
  was loaded single-threaded beforehand is skipped, and the rest are ingested with a pool of
  six threads.

  Args:
      p_gua_flag: 'Y' loads the GUA ``events_from_ua_*`` tables into ``GA4/GUA``;
          any other value loads the GA4 daily tables into ``GA4``.

  Raises:
      Whatever ``ingest_bq_data`` raises; the first failing table aborts the load.
  """
  # ingest_bq_data reads these module-level names. Without `global` the assignments below would
  # create unused locals and the data would go to whichever location an earlier cell selected.
  global ga4_or_gau, lineage_source
  if p_gua_flag == 'Y':
    ga4_or_gau = 'GA4/GUA'
    lineage_source = 'GUA'
    # Must match the table name used by the GUA seed load, otherwise that partition is appended twice.
    seed_table = ('events_from_ua_20230204', 297479542)
  else:
    ga4_or_gau = 'GA4'
    lineage_source = 'GA4'
    seed_table = ('events_20240501', 297479542)

  for property_id, brand_label in property_brand.items():
    print('processing:: ',brand_label )
    # De-duplicate, drop the already-loaded seed table, and sort for a reproducible order.
    table_list = sorted(set(get_property_partitions(property_id, p_gua_flag)) - {seed_table})
    if not table_list:
      continue
    table_names, property_ids = zip(*table_list)
    with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
      # list() drains the iterator so an exception raised in a worker surfaces here.
      list(executor.map(ingest_bq_data, table_names, property_ids))
   


## call load_all_data()

In [None]:
# 'Y' selects the GUA (events_from_ua_*) tables; any other value selects the GA4 daily tables.
load_all_data('Y')

# Validate the counts

In [None]:
%%sql
-- select event_date , event_params, event_params.product_price.double_value
-- ,items.item_id
--  ,items.quantity
--  ,items.item_params
--  from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events/` limit 10
--refresh table delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events/`;
-- Row count per (event_date, property_brand, brand_country), read directly from the GA4 Delta path.
-- NOTE(review): the storage account is hard-coded here, whereas the Python cells build the path from raw_adls_path.
select event_date,property_brand,brand_country,count(*) 
from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events/`
group by event_date,property_brand,brand_country;

In [None]:
%%sql
-- select count(*) from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events/`
-- Creates raw.ga4_events from the rows at the GA4 Delta path, but only if the table does not exist.
-- NOTE(review): an earlier cell already creates raw.ga4_events with an explicit LOCATION, in which
-- case this statement does nothing; no LOCATION or PARTITIONED BY is given here - confirm intent.
create table if not exists raw.ga4_events
USING DELTA select * from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events/`

In [None]:
%%sql
--select count(*) from raw.ga4_events where event_date = 20240501
--select count(*) from raw.ga4_events where event_date between 20240421 and 20240427
--and event_name = 'session_start';

-- Distinct user_pseudo_id values with a session_start event in the week 20240421-20240427,
-- split by device category. The alias says session_count, but the expression counts distinct users.
-- NOTE(review): event_date is declared as a string in ga4_schema and is compared with integer
-- literals here, so this relies on an implicit cast.
select device.category, count(distinct user_pseudo_id)  as session_count
from raw.ga4_events where event_date between 20240421 and 20240427
and event_name = 'session_start'
    group by device.category
--select distinct event_name from raw.ga4_events where event_date between 20240421 and 20240427

In [None]:
%%sql
-- Row count per event_date across all brands in raw.ga4_events.
select event_date, count(*) 
from  raw.ga4_events
group by event_date

In [None]:
%%sql
-- show create table raw.ga4_events 
-- describe detail delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events/`
-- Earliest event_date per property_brand at the GA4/history Delta path.
-- NOTE(review): no visible cell writes to GA4/history (the GUA load targets GA4/GUA) - confirm this path is still current.
select property_brand, min(event_date) 
from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/history/events/`
group by property_brand

In [None]:
# Print the schema stored at the GA4/history Delta path.
history_reader = spark.read.format('delta')
df_gua = history_reader.load('abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/history/events/')
df_gua.printSchema()


# Counts validation of GUA data

In [None]:
%%sql
-- select property_brand,brand_country,count(*) from raw.gua_events
-- where event_date = '20230204' 
-- group by property_brand, brand_country;

-- Row count of the single GUA partition used as the seed load
-- (events_from_ua_20230204 for property 297479542, labelled 'Saucony US').
select count(*) from raw.gua_events 
where event_date = '20230204' and property_brand = 'Saucony'
    and brand_country = 'US'