## Revision History

In [None]:
# Change_date         revision_number     change_description                           author
# 02/08/2024          1                   initial check-in                             Kranthi


In [None]:
import concurrent.futures
from delta import *
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, MapType, ArrayType
from pyspark.sql.functions import *
from functools import reduce
from pyspark.sql.dataframe import DataFrame
import pyspark.sql.functions as F
import json
import base64
from datetime import datetime,timedelta
from time import sleep
from pytz import timezone
spark.conf.set("spark.sql.sources.partitionOverwriteMode","DYNAMIC")

In [None]:
%run /utils/common_functions

# Define the GA4 property mappings and get the secret used to connect to BigQuery (BQ) from Azure Key Vault (KV)

In [None]:
# GCP project used both as the BigQuery parent project and as the prefix of the
# fully-qualified GA4 export tables (<project>.analytics_<property_id>.<table>).
project = "ga360-connection-267115"
#property_brand = {'Saucony':297479542, 'Merrell':309617502}

# GA4 properties that must NOT be loaded: property id -> '<brand> <country>'.
excluded_properties = {
    309713727: 'Bates US',
    309561793: 'Grasshoppers US',
    309607030: 'Harley-DavidsonFootwear US',
    309650146: 'HushPuppies US',
    309609458: 'Keds US',
    309607124: 'Keds Canada',
    309626252: 'Prokeds US',
    428511278: 'Server-side GTM',
    309650574: 'Sperry Canada',
    309632059: 'Sperry US',
    309599656: 'Wolverine Canada',
    309616895: 'HushPuppies Canada',
}

# Quoted, comma-separated names, ready to be spliced into a SQL `NOT IN (...)`.
# Kept under the name `exclusion_list` because later cells interpolate it into SQL.
exclusion_list = ','.join(f"'{name}'" for name in excluded_properties.values())
print("exclusion_list::",exclusion_list)

# GA4 properties to ingest: property id -> '<brand> <country>'.
# The ingest function splits the value on a space to derive brand and country,
# so each name is expected to contain exactly one space.
property_brand = {
    309561205: 'CatFootwear Canada',
    309621431: 'CatFootwear DE',
    309599233: 'CatFootwear EMEAEmerging',
    309613069: 'CatFootwear UK',
    309592771: 'CatFootwear US',
    309589213: 'Chacos US',
    309619975: 'Hytest US',
    309606225: 'Merrell BE',
    309607810: 'Merrell Canada',
    309615440: 'Merrell DE',
    309628914: 'Merrell EMEAEmerging',
    309620743: 'Merrell ES',
    309579645: 'Merrell FR',
    309621834: 'Merrell NL',
    309591477: 'Merrell SE',
    309599329: 'Merrell UK',
    309592118: 'OnlineShoes US',
    310709711: 'Saucony AT',
    309600562: 'Saucony BE',
    309596198: 'Saucony Canada',
    309624452: 'Saucony DE',
    309643444: 'Saucony EMEAEmerging',
    309629089: 'Saucony ES',
    309607948: 'Saucony FR',
    309537329: 'Saucony IT',
    309603632: 'Saucony NL',
    309587387: 'Saucony UK',
    309591384: 'Wolverine US',
    297479542: 'Saucony US',
    309617502: 'Merrell US',
}

# Reverse lookup: '<brand> <country>' -> property id.
pb_lkp = {name: property_id for property_id, name in property_brand.items()}

# Fetch the BigQuery credentials from Azure Key Vault through the Synapse token library.
# NOTE(review): `kv_name` is not defined in this notebook; presumably it comes from
# the %run of /utils/common_functions - confirm.
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary
#token_library.getSecret("kv-name", "secret-name", "linked-service")
ga4_credentials = token_library.getSecret(kv_name, "GA4-credentials", "ls_kv_adap")
# Never print the secret itself: notebook output is persisted and shared.
print("GA4 credentials retrieved::", bool(ga4_credentials))

# Daily/incremental load - the `dates_to_load` parameter should be a list of tuples: `[('events_YYYYMMDD', property_id)]`

In [None]:
# Daily / incremental load parameter: a list of (table_name, property_id) tuples,
# e.g. [('events_YYYYMMDD', property_id)]. Leave empty to let the notebook derive
# the dates itself.
dates_to_load = []
print("len of dates_to_load",len(dates_to_load))
print('dates_to_load is not null' if dates_to_load else 'dates_to_load is null')

## If no input dates are supplied, derive the dates between the latest loaded date (per brand/country) and today

In [None]:

# if isinstance(dates_to_load, str) and dates_to_load !='':
#   print('dates_to_load is string type- convert to list')
#   dates_to_load = eval(dates_to_load)
# elif  dates_to_load is None :
#   print('dates_to_load is None')  
#   dates_to_load = []   


 
# When no explicit tables were requested, build the list from the watermark:
# for every brand/country already present in the Delta dataset, queue every
# daily table after its latest loaded event_date and strictly before today.
# `not dates_to_load` covers None, '' and [] without calling len() on None.
if not dates_to_load:
    dates_to_load = []
    df = spark.sql(f"""select property_brand,brand_country, max(event_date) as event_date
    from delta.`{raw_adls_path}GA4/events`
    where lineage_source = 'GA4'
    and concat(property_brand,' ',brand_country) NOT IN ({exclusion_list})
    group by property_brand,brand_country""")
    watermark_vals = [(j.event_date,j.property_brand, j.brand_country) for j in  df.collect()]
    print("watermark_vals/existing max dates::",watermark_vals)

    # "Today" in US/Eastern truncated to midnight. It does not depend on the
    # watermark row, so it is computed once instead of once per iteration.
    end_dt_str = datetime.now(timezone('US/Eastern')).strftime("%Y%m%d")
    end_dt = datetime.strptime(end_dt_str,'%Y%m%d')

    for wm_event_date, wm_brand, wm_country in watermark_vals:
        wm_property_id = pb_lkp.get(wm_brand+' '+wm_country)
        if wm_property_id is None:
            # Brand/country exists in the dataset but is neither excluded nor
            # mapped in property_brand; skip it instead of failing with a bare KeyError.
            print('no property id mapped for::',wm_brand,wm_country,'- skipping')
            continue
        current_dt = datetime.strptime(wm_event_date,'%Y%m%d')+timedelta(days=1)
        # Strict '<' leaves today's table out.
        # NOTE(review): presumably because today's export is still incomplete - confirm.
        while current_dt < end_dt:
            dates_to_load.append(('events_'+current_dt.strftime('%Y%m%d'),wm_property_id))
            current_dt +=timedelta(days=1)
    print('dates_to_load IS NULL, get the dates between max date and today::',dates_to_load)
else:
    print("dates_to_load is not null::",dates_to_load)


# Ingest GA4 data using the BigQuery (BQ) connector

In [None]:
def ingest_bq_data(p_table_name,p_property_id):
  """Copy one GA4 BigQuery export table into the raw Delta `GA4/events` dataset.

  Args:
    p_table_name: BigQuery table name of the form '<table>_<partition>'
      (e.g. 'events_YYYYMMDD'); the part after the first underscore is used as
      the event_date value in the Delta `replaceWhere` predicate.
    p_property_id: GA4 property id; must be a key of `property_brand`. It selects
      the BigQuery dataset `analytics_<property_id>` and the brand/country tags.

  Side effects:
    Overwrites the (event_date, property_brand, brand_country) slice of the Delta
    table at `{raw_adls_path}GA4/events` with the rows read from BigQuery.

  Raises:
    Any exception whose text does not contain 'NullPointerException' is re-raised
    after being logged; exceptions that do contain it are logged and swallowed.
  """
  # Bind everything the except handler prints BEFORE entering the try block.
  # Otherwise an early failure (e.g. unknown property id) leaves these names
  # unbound and the handler raises UnboundLocalError, hiding the real error.
  tab_name = part_name = brand = brand_country = None
  try:
    print('p_table_name::',p_table_name,'p_property_id::',p_property_id)
    name_parts = p_table_name.split('_')
    tab_name = name_parts[0]
    part_name = name_parts[1]
    # property_brand values look like '<brand> <country>'.
    brand_parts = property_brand[p_property_id].split(' ')
    brand = brand_parts[0]
    brand_country = brand_parts[-1]
    print('part_name::',part_name,'table_name::',tab_name,'brand::',brand, 'brand_country::',brand_country)
    table = f'{project}.analytics_{p_property_id}.{p_table_name}'
    df = spark.read.format("bigquery") \
    .option("parentProject",project)\
    .option("table", table) \
    .option("credentials",ga4_credentials) \
    .load()
    # Tag every row with its origin so the shared Delta table can be filtered per property.
    df = df.withColumn('property_brand',lit(brand))\
           .withColumn('brand_country',lit(brand_country))\
           .withColumn('lineage_source',lit('GA4'))

    # replaceWhere limits the overwrite to this date/brand/country slice, so
    # re-running the same table replaces that slice instead of appending to it.
    df.repartition('event_date','property_brand','brand_country')\
    .write.format("delta")\
    .mode("overwrite")\
    .option("path",f"{raw_adls_path}GA4/events")\
    .option("mergeSchema", "true")\
    .option("replaceWhere", f"event_date={part_name} and property_brand='{brand}' and brand_country='{brand_country}'")\
    .partitionBy('event_date','property_brand','brand_country')\
    .save()
  except Exception as e:
    if 'NullPointerException' in str(e):
      # Best effort: log and carry on with the next table.
      # NOTE(review): presumably raised when the BQ table is missing or empty - confirm.
      print('NullPointerException for ::',p_table_name,'property_id::',p_property_id,'brand::',brand,'country::',brand_country,'part_name::',part_name,'::') 
    else:
      print('Other exception::','p_table_name::',p_table_name,'property_id::',p_property_id,'brand::',brand,'country::',brand_country,'part_name::',part_name,'::') 
      raise
      
   


In [None]:
# De-duplicate the requested (table_name, property_id) pairs, then ingest them
# one at a time; a re-raised error from ingest_bq_data stops the loop.
table_list = list(set(dates_to_load))
print("len(table_list)::",len(table_list))
for bq_table, bq_property_id in table_list:
  ingest_bq_data(bq_table, bq_property_id)


# Validate data

In [None]:
# Show row counts per event_date / brand / country for the dates just requested.
if not dates_to_load:
    print('no dates to load')    
else:
    # Split the (table_name, property_id) tuples into two parallel lists.
    unzip_dates, unzip_brands = (list(column) for column in zip(*dates_to_load))
    print(unzip_dates,"::",unzip_brands)
    # Table names look like 'events_YYYYMMDD'; keep only the part after the underscore.
    dates_filter = ','.join(table_name.split('_')[1] for table_name in unzip_dates)
    print('dates_filter::',dates_filter)
    df = spark.sql(f"""select event_date,property_brand,brand_country,count(*) cnt from raw.ga4_events
    where event_date IN ({dates_filter}) group by event_date,property_brand, brand_country""")
    display(df)

## Validate ADLS vs BQ counts - send an alert on any discrepancy

In [None]:
# spark.conf.set("viewsEnabled","true")
# #spark.conf.set("materializationDataset","ga360-connection-267115.analytics_297479542.events_20240201")
# spark.conf.set("materializationDataset","analytics_297479542")
# df = spark.read.format("bigquery") \
#     .option("parentProject",project)\
#     .option("credentials",ga4_credentials)\
#     .load("""select count(*), '309713727-Bates US'
# from `ga360-connection-267115.analytics_309713727.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309561205-CatFootwear Canada'
# from `ga360-connection-267115.analytics_309561205.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309621431-CatFootwear DE'
# from `ga360-connection-267115.analytics_309621431.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309599233-CatFootwear EMEA Emerging'
# from `ga360-connection-267115.analytics_309599233.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-CatFootwear UK'
# from `ga360-connection-267115.analytics_309613069.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Catfottwear US'
# from `ga360-connection-267115.analytics_309592771.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Chacos US'
# from `ga360-connection-267115.analytics_309589213.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Hushpuppies Canada'
# from `ga360-connection-267115.analytics_309616895.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Hushpuppies US'
# from `ga360-connection-267115.analytics_309650146.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Hytest US'
# from `ga360-connection-267115.analytics_309619975.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Keds canada'
# from `ga360-connection-267115.analytics_309607124.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Merrell BE'
# from `ga360-connection-267115.analytics_309606225.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Merrell Canada'
# from `ga360-connection-267115.analytics_309607810.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Merrell DE'
# from `ga360-connection-267115.analytics_309615440.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Merrell EMEAEmerging'
# from `ga360-connection-267115.analytics_309628914.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Merrell ES'
# from `ga360-connection-267115.analytics_309620743.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Merrell FR'
# from `ga360-connection-267115.analytics_309579645.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Merrell NL'
# from `ga360-connection-267115.analytics_309621834.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Merrell SE'
# from `ga360-connection-267115.analytics_309591477.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Merrell UK'
# from `ga360-connection-267115.analytics_309599329.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Onlineshoes US'
# from `ga360-connection-267115.analytics_309592118.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Saucony AT'
# from `ga360-connection-267115.analytics_310709711.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Saucony BE'
# from `ga360-connection-267115.analytics_309600562.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Saucony Canada'
# from `ga360-connection-267115.analytics_309596198.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Saucony DE'
# from `ga360-connection-267115.analytics_309624452.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Saucony EMEAEmerging'
# from `ga360-connection-267115.analytics_309643444.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Saucony ES'
# from `ga360-connection-267115.analytics_309629089.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Saucony BE'
# from `ga360-connection-267115.analytics_309600562.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Saucony FR'
# from `ga360-connection-267115.analytics_309607948.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Saucony IT'
# from `ga360-connection-267115.analytics_309537329.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Saucony NL'
# from `ga360-connection-267115.analytics_309603632.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Saucony UK'
# from `ga360-connection-267115.analytics_309587387.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Sperry Canada'
# from `ga360-connection-267115.analytics_309650574.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Sperry US'
# from `ga360-connection-267115.analytics_309632059.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Wolverine Canada'
# from `ga360-connection-267115.analytics_309599656.events_*`
# where _table_suffix >='20231109'
# union all
#  select count(*), '309713727-Wolverine US'
# from `ga360-connection-267115.analytics_309591384.events_*`
# where _table_suffix >='20231109'
# union all
# select count(*), '309617502-Merrell US'
# from `ga360-connection-267115.analytics_309617502.events_*`
# where _table_suffix >='20231109'
# union all
# select count(*), '297479542-Saucony US'
# from `ga360-connection-267115.analytics_297479542.events_*`
# where _table_suffix  >='20231109'""")
# display(df)

In [None]:
%%sql
-- Total row count per brand/country across everything in raw.ga4_events.
SELECT COUNT(*), property_brand, brand_country
FROM raw.ga4_events
GROUP BY property_brand, brand_country
ORDER BY property_brand;