## Revision History

In [None]:
# Change_date         revision_number     change_description                           author
# 02/08/2024          1                   initial check-in                             Kranthi
# 05/03/2024          2                    made it dynamic to load for GUA and GA4      Kranthi
## First load a single date, generate the schema using printSchema, and create its StructType for all the fields;
## then start loading all partitions. Create the static schema first and then load all partitions in parallel.
## The following approaches were tried (1-3 gave errors, 4 worked):
## 1. Without a schema definition and without an empty table - load all partitions in parallel - protocol version error.
## 2. First load 20230205 and then load all partitions in parallel - "metadata changed by concurrent update" error.
## 3. Create an empty table with the schema defined - then load all partitions in parallel - "metadata changed by concurrent update" error.
## 4. Create an empty table with the schema defined - load single partition 20240501 - then load all other partitions - worked successfully.
## Similarly for GUA, create the schema and load single partition 20230204 - then load all other partitions.

In [None]:
import concurrent.futures
from delta import *
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, LongType, DoubleType, BooleanType, MapType,IntegerType
from pyspark.sql.functions import *
from functools import reduce
from pyspark.sql.dataframe import DataFrame
import pyspark.sql.functions as F
import json
import base64
from datetime import datetime,timedelta
from time import sleep
# Dynamic partition overwrite: an overwrite-mode write replaces only the partitions it touches.
# (The write in ingest_bq_data uses append mode, so this matters only for overwrite-mode writes.)
spark.conf.set("spark.sql.sources.partitionOverwriteMode","DYNAMIC")
# Allow Delta writes to evolve the table schema automatically when new columns appear.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled","true")

In [None]:
%run /utils/common_functions

## Define a dict mapping each GA4 property ID to its brand and country

In [None]:
# Maps each GA4 property id (the numeric suffix of the BigQuery `analytics_<id>` dataset)
# to a '<Brand> <Country>' label. ingest_bq_data splits the label on spaces and uses the
# first token as the brand and the last token as the country.
property_brand  = {309713727:'Bates US'
,309561205:'CatFootwear Canada'
,309621431:'CatFootwear DE'
,309599233:'CatFootwear EMEAEmerging'
,309613069:'CatFootwear UK'
,309592771:'CatFootwear US'
,309589213:'Chacos US'
,309616895:'HushPuppies Canada'
,309650146:'HushPuppies US'
,309619975:'Hytest US'
,309607124:'Keds Canada'
,309606225:'Merrell BE'
,309607810:'Merrell Canada'
,309615440:'Merrell DE'
,309628914:'Merrell EMEAEmerging'
,309620743:'Merrell ES'
,309579645:'Merrell FR'
,309621834:'Merrell NL'
,309591477:'Merrell SE'
,309599329:'Merrell UK'
,309592118:'OnlineShoes US'
,310709711:'Saucony AT'
,309600562:'Saucony BE'
,309596198:'Saucony Canada'
,309624452:'Saucony DE'
,309643444:'Saucony EMEAEmerging'
,309629089:'Saucony ES'
,309607948:'Saucony FR'
,309537329:'Saucony IT'
,309603632:'Saucony NL'
,309587387:'Saucony UK'
,309650574:'Sperry Canada'
,309632059:'Sperry US'
,309599656:'Wolverine Canada'
,309591384:'Wolverine US'
,309617502:'Merrell US'
}

In [None]:
# GCP project that hosts the GA4 BigQuery export datasets.
project = "ga360-connection-267115"

# Synapse TokenLibrary, reached through the Spark JVM gateway, is used to read secrets.
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary
#token_library.getSecret("kv-name", "secret-name", "linked-service")
# NOTE(review): kv_name is not defined in the visible cells; presumably it comes from
# `%run /utils/common_functions` - confirm.
ga4_credentials = token_library.getSecret(kv_name, "GA4-credentials", "ls_kv_adap")
# Never echo the secret itself: cell output is stored with the notebook and would leak
# the service-account credentials to anyone who can open it. Only confirm it was loaded.
print('GA4 credentials loaded:', bool(ga4_credentials))

In [None]:

# Allow the BigQuery connector to read views / run SQL queries; get_property_partitions
# passes a SELECT statement to .load(), which relies on this.
spark.conf.set("viewsEnabled","true")
#spark.conf.set("materializationDataset","ga360-connection-267115.analytics_297479542.events_20240201")
# Dataset the connector uses to materialize query results.
# NOTE(review): this is one fixed property's dataset, reused for every property - confirm intended.
spark.conf.set("materializationDataset","analytics_297479542")
def get_property_partitions(p_property_id): 
    """List the daily events tables available for one GA4 property.

    Runs a query against the ``INFORMATION_SCHEMA.TABLES`` view of the
    ``analytics_<p_property_id>`` dataset and keeps tables named
    ``events_2023*`` or ``events_2024*`` whose numeric suffix (the part after
    the first underscore) is greater than 20231108.

    Args:
        p_property_id: GA4 property id; selects the ``analytics_<id>`` dataset.

    Returns:
        A list of ``(table_name, p_property_id)`` tuples, one per matching table.
    """
    print('get partitions for property::',p_property_id)
    date_filter = '>20231108' 
    table_filter = " (table_name like 'events_2023%' or table_name like 'events_2024%')" 
    # BigQuery expression that turns the 'YYYYMMDD' table suffix into an integer.
    suffix_expr = "cast(split(table_name,'_')[OFFSET(1)] as INT64)"
    print('yr::',date_filter)

    query = f"""select {suffix_expr} as partition_no, * 
            from `ga360-connection-267115.analytics_{p_property_id}.INFORMATION_SCHEMA.TABLES`
            where table_schema = 'analytics_{p_property_id}'   
            and {table_filter}     
            and {suffix_expr}{date_filter}
       """

    reader = spark.read.format("bigquery")
    reader = reader.option("credentials",ga4_credentials)
    reader = reader.option("parentProject",'ga360-connection-267115')
    df_info_schema = reader.load(query)
    #display(df_info_schema)

    # Pair every table name with its property id so the caller can fan the pairs out.
    return [(row.table_name, p_property_id) for row in df_info_schema.collect()]


In [None]:

# Sub-folder under raw_adls_path that ingest_bq_data writes the events to.
ga4_or_gau = 'GA4'
#ga4_or_gau = 'GA4/history'
# Value ingest_bq_data stamps into the lineage_source column of every row.
lineage_source = 'GA4'
#lineage_source = 'GUA'
def ingest_bq_data(p_table_name,p_property_id):
  """Copy one BigQuery events table into the raw Delta events folder.

  Reads ``<project>.analytics_<p_property_id>.<p_table_name>``, adds the
  ``property_brand``, ``brand_country`` and ``lineage_source`` columns, and
  appends the rows to the Delta table at ``<raw_adls_path><ga4_or_gau>/events``,
  partitioned by event_date, property_brand and brand_country.

  Args:
      p_table_name: Name of the BigQuery table inside the property's dataset.
      p_property_id: GA4 property id; must be a key of ``property_brand``.

  Raises:
      Exception: any failure is logged with the table/property and re-raised.
  """
  try:
    print('p_table_name::',p_table_name,'p_property_id::',p_property_id)

    # Labels look like '<Brand> <Country>': first token is the brand, last the country.
    label_parts = property_brand[p_property_id].split(' ')
    brand = label_parts[0]
    brand_country = label_parts[-1]

    source_table = f'{project}.analytics_{p_property_id}.{p_table_name}'
    partition_cols = ['event_date','property_brand','brand_country']

    events = (
        spark.read.format("bigquery")
        .option("parentProject",project)
        .option("table", source_table)
        .option("credentials",ga4_credentials)
        .load()
    )

    tagged_events = (
        events
        .withColumn('property_brand',lit(brand))
        .withColumn('brand_country',lit(brand_country))
        .withColumn('lineage_source',lit(lineage_source))
    )

    # Repartition on the partition columns before the partitioned append.
    (
        tagged_events.repartition(*partition_cols)
        .write.format("delta")
        .mode("append")
        .option("path",f"{raw_adls_path}{ga4_or_gau}/events")
        .option("mergeSchema", "true")
        .partitionBy(*partition_cols)
        .save()
    )
  except Exception as err:
    print('Other exception::','p_table_name::',p_table_name,'p_property_id::',p_property_id,'::',str(err)) 
    raise 


# MAIN Call - get data from BQ and write to ADLS

In [None]:
def load_all_data():
  """Ingest every discovered BigQuery events table for every property.

  For each property in ``property_brand`` this lists the matching tables with
  ``get_property_partitions`` and ingests them with ``ingest_bq_data``, up to
  six tables concurrently. Properties are processed one after another.

  Raises:
      Exception: the first ingest failure (in submission order) is re-raised
      after the current property's worker pool has shut down, which stops the
      remaining properties from being processed.
  """
  for property_id, brand_label in property_brand.items():
    print('processing:: ',brand_label )
    # De-duplicate once, and sort so the submission order (and the log) is reproducible.
    table_list = sorted(set(get_property_partitions(property_id)))
    if not table_list:
      print('no tables found for:: ',brand_label)
      continue
    #print(table_list)
    table_names, property_ids = zip(*table_list)
    with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
      # Drain the lazy map iterator: that is what surfaces worker exceptions here
      # instead of leaving them silently stored on the futures.
      list(executor.map(ingest_bq_data, table_names, property_ids))
   


## call load_all_data()

In [None]:
# Run the full load: every property in property_brand, up to six tables at a time.
load_all_data()

# Validate the counts

In [None]:
%%sql
-- select event_date , event_params, event_params.product_price.double_value
-- ,items.item_id
--  ,items.quantity
--  ,items.item_params
--  from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events/` limit 10
--refresh table delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events/`;
-- Row counts per event_date / brand / country in the GA4 events Delta folder.
-- NOTE(review): the storage path is hard-coded here, whereas the load writes to
-- raw_adls_path - confirm both point at the same location.
select event_date,property_brand,brand_country,count(*) 
from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events/`
group by event_date,property_brand,brand_country;

In [None]:
%%sql
-- select count(*) from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events/`
-- Creates raw.ga4_events from a SELECT over the GA4 events Delta folder.
-- IF NOT EXISTS makes this a no-op when the table already exists, so re-running
-- it does not pick up newly loaded data.
-- NOTE(review): there is no LOCATION clause, so this appears to copy the data
-- rather than register the folder as an external table - confirm that is intended.
create table if not exists raw.ga4_events
USING DELTA select * from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events/`

In [None]:
%%sql
--select count(*) from raw.ga4_events where event_date = 20240501
--select count(*) from raw.ga4_events where event_date between 20240421 and 20240427
--and event_name = 'session_start';

-- Distinct user_pseudo_id values with a session_start event, per device category,
-- for event_date 20240421 through 20240427.
-- NOTE(review): the column is aliased session_count but counts distinct
-- user_pseudo_id values - confirm which metric is intended.
select device.category, count(distinct user_pseudo_id)  as session_count
from raw.ga4_events where event_date between 20240421 and 20240427
and event_name = 'session_start'
    group by device.category
--select distinct event_name from raw.ga4_events where event_date between 20240421 and 20240427

In [None]:
%%sql
-- Total row count of raw.ga4_events.
-- (A stray, truncated "groue" token was removed; Spark was treating it as a table alias.)
select  count(*) 
from  raw.ga4_events

In [None]:
%%sql
-- show create table raw.ga4_events 
-- describe detail delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events/`
-- Earliest event_date per brand in the GA4/history events Delta folder.
select property_brand, min(event_date) 
from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/history/events/`
group by property_brand

In [None]:
# Load the GA4/history events Delta folder and print its schema for inspection.
# NOTE(review): the variable is named df_gua but the path is GA4/history - confirm
# this folder holds the GUA (history) data.
df_gua= spark.read.format('delta').load('abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/history/events/')
df_gua.printSchema()


In [None]:
%%sql
-- select count(*), event_date from raw.ga4_events
-- where property_brand =  'Merrell'
--     group by event_date; -- 423,365,870
-- select count(*) from raw.ga4_events
-- where property_brand =  'Merrell';

-- Row counts per brand / country in raw.ga4_events.
select count(*), property_brand, brand_country
 from raw.ga4_events
group by property_brand, brand_country;
  

# Counts validation of GUA data

In [None]:
%%sql
-- select property_brand,brand_country,count(*) from raw.gua_events
-- where event_date = '20230204' 
-- group by property_brand, brand_country;

-- Row count in raw.gua_events for brand Saucony, country US, event_date '20230204'.
select count(*) from raw.gua_events 
where event_date = '20230204' and property_brand = 'Saucony'
    and brand_country = 'US'