In [1]:
import pyspark.sql.functions as F
# Session-wide partition overwrite mode. The overwrite-mode write at the end of
# this notebook partitions by weekenddate/property_brand/brand_country; per the
# Spark docs, DYNAMIC limits an overwrite to the partitions present in the data
# being written.
spark.conf.set(
    "spark.sql.sources.partitionOverwriteMode",
    "DYNAMIC",
)

In [3]:
%run /utils/common_functions

In [4]:
# Switch on the adaptive-execution related session flags, in the listed order.
for conf_key in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
    "spark.sql.adaptive.skewJoin.enabled",
    "spark.databricks.adaptive.autoOptimizeShuffle.enabled",
):
    spark.conf.set(conf_key, "true")

## Get dates for the full load - comment this cell out after the first full load

In [4]:
## full_load_dates = """
##      SELECT DISTINCT  dd.fiscalweek as fiscalweek, dd.weekbegindate as weekbegindate, dd.weekenddate as weekenddate
##     FROM report.DateDim dd
##    where daydate between '2023-02-05' and '2024-05-28'
   
# """
# df_date = spark.read.format("jdbc")\
# .option("driver", jdbcDriver)\
# .option("url", jdbcUrl)\
# .option("query",full_load_dates)\
# .option("user", jdbcUsername)\
# .option("password", jdbcPassword)\
# .load()
# gua_dates_range = df_date.collect()
# fiscal_week = gua_dates_range[0][0]
# week_begin_date = gua_dates_range[0][1]
# week_end_date = gua_dates_range[0][2]
# print("fiscal_week::",fiscal_week,"week_begin_date::",week_begin_date,"week_end_date::",week_end_date) 
# display(df_date)
# df_date2 = df_date.createOrReplaceTempView('DateDim')
# df_date.printSchema()



## Incremental load - if no date range is passed, load the fiscal week that follows the latest weekenddate already in lakedb_gold.ga4_weekly_stats.

In [5]:
# Resolve the week(s) to load and expose them as the temp view `DateDim`
# (columns weekbegindate / weekenddate), which the load cell reads.
#
# Optional manual override: a list of (weekbegindate, weekenddate) pairs given
# as 'yyyy-MM-dd' strings, e.g.
#   date_ranges = [('2024-05-26','2024-06-01'),('2024-05-05','2024-05-11')]
#   date_ranges = [('2024-05-26','2024-06-01')]
# Leave it empty to derive the next week from what is already in the gold table.
date_ranges = []

print("param date_ranges::", date_ranges)
if not date_ranges:
    print('dates_to_load is null - get weekbegindate and weekenddate of the fiscal week after the latest loaded weekenddate')

    # Latest week already present in the gold table (a single scalar value).
    max_weekend_date = (
        spark.sql('select max(weekenddate) as max_weekenddate from lakedb_gold.ga4_weekly_stats')
        .collect()[0]
        .max_weekenddate
    )
    print(max_weekend_date)

    # max() over an empty table is NULL; interpolating it below would send
    # cast('None' as date) to the database and fail with an opaque JDBC error.
    if max_weekend_date is None:
        raise ValueError(
            "lakedb_gold.ga4_weekly_stats has no weekenddate yet; "
            "set date_ranges explicitly (or run the full load) before the incremental load"
        )

    # Pick the first DateDim day after the latest loaded weekenddate and return
    # the begin/end dates of the fiscal week that day belongs to.
    # NOTE(review): the upper bound on daydate is commented out in the query, so
    # the selected week may not have ended yet - confirm this is intended for
    # the job's schedule.
    query = f"""
        SELECT DISTINCT dd.fiscalyear, dd.fiscalweek, dd.weekbegindate, dd.weekenddate
        FROM report.DateDim dd
        JOIN (SELECT top 1 daydate, fiscalyear, fiscalweek as nextfiscalweek
              FROM report.DateDim
              WHERE DayDate > convert(varchar, cast('{max_weekend_date}' as date), 23)
              --and daydate <= cast(getdate() as date)
              order by daydate, fiscalyear, fiscalweek) t1
          ON dd.fiscalweek = t1.nextfiscalweek AND dd.fiscalyear = t1.fiscalyear
    """

    # jdbcDriver / jdbcUrl / jdbcUsername / jdbcPassword are not defined in this
    # notebook; presumably they come from the %run of /utils/common_functions.
    df_date = (
        spark.read.format("jdbc")
        .option("driver", jdbcDriver)
        .option("url", jdbcUrl)
        .option("query", query)
        .option("user", jdbcUsername)
        .option("password", jdbcPassword)
        .load()
    )
    ga4_dates_range = df_date.collect()
    print(ga4_dates_range)
else:
    print('dates_to_load is not null')
    df_columns = ['weekbegindate', 'weekenddate']
    df_date = spark.createDataFrame(data=date_ranges, schema=df_columns)
    # Convert the supplied strings into date columns.
    df_date = (
        df_date
        .withColumn('weekbegindate', F.to_date(F.date_format(df_date.weekbegindate, 'yyyy-MM-dd')))
        .withColumn('weekenddate', F.to_date(F.date_format(df_date.weekenddate, 'yyyy-MM-dd')))
    )

# Common to both branches. createOrReplaceTempView returns None, so its result
# is deliberately not assigned to a variable.
df_date.createOrReplaceTempView('DateDim')
df_date.printSchema()
display(df_date)
  

## Load the data using the dates in the DateDim temp view created above

In [6]:
# Show the week range(s) currently registered in the DateDim temp view.
display(spark.sql("select * from datedim"))

# Build the weekly metrics in long format, one row per
# (property_brand, brand_country, device category + metric, week):
#   dates      - distinct weeks from DateDim, formatted as 'yyyyMMdd' strings so
#                they can be compared with event_date
#   t_all_data - events whose event_date falls inside one of those weeks,
#                excluding device.category 'smart tv' (rows with a NULL
#                device.category are dropped by the same != predicate)
#   t1 sessions - distinct user_pseudo_id + ga_session_id combinations
#   t2 carts    - number of 'add_to_cart' events
#   t3 orders   - number of 'purchase' / 'transaction' events
#   t5 quantity - sum of items.quantity over 'purchase' events
#   t6 sales    - purchase_revenue minus shipping_value minus tax_value
#   t4          - union of the metric branches
# Every branch names its columns identically, so the UNION ALL no longer
# depends on column position alone. The BROADCAST(d) hints that used to sit in
# t5/t6 were removed: alias `d` only exists inside t_all_data, so those hints
# could not be matched to any relation there.
# NOTE(review): t1 reads `key` / `value` straight from explode(event_params),
# which are the default output names when exploding a map - presumably
# event_params is a map column in raw.ga4_events; confirm.
df_stats = spark.sql("""
with dates as (
    select /*+BROADCAST(DateDim)*/ distinct
           date_format(weekbegindate, 'yyyyMMdd') as weekbegindate
         , date_format(weekenddate, 'yyyyMMdd') as weekenddate
      from DateDim
)
, t_all_data as (
    select *
      from raw.ga4_events, dates d
     where event_date between d.weekbegindate and d.weekenddate
       and device.category != 'smart tv'
)
, t1 as (
    select property_brand
         , brand_country
         , device_category
         , weekbegindate
         , weekenddate
         , 'sessions' as metric
         , count(distinct concat(a.user_pseudo_id, a.value.int_value)) as cnt
      from (select property_brand
                 , brand_country
                 , device.category as device_category
                 , user_pseudo_id
                 , weekbegindate
                 , weekenddate
                 , explode(event_params)
              from t_all_data) a
     where a.key = 'ga_session_id'
     group by property_brand, brand_country, device_category, weekbegindate, weekenddate
)
, t2 as (
    select property_brand
         , brand_country
         , device.category as device_category
         , weekbegindate
         , weekenddate
         , 'carts' as metric
         , count_if(event_name = 'add_to_cart') as cnt
      from t_all_data
     group by property_brand, brand_country, device.category, weekbegindate, weekenddate
)
, t3 as (
    select property_brand
         , brand_country
         , device.category as device_category
         , weekbegindate
         , weekenddate
         , 'orders' as metric
         , count_if(event_name in ('purchase', 'transaction')) as cnt
      from t_all_data
     group by property_brand, brand_country, device.category, weekbegindate, weekenddate
)
, t5 as (
    select property_brand
         , brand_country
         , device_category
         , weekbegindate
         , weekenddate
         , 'quantity' as metric
         , sum(items.quantity) as cnt
      from (select property_brand
                 , brand_country
                 , device.category as device_category
                 , weekbegindate
                 , weekenddate
                 , event_name
                 , explode(items) as items
              from t_all_data
             where event_name = 'purchase') p
     group by property_brand, brand_country, device_category, weekbegindate, weekenddate
)
, t6 as (
    select property_brand
         , brand_country
         , device.category as device_category
         , weekbegindate
         , weekenddate
         , 'sales' as metric
         , sum(nvl(ecommerce.purchase_revenue, 0) - nvl(ecommerce.shipping_value, 0) - nvl(ecommerce.tax_value, 0)) as cnt
      from t_all_data
     group by property_brand, brand_country, device.category, weekbegindate, weekenddate
)
, t4 as (
    select * from t1
    union all
    select * from t2
    union all
    select * from t3
    union all
    select * from t5
    union all
    select * from t6
)
select property_brand
     , brand_country
     , concat(device_category, '_', metric) as metric
     , nvl(cnt, 0) cnt
     , weekbegindate
     , weekenddate
  from t4""")

# Pivot the long-format metrics into one row per brand / country / week, with
# one column per distinct value of `metric` (device category + metric name).
df_stats = (
    df_stats
    .groupBy('property_brand', 'brand_country', 'weekbegindate', 'weekenddate')
    .pivot('metric')
    .agg(F.first('cnt').alias('cnt'))
)

display(df_stats)

# Write the pivoted weeks to the gold Delta table, partitioned by
# weekenddate / property_brand / brand_country.
# - The write result is not assigned: saveAsTable returns None, and rebinding
#   df_stats to it would discard the DataFrame.
# - partitionOverwriteMode is pinned on the writer so this overwrite stays
#   limited to the partitions being written even if the session-level setting
#   from the first cell was not applied (e.g. cells run out of order).
# - mergeSchema lets new pivot columns be added to the table schema.
# gold_adls_path is not defined in this notebook; presumably it comes from the
# %run of /utils/common_functions.
(
    df_stats
    .repartition('weekenddate', 'property_brand', 'brand_country')
    .write.format("delta")
    .mode("overwrite")
    .option("partitionOverwriteMode", "dynamic")
    .option("path", f"{gold_adls_path}GA4/weekly_stats")
    .option("mergeSchema", "true")
    .partitionBy('weekenddate', 'property_brand', 'brand_country')
    .saveAsTable('lakedb_gold.ga4_weekly_stats')
)


In [None]:
%%sql
-- Spot check: rows written to the gold table for one week, ordered by brand and country.
SELECT *
FROM lakedb_gold.ga4_weekly_stats
WHERE 1 = 1
  AND weekenddate = 20240601
ORDER BY property_brand, brand_country
   


In [None]:
# ,t1 as (
# select  property_brand
# , brand_country
# , device_category
# , weekbegindate 
# , weekenddate
# , 'sessions' as metric
# , count( distinct concat(a.user_pseudo_id,a.value.int_value)) as cnt
#  from (select /*+BROADCAST(d)*/ property_brand, brand_country,device.category as device_category
#  , user_pseudo_id
#  , d.weekbegindate 
#  , d.weekenddate
#  , explode(event_params)
#                 from raw.ga4_events a, dates d
#                 where event_date between d.weekbegindate and d.weekenddate) a
#                 where a.key = 'ga_session_id' 
#   group by property_brand, brand_country,device_category,weekbegindate ,weekenddate) 
#   , t2 as (select /*+BROADCAST(d)*/ property_brand
#   , brand_country
#   ,device.category
#   ,d.weekbegindate 
#   , d.weekenddate
#   ,'carts'
#   ,COUNT_IF(event_name = 'add_to_cart') AS adds_to_cart   
#   from   raw.ga4_events a, dates d
#                 where event_date between d.weekbegindate and d.weekenddate           
#   group by property_brand, brand_country,device.category,d.weekbegindate, d.weekenddate)
#   ,t3 as (select /*+BROADCAST(d)*/ property_brand
#   , brand_country
#   ,device.category
#   ,d.weekbegindate
#   , d.weekenddate
#   ,'orders'
#   ,COUNT_IF(event_name IN ('purchase', 'transaction'))
# from   raw.ga4_events a, dates d
#                 where event_date between d.weekbegindate and d.weekenddate            
#   group by property_brand, brand_country,device.category,d.weekbegindate, d.weekenddate)
#  , t5 as (select property_brand
#  , brand_country
#  ,device_category
#  ,weekbegindate
#  ,weekenddate
#  , 'quantity'
#  ,SUM(items.quantity) as qty
#         from (select /*+BROADCAST(d)*/ property_brand, brand_country,device.category as device_category
#         , d.weekbegindate,d.weekenddate, event_name, explode(items) as items
#         from raw.ga4_events a, dates d
#                 where event_date between d.weekbegindate and d.weekenddate   and  event_name = 'purchase' ) 
#                    group by property_brand, brand_country,device_category, weekbegindate,weekenddate
#         )
#   ,t6 as (select  /*+BROADCAST(d)*/  property_brand
#  , brand_country
#  ,device.category
#  ,weekbegindate
#  ,weekenddate
#  , 'sales'
#  , sum(nvl(ecommerce.purchase_revenue,0) - nvl(ecommerce.shipping_value,0) - nvl(ecommerce.tax_value,0))  as sales
#     from raw.ga4_events a, dates d
#                 where event_date between d.weekbegindate and d.weekenddate            
#   group by property_brand, brand_country,device.category,d.weekbegindate, d.weekenddate )      
#   ,t4 as (select * from t1 
#   union all
#   select * from t2 
#   union all
#   select * from t3
#   union all 
#   select * from t5
#   union all
#   select * from t6)
#   SELECT property_brand, brand_country,concat(device_category,'_',metric) as metric,cnt
# ,weekbegindate,weekenddate FROM t4 where  device_category != 'smart tv'""")

# df_stats = df_stats.groupBy('property_brand', 'brand_country','weekbegindate','weekenddate')\
# .pivot('metric')\
# .agg(F.first('cnt').alias('cnt'))

# #df_stats.createOrReplaceTempView('ga4_weekly_stats')
# display(df_stats)
# df_stats=  df_stats.repartition('weekenddate','property_brand','brand_country')\
#     .write.format("delta")\
#     .mode("overwrite")\
#     .option("path",f"{gold_adls_path}GA4/weekly_stats")\
#     .option("mergeSchema", "true")\
#     .partitionBy('weekenddate','property_brand','brand_country')\
#     .saveAsTable('lakedb_gold.ga4_weekly_stats')


In [None]:
%%sql
-- Number of rows in the gold table per weekenddate.
SELECT weekenddate, count(*)
FROM lakedb_gold.ga4_weekly_stats
GROUP BY weekenddate
