In [28]:
import pyspark.sql.functions as F
# Set the session-level partition overwrite mode to DYNAMIC.
# NOTE(review): per the Spark configuration docs this should restrict an
# "overwrite" write to the partitions present in the written data -- confirm
# that it applies to the Delta write of ga4_weekly_stats later in this notebook.
spark.conf.set("spark.sql.sources.partitionOverwriteMode","DYNAMIC")

In [29]:
%run /utils/common_functions

In [30]:
# Switch on adaptive query execution together with its partition-coalescing,
# skew-join and auto-optimize-shuffle options; every key is set to "true".
for _aqe_flag in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
    "spark.sql.adaptive.skewJoin.enabled",
    "spark.databricks.adaptive.autoOptimizeShuffle.enabled",
):
    spark.conf.set(_aqe_flag, "true")

In [7]:
%%sql
-- List the distinct stream_name values found in the raw GA4 events Delta table.
SELECT DISTINCT
    stream_name
FROM delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/events`


## Get the week date ranges for the full load (comment this cell out after the first full load)

In [31]:
# Read the distinct (fiscalweek, weekbegindate, weekenddate) rows of
# report.DateDim whose daydate falls inside the backfill window, over JDBC,
# and expose them to the SQL cells below as the temp view `DateDim`.
#
# jdbcDriver / jdbcUrl / jdbcUsername / jdbcPassword are not defined in this
# notebook -- presumably they are provided by the %run of
# /utils/common_functions; verify before running on a fresh kernel.
full_load_dates = """
     SELECT DISTINCT  dd.fiscalweek as fiscalweek, dd.weekbegindate as weekbegindate, dd.weekenddate as weekenddate
    FROM report.DateDim dd
   where daydate between '2023-02-05' and '2023-11-11'
   
"""

# Cache the result: it is consumed by collect(), display() and by every
# aggregate query that joins the temp view, and without caching each of those
# actions would re-run the JDBC query against the source database.
df_date = (
    spark.read.format("jdbc")
    .option("driver", jdbcDriver)
    .option("url", jdbcUrl)
    .option("query", full_load_dates)
    .option("user", jdbcUsername)
    .option("password", jdbcPassword)
    .load()
    .cache()
)

# The source query has no ORDER BY, so sort here to make "the first week"
# deterministic (earliest weekbegindate) instead of an arbitrary row.
gua_dates_range = df_date.orderBy("weekbegindate").collect()
if not gua_dates_range:
    # Fail with a clear message rather than an IndexError on the lookup below.
    raise ValueError("report.DateDim returned no weeks for the requested date range")

first_week = gua_dates_range[0]
fiscal_week = first_week[0]
week_begin_date = first_week[1]
week_end_date = first_week[2]
print("fiscal_week::",fiscal_week,"week_begin_date::",week_begin_date,"week_end_date::",week_end_date)
display(df_date)

# createOrReplaceTempView returns None, so its result is not kept.
df_date.createOrReplaceTempView('DateDim')
df_date.printSchema()



## Create the aggregated carts table for every week (keyed by week-end date) between Feb 05 2023 and Nov 08 2023

In [14]:
%%sql
-- Weekly add_to_cart event totals per device category and stream.
-- Each raw event row is assigned to the DateDim week whose begin/end dates
-- contain its date; the result is stored as a Delta table partitioned by
-- weekenddate.
CREATE TABLE lakedb_gold.ga4_backfill_aggregate_events_carts
USING DELTA
PARTITIONED BY (weekenddate)
LOCATION 'abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/aggregate_events_carts'
AS
SELECT
    wk.weekbegindate,
    wk.weekenddate,
    device_category,
    stream_name,
    'carts' AS metric,
    ROUND(SUM(event_count), 3) AS carts
FROM delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/events/` AS ev
INNER JOIN datedim AS wk
    ON ev.date BETWEEN wk.weekbegindate AND wk.weekenddate
WHERE event_name IN ('add_to_cart')
GROUP BY
    wk.weekbegindate,
    wk.weekenddate,
    device_category,
    stream_name

In [32]:
%%sql
-- Preview the weekly carts aggregate created above.
SELECT *
FROM lakedb_gold.ga4_backfill_aggregate_events_carts;

## Create the aggregated orders table for every week (keyed by week-end date) between Feb 05 2023 and Nov 08 2023

In [11]:
%%sql
-- Weekly order counts per device category and stream, taken from the
-- 'purchase' and 'transaction' events of the raw GA4 events table.
-- Each event row is assigned to the DateDim week containing its date; the
-- result is stored as a Delta table partitioned by weekenddate.
CREATE TABLE lakedb_gold.ga4_backfill_aggregate_events_orders
USING DELTA
PARTITIONED BY (weekenddate)
LOCATION 'abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/aggregate_events_orders'
AS
SELECT
    wk.weekbegindate,
    wk.weekenddate,
    device_category,
    stream_name,
    'orders' AS metric,
    ROUND(SUM(event_count), 3) AS orders
FROM delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/events/` AS ev
INNER JOIN datedim AS wk
    ON ev.date BETWEEN wk.weekbegindate AND wk.weekenddate
WHERE event_name IN ('purchase', 'transaction')
GROUP BY
    wk.weekbegindate,
    wk.weekenddate,
    device_category,
    stream_name



In [13]:
%%sql
-- Preview the weekly orders aggregate created above.
SELECT *
FROM lakedb_gold.ga4_backfill_aggregate_events_orders;

## Create the aggregated sessions table for every week (keyed by week-end date) between Feb 05 2023 and Nov 08 2023

In [22]:
%%sql
-- Weekly session totals per device category and stream, summed from the
-- acquisition_sessions Delta table. Each row is assigned to the DateDim week
-- containing its date; the result is stored as a Delta table partitioned by
-- weekenddate.
CREATE TABLE lakedb_gold.ga4_backfill_aggregate_sessions
USING DELTA
PARTITIONED BY (weekenddate)
LOCATION 'abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/aggregate_sessions'
AS
SELECT
    wk.weekbegindate,
    wk.weekenddate,
    device_category,
    stream_name,
    'sessions' AS metric,
    SUM(sessions) AS sessions
FROM delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/acquisition_sessions/` AS acq
INNER JOIN datedim AS wk
    ON acq.date BETWEEN wk.weekbegindate AND wk.weekenddate
GROUP BY
    wk.weekbegindate,
    wk.weekenddate,
    device_category,
    stream_name

In [25]:
%%sql
-- Preview the weekly sessions aggregate created above.
SELECT *
FROM lakedb_gold.ga4_backfill_aggregate_sessions

## Create the aggregated quantity (qty) table for every week (keyed by week-end date) between Feb 05 2023 and Nov 08 2023

In [20]:
%%sql
-- Weekly purchased-item quantities per device category and stream, summed
-- from the transaction_qty Delta table. Each row is assigned to the DateDim
-- week containing its date; the result is stored as a Delta table partitioned
-- by weekenddate.
CREATE TABLE lakedb_gold.ga4_backfill_aggregate_qty
USING DELTA
PARTITIONED BY (weekenddate)
LOCATION 'abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/aggregate_qty'
AS
SELECT
    wk.weekbegindate,
    wk.weekenddate,
    device_category,
    stream_name,
    'quantity' AS metric,
    SUM(items_purchased) AS qty
FROM delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/transaction_qty/` AS tq
INNER JOIN datedim AS wk
    ON tq.date BETWEEN wk.weekbegindate AND wk.weekenddate
GROUP BY
    wk.weekbegindate,
    wk.weekenddate,
    device_category,
    stream_name

In [26]:
%%sql
-- Preview the weekly quantity aggregate created above.
SELECT *
FROM lakedb_gold.ga4_backfill_aggregate_qty;

## Create the aggregated sales table for every week (keyed by week-end date) between Feb 05 2023 and Nov 08 2023


In [24]:
%%sql
-- Weekly net sales per device category and stream: total_revenue minus
-- tax_amount and shipping_amount, summed from the transactions Delta table
-- and rounded to 3 decimals. Each row is assigned to the DateDim week
-- containing its date; the result is stored as a Delta table partitioned by
-- weekenddate.
CREATE TABLE lakedb_gold.ga4_backfill_aggregate_sales
USING DELTA
PARTITIONED BY (weekenddate)
LOCATION 'abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/aggregate_sales'
AS
SELECT
    wk.weekbegindate,
    wk.weekenddate,
    device_category,
    stream_name,
    'sales' AS metric,
    ROUND(SUM(total_revenue - tax_amount - shipping_amount), 3) AS sales
FROM delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/transactions/` AS tx
INNER JOIN datedim AS wk
    ON tx.date BETWEEN wk.weekbegindate AND wk.weekenddate
GROUP BY
    wk.weekbegindate,
    wk.weekenddate,
    device_category,
    stream_name

In [34]:
%%sql
-- Preview the weekly sales aggregate created above.
SELECT *
FROM lakedb_gold.ga4_backfill_aggregate_sales
-- select distinct stream_name from lakedb_gold.ga4_backfill_aggregate_sales

In [None]:
 df_all_tabs = spark.sql(""" with all_dates as (select weekbegindate, weekenddate from datedim)
 ,t_category as (select 'mobile' as device_category 
   union all
   select 'tablet'
   union all
   select 'desktop')
 ,t_brands as (select distinct stream_name from lakedb_gold.ga4_backfill_aggregate_sales )  
 ,all_data as (select * from all_dates, t_category, t_brands)
 ,t4 as (select * from lakedb_gold.ga4_backfill_aggregate_events_carts
union all
select * from lakedb_gold.ga4_backfill_aggregate_events_orders
union all
select * from lakedb_gold.ga4_backfill_aggregate_sessions
union all
select * from lakedb_gold.ga4_backfill_aggregate_qty
union all
select * from lakedb_gold.ga4_backfill_aggregate_sales)
  SELECT property_brand, brand_country,concat(device_category,'_',metric) as metric,nvl(cnt,0) cnt
,weekbegindate,weekenddate FROM t4""")

df_stats = df_stats.groupBy('property_brand', 'brand_country','weekbegindate','weekenddate')\
.pivot('metric')\
.agg(F.first('cnt').alias('cnt'))


#df_stats.createOrReplaceTempView('ga4_weekly_stats')
display(df_stats)
df_stats=  df_stats.repartition('weekenddate','property_brand','brand_country')\
    .write.format("delta")\
    .mode("overwrite")\
    .option("path",f"{gold_adls_path}GA4/weekly_stats")\
    .option("mergeSchema", "true")\
    .partitionBy('weekenddate','property_brand','brand_country')\
    .saveAsTable('lakedb_gold.ga4_weekly_stats')

## Load the data using dates in DateDim table created above

In [6]:
# display(spark.sql("select * from datedim"))
# df_stats = spark.sql("""
# with t_streams as (select 'mobile' as device_category 
#   union all
#   select 'tablet'
#   union all
#   select 'desktop')
# , all_dates as (select * from datedim)
# ,all_data as (select * from t_streams,all_dates)
# , t1_carts as (select date
# , device_category
# , stream_name
# , round(sum(event_count),3) as carts
#  from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/events/`
# where event_name IN ('add_to_cart')
# group by date
# , device_category
# , stream_name)
# ,t2_orders as (select date
# , device_category
# , stream_name
# , round(sum(event_count),3) as orders
#  from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/events/`
# where event_name IN ('purchase', 'transaction')
# group by date
# , device_category
# , stream_name )
# , t3_sales as (select date
# , device_category
# , stream_name
# , round(sum(total_revenue-tax_amount-shipping_amount),3) as sales
#  from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/transactions/`
#  group by date
# , device_category
# , stream_name)
# , t4_sessions as
# (select  date
# , device_category
# , stream_name
# , sum(sessions) as sessions
#  from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/acquisition_sessions/`
# group by date
# , device_category
# , stream_name)
# ,t5_qty as (select  date
# , device_category
# , stream_name
# , sum(items_purchased) as qty
#  from delta.`abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/GA4/events_feb05_nov08/transaction_qty/`
# group by date
# , device_category
# , stream_name)
# select all_data.date
# , all_data.device_category
# , t1.stream_name
# , t1.carts
# , t2.orders
# , t3.sales
# , t4.sessions
# , t5.qty
# from all_data left join t1_carts t1 on all_data.date = t1.date and all_data.device_category = t1.device_category
#  left join t2_orders t2 on t1.date = t2.date and t1.device_category = t2.device_category
# and t1.stream_name = t2.stream_name
# left join t3_sales t3 on t1.date = t3.date and t1.device_category = t3.device_category
# and t1.stream_name = t3.stream_name
# left join t4_sessions t4 on t1.date = t4.date and t1.device_category = t4.device_category
# and t1.stream_name = t4.stream_name
# left join t5_qty t5 on t1.date = t5.date and t1.device_category = t5.device_category
# and t1.stream_name = t5.stream_name
# where all_data.date = '2023-11-08'""")

# df_stats = df_stats.groupBy('property_brand', 'brand_country','weekbegindate','weekenddate')\
# .pivot('metric')\
# .agg(F.first('cnt').alias('cnt'))


# #df_stats.createOrReplaceTempView('ga4_weekly_stats')
# display(df_stats)
# df_stats=  df_stats.repartition('weekenddate','property_brand','brand_country')\
#     .write.format("delta")\
#     .mode("overwrite")\
#     .option("path",f"{gold_adls_path}GA4/weekly_stats")\
#     .option("mergeSchema", "true")\
#     .partitionBy('weekenddate','property_brand','brand_country')\
#     .saveAsTable('lakedb_gold.ga4_weekly_stats')


In [None]:
# NOTE(review): unfinished SQL fragment, kept for reference only.
# It sat in a Python cell without a %%sql magic, and the relations it reads
# (all_data, t1_carts, t2_orders, t3_sales, t4_sessions, t5_qty) exist only as
# CTEs inside the commented-out draft query earlier in this notebook, so it
# cannot run on its own. It is commented out so that "Run All" does not stop
# here with a SyntaxError; delete it or fold it into a complete query.
#
# select all_data.date
# , all_data.device_category
# , t1.stream_name
# , t1.carts
# , t2.orders
# , t3.sales
# , t4.sessions
# , t5.qty
# from all_data left join t1_carts t1 on all_data.date = t1.date and all_data.device_category = t1.device_category
#  left join t2_orders t2 on t1.date = t2.date and t1.device_category = t2.device_category
# and t1.stream_name = t2.stream_name
# left join t3_sales t3 on t1.date = t3.date and t1.device_category = t3.device_category
# and t1.stream_name = t3.stream_name
# left join t4_sessions t4 on t1.date = t4.date and t1.device_category = t4.device_category
# and t1.stream_name = t4.stream_name
# left join t5_qty t5 on t1.date = t5.date and t1.device_category = t5.device_category
# and t1.stream_name = t5.stream_name

In [None]:
%%sql
-- Show the weekly stats rows for a single week-end date, ordered by brand and country.
-- NOTE(review): 20240601 is outside the 2023-02-05 .. 2023-11-11 window loaded
-- into DateDim above, and is written as an integer literal -- confirm both are intended.
SELECT *
FROM lakedb_gold.ga4_weekly_stats
WHERE weekenddate = 20240601
ORDER BY
    property_brand,
    brand_country
   


In [None]:
%%sql
-- Row count per week-end date in the weekly stats table.
SELECT
    weekenddate,
    COUNT(*)
FROM lakedb_gold.ga4_weekly_stats
GROUP BY weekenddate
