In [0]:
silver_df = spark.read.format("parquet").load("wasbs://silver@retaildatalakejp.blob.core.windows.net/OnlineRetailCleaned.parquet")
silver_df.display()



InvoiceNo,StockCode,Description,Quantity,UnitPrice,CustomerID,Country,SalesAmount,InvoiceYear,InvoiceMonth,InvoiceDay,InvoiceHour
536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2.55,17850,United Kingdom,15.3,2010,12,1,8
536365,71053,WHITE METAL LANTERN,6,3.39,17850,United Kingdom,20.34,2010,12,1,8
536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2.75,17850,United Kingdom,22.0,2010,12,1,8
536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,3.39,17850,United Kingdom,20.34,2010,12,1,8
536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,3.39,17850,United Kingdom,20.34,2010,12,1,8
536365,22752,SET 7 BABUSHKA NESTING BOXES,2,7.65,17850,United Kingdom,15.3,2010,12,1,8
536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,4.25,17850,United Kingdom,25.5,2010,12,1,8
536366,22633,HAND WARMER UNION JACK,6,1.85,17850,United Kingdom,11.1,2010,12,1,8
536366,22632,HAND WARMER RED POLKA DOT,6,1.85,17850,United Kingdom,11.1,2010,12,1,8
536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,1.69,13047,United Kingdom,54.08,2010,12,1,8


In [0]:
from pyspark.sql.functions import *

# 1. Country-level Total Sales
country_sales_df = silver_df.groupBy("Country").agg(
    round(sum("SalesAmount"),2).alias("TotalSales"),
    countDistinct("CustomerID").alias("UniqueCustomers"),
    count("InvoiceNo").alias("TotalTransactions")
)

country_sales_df.display()

country_sales_df.write.mode("overwrite").parquet("wasbs://gold@retaildatalakejp.blob.core.windows.net/country_level_total_sales.parquet")

Country,TotalSales,UniqueCustomers,TotalTransactions
Sweden,24456.55,8,290
Singapore,19699.21,1,167
Germany,202050.01,94,6923
RSA,981.97,1,54
France,183549.73,87,6335
Greece,4553.42,4,122
European Community,1194.65,1,52
Belgium,34926.92,25,1536
Finland,20111.76,12,508
Malta,2652.99,2,102


In [0]:
country_sales_df.write.format("delta").mode("overwrite") \
  .save("abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/country_level_total_sales/")


In [0]:
%sql
USE CATALOG retail_catalog;

In [0]:
%sql
CREATE SCHEMA IF NOT EXISTS gold;
USE SCHEMA gold;

In [0]:
%sql
CREATE TABLE IF NOT EXISTS country_level_total_sales
USING DELTA
LOCATION 'abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/country_level_total_sales/';

In [0]:
# 2. Monthly Revenue Trends
monthly_sales_df = silver_df.groupBy("InvoiceYear", "InvoiceMonth").agg(
    round(sum("SalesAmount"),2).alias("MonthlyRevenue")
).orderBy("InvoiceYear", "InvoiceMonth")

monthly_sales_df.display()

monthly_sales_df.write.mode("overwrite").parquet("wasbs://gold@retaildatalakejp.blob.core.windows.net/monthly_revenue_trends.parquet")

InvoiceYear,InvoiceMonth,MonthlyRevenue
2010,12,509239.1
2011,1,516132.35
2011,2,388097.54
2011,3,521947.66
2011,4,403484.66
2011,5,597559.63
2011,6,584606.74
2011,7,517126.63
2011,8,564748.05
2011,9,839985.55


In [0]:
monthly_sales_df.write.format("delta").mode("overwrite") \
  .save("abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/monthly_sales_df/")

In [0]:
%sql
CREATE TABLE IF NOT EXISTS monthly_revenue_trends
USING DELTA
LOCATION 'abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/monthly_sales_df/';

In [0]:
# 3. Hourly Revenue Trends
hourly_sales_df = silver_df.groupBy("InvoiceYear", "InvoiceHour").agg(
    round(sum("SalesAmount"),2).alias("HourlyRevenue")
).orderBy("InvoiceYear", "InvoiceHour")

hourly_sales_df.display()

hourly_sales_df.write.mode("overwrite").parquet("wasbs://gold@retaildatalakejp.blob.core.windows.net/hourly_revenue_trends.parquet")

hourly_sales_df.write.format("delta").mode("overwrite") \
  .save("abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/hourly_sales_df/")





InvoiceYear,InvoiceHour,HourlyRevenue
2010,7,422.8
2010,8,5656.84
2010,9,41456.93
2010,10,64669.42
2010,11,57474.21
2010,12,73909.96
2010,13,70537.07
2010,14,46067.28
2010,15,69707.46
2010,16,55574.89


In [0]:
%sql
CREATE TABLE IF NOT EXISTS hourly_sales
USING DELTA
LOCATION 'abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/hourly_sales_df/';

In [0]:
# 4. Top 10 Products by Revenue
top_products_df = silver_df.groupBy("Description").agg(
    round(sum("SalesAmount"),2).alias("TotalRevenue")
).orderBy(desc("TotalRevenue")).limit(10)

top_products_df.display()

top_products_df.write.mode("overwrite").parquet("wasbs://gold@retaildatalakejp.blob.core.windows.net/top_10_products_by_revenue.parquet")

top_products_df.write.format("delta").mode("overwrite") \
  .save("abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/top_products_df/")


Description,TotalRevenue
"PAPER CRAFT , LITTLE BIRDIE",168469.6
REGENCY CAKESTAND 3 TIER,142592.95
WHITE HANGING HEART T-LIGHT HOLDER,100448.15
JUMBO BAG RED RETROSPOT,85220.78
MEDIUM CERAMIC TOP STORAGE JAR,81416.73
POSTAGE,77803.96
PARTY BUNTING,68844.33
ASSORTED COLOUR BIRD ORNAMENT,56580.34
Manual,52315.65
RABBIT NIGHT LIGHT,51346.2


In [0]:
%sql
CREATE TABLE IF NOT EXISTS top_products
USING DELTA
LOCATION 'abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/top_products_df/';

In [0]:
# 3. Top 10 Countries by Revenue
top_countries_df = country_sales_df.orderBy(desc("TotalSales")).limit(10)
top_countries_df.display()

top_countries_df.write.mode("overwrite").parquet("wasbs://gold@retaildatalakejp.blob.core.windows.net/top_10_countries_by_revenue.parquet")

top_countries_df.write.format("delta").mode("overwrite") \
  .save("abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/top_countries_df/")


Country,TotalSales,UniqueCustomers,TotalTransactions
United Kingdom,6459137.29,3887,271313
Netherlands,245279.99,9,1723
EIRE,236665.02,3,5654
Germany,202050.01,94,6923
France,183549.73,87,6335
Australia,122974.01,9,945
Spain,56444.29,30,1983
Switzerland,50176.92,21,1403
Belgium,34926.92,25,1536
Norway,32184.1,10,836


In [0]:
%sql
CREATE TABLE IF NOT EXISTS top_countries
USING DELTA
LOCATION 'abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/top_countries_df/';

In [0]:
#Monthly Customer Acquisition

monthly_customers = silver_df.select("CustomerID", "InvoiceYear", "InvoiceMonth").distinct() \
    .groupBy("InvoiceYear", "InvoiceMonth").agg(count("CustomerID").alias("NewCustomers"))

monthly_customers.display()

monthly_customers.write.mode("overwrite").parquet("wasbs://gold@retaildatalakejp.blob.core.windows.net/monthly_customer_aquistion.parquet")

monthly_customers.write.format("delta").mode("overwrite") \
  .save("abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/monthly_customers/")


InvoiceYear,InvoiceMonth,NewCustomers
2010,12,876
2011,3,965
2011,2,742
2011,4,849
2011,12,613
2011,8,925
2011,5,1045
2011,7,939
2011,6,979
2011,11,1653


In [0]:
%sql
CREATE TABLE IF NOT EXISTS monthly_customer_acquistion
USING DELTA
LOCATION 'abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/monthly_customers/';

In [0]:
#Churned Customers (No Purchase in Last X Months)

active_customers = silver_df.filter(silver_df.InvoiceYear == 2011).select("CustomerID").distinct()
churned_customers = silver_df.filter(silver_df.InvoiceYear == 2010).select("CustomerID").distinct() \
                     .subtract(active_customers)

churned_customers.display()

churned_customers.write.mode("overwrite").parquet("wasbs://gold@retaildatalakejp.blob.core.windows.net/churned_customers.parquet")

churned_customers.write.format("delta").mode("overwrite") \
  .save("abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/churned_customers/")



CustomerID
17551
17855
13065
16510
17303
15899
15350
18113
12967
15180


In [0]:
%sql
CREATE TABLE IF NOT EXISTS churned_customers
USING DELTA
LOCATION 'abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/churned_customers/';


In [0]:
#Basket Size (Average items per invoice)

basket_size = silver_df.groupBy("InvoiceNo").agg(sum("Quantity").alias("TotalItems"))
avg_basket_size = basket_size.agg(avg("TotalItems")).first()[0]

display(spark.createDataFrame([(avg_basket_size,)], ['AverageBasketSize']))

AverageBasketSize
171.9623865823481


In [0]:
#Geo Analysis: Sales by Country

country_sales = silver_df.groupBy("Country").agg(
    sum("SalesAmount").alias("Revenue"),
    countDistinct("CustomerID").alias("Customers")
).orderBy(desc("Revenue"))

country_sales.display()

country_sales.write.mode("overwrite").parquet("wasbs://gold@retaildatalakejp.blob.core.windows.net/sales_by_country.parquet")

country_sales.write.format("delta").mode("overwrite") \
  .save("abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/country_sales/")


Country,Revenue,Customers
United Kingdom,6459137.290000001,3887
Netherlands,245279.99000000028,9
EIRE,236665.02000000025,3
Germany,202050.00999999992,94
France,183549.73000000004,87
Australia,122974.00999999983,9
Spain,56444.29,30
Switzerland,50176.920000000006,21
Belgium,34926.92,25
Norway,32184.1,10


In [0]:

%sql
CREATE TABLE IF NOT EXISTS sales_by_country
USING DELTA
LOCATION 'abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/country_sales/';


In [0]:
#Customer Lifetime Value (LTV)

ltv_df = silver_df.groupBy("CustomerID").agg(
    sum("SalesAmount").alias("LifetimeValue")
).orderBy(desc("LifetimeValue"))

ltv_df.display()

ltv_df.write.mode("overwrite").parquet("wasbs://gold@retaildatalakejp.blob.core.windows.net/customer_life_time_value.parquet")

ltv_df.write.format("delta").mode("overwrite") \
  .save("abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/ltv_df/")



CustomerID,LifetimeValue
18102,257518.9
14646,241148.3100000003
17450,192236.59000000003
16446,168472.5
14911,125365.19000000018
12415,110334.20999999985
14156,107573.14000000006
17511,77796.45999999995
12346,77183.6
16029,76512.83999999998


In [0]:
 %sql
CREATE TABLE IF NOT EXISTS customer_lifetime_value
USING DELTA
LOCATION 'abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/ltv_df/';



In [0]:
#Sales Trends Over Time

monthly_trend = silver_df.groupBy("InvoiceYear", "InvoiceMonth").agg(
    round(sum("SalesAmount"),2).alias("Revenue")
).orderBy("InvoiceYear", "InvoiceMonth")

monthly_trend.display()

monthly_trend.write.mode("overwrite").parquet("wasbs://gold@retaildatalakejp.blob.core.windows.net/sales_trend_over_time.parquet")

monthly_trend.write.format("delta").mode("overwrite") \
  .save("abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/monthly_trend/")




InvoiceYear,InvoiceMonth,Revenue
2010,12,509239.1
2011,1,516132.35
2011,2,388097.54
2011,3,521947.66
2011,4,403484.66
2011,5,597559.63
2011,6,584606.74
2011,7,517126.63
2011,8,564748.05
2011,9,839985.55


In [0]:
%sql
CREATE TABLE IF NOT EXISTS sales_trend_over_time
USING DELTA
LOCATION 'abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/monthly_trend/';


In [0]:
#Find the busiest hours of sales and analyze regional patterns


hourly_region_sales = silver_df.groupBy("Country", "InvoiceHour") \
    .agg(round(sum("SalesAmount"),2).alias("TotalRevenue")) \
    .orderBy("Country", "InvoiceHour",desc("TotalRevenue"))

hourly_region_sales.display()

hourly_region_sales.write.mode("overwrite").parquet("wasbs://gold@retaildatalakejp.blob.core.windows.net/hourly_region_sales.parquet")

hourly_region_sales.write.format("delta").mode("overwrite") \
  .save("abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/hourly_region_sales/")


Country,InvoiceHour,TotalRevenue
Australia,8,20307.12
Australia,9,14417.71
Australia,10,20866.36
Australia,11,8347.08
Australia,12,19990.51
Australia,13,23889.96
Australia,14,3399.09
Australia,15,11756.18
Austria,9,1453.02
Austria,10,2550.94


In [0]:

%sql
CREATE TABLE IF NOT EXISTS busiest_hours_of_sales
USING DELTA
LOCATION 'abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/hourly_region_sales/';


In [0]:
#What percentage of customers are single-purchase vs repeat?
from pyspark.sql.functions import countDistinct, when, col

# Step 1: Count purchases per customer
purchase_counts = silver_df.groupBy("CustomerID") \
    .agg(countDistinct("InvoiceNo").alias("NumPurchases"))

# Step 2: Tag each as 'Single' or 'Repeat'
purchase_tags = purchase_counts.withColumn(
    "CustomerType",
    when(col("NumPurchases") == 1, "Single-Purchase").otherwise("Repeat-Customer")
)

# Step 3: Get counts and percentages
total_customers = purchase_tags.count()
type_counts = purchase_tags.groupBy("CustomerType").count()

# Add percentage
type_percentages = type_counts.withColumn(
    "Percentage",
    (col("count") / total_customers * 100).cast("double")
)

type_percentages.show()

type_percentages.write.mode("overwrite").parquet("wasbs://gold@retaildatalakejp.blob.core.windows.net/single_vs_repeated_customers.parquet")

type_percentages.write.format("delta").mode("overwrite") \
  .save("abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/type_percentages/")



In [0]:

%sql
CREATE TABLE IF NOT EXISTS single_vs_repeat_customers
USING DELTA
LOCATION 'abfss://unity-catalog-storage@dbstoraged6a6yck2px35s.dfs.core.windows.net/133676615018953/gold/type_percentages/';
