In [7]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import col, sum, dense_rank, round
from pyspark.sql.window import Window

In [8]:
# Spark configuration: standalone master, application name and resource
# settings, built as one fluent chain (every SparkConf setter returns the conf).
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Best_Selling_Products_Pipeline")
    .setAll([
        ("spark.driver.memory", "2g"),
        ("spark.executor.cores", "1"),
        ("spark.driver.cores", "1"),
    ])
)
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Point the Hadoop filesystem layer at the Google Cloud Storage connector
# classes so that gs:// URIs can be resolved by the readers below.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Storage / project identifiers shared by the cells that follow.
data_bucket_uri = "data_de2024_a2"    # bucket the source CSV files are read from
temp_bucket = "temp_de2024_mh"        # bucket handed to the BigQuery writer as temporaryGcsBucket
project_id = "core-synthesis-435410-v9"

In [9]:
# Load the fact table and the item/store dimension tables from the data bucket.
# All three CSV files are read with the same options: the first row is a header
# and column types are inferred from the data.
csv_options = {"header": "true", "inferSchema": "true"}

factDF = spark.read.options(**csv_options).csv(f"gs://{data_bucket_uri}/fact_table.csv")
itemDF = spark.read.options(**csv_options).csv(f"gs://{data_bucket_uri}/item_dim.csv")
storeDF = spark.read.options(**csv_options).csv(f"gs://{data_bucket_uri}/store_dim.csv")

In [10]:
# Keep only the columns the sales analysis needs from each table.
factDF = factDF.select("item_key", "store_key", "total_price")
itemDF = itemDF.select("item_key", "item_name")
storeDF = storeDF.select("store_key")

# Combine fact table with item and store dimensions.
# join() defaults to an inner join, so fact rows whose item_key or store_key
# has no match in the corresponding dimension table are dropped.
joinedDF = factDF.join(itemDF, "item_key").join(storeDF, "store_key")

# printSchema() writes the schema to stdout itself and returns None; wrapping
# it in print() only added a stray "None" line to the cell output.
joinedDF.printSchema()
joinedDF.show()

root
 |-- store_key: string (nullable = true)
 |-- item_key: string (nullable = true)
 |-- total_price: double (nullable = true)
 |-- item_name: string (nullable = true)

None
+---------+--------+-----------+--------------------+
|store_key|item_key|total_price|           item_name|
+---------+--------+-----------+--------------------+
|   S00307|  I00177|       35.0|M&M Peanut Candy ...|
|   S00595|  I00248|       26.0|Charmin Ultra Bat...|
|   S00496|  I00195|      100.0|Dole Fruit in Gel...|
|    S0086|  I00131|      112.0|Paper Bowls 20 oz...|
|   S00488|  I00050|       64.0|Waterloo Sparklin...|
|   S00328|  I00058|      110.0|Premier Protein S...|
|   S00196|  I00075|       31.0|Brisk Lemon Iced ...|
|    S0010|  I00188|       14.0|Belvita Protein O...|
|   S00164|  I00017|      74.25|Fresca Black Cher...|
|   S00640|  I00133|      150.0|Clear Plastic Cup...|
|   S00540|  I00065|      160.0|G2 Lo Calorie Var...|
|    S0032|  I00023|       67.5|  Pepsi - 12 oz cans|
|   S00631|  I

In [13]:
# Window specifications used for the two rankings below.
# Stores are ranked globally by total sales (highest first); products are
# ranked inside their own store by product sales (highest first).
storeRankWindow = Window.orderBy(col("store_total_sales").desc())
productRankWindow = Window.partitionBy("store_key").orderBy(col("product_sales").desc())

# Total sales of every product in every store.
storeProductSalesDF = (
    joinedDF
    .groupBy("store_key", "item_key", "item_name")
    .sum("total_price")
    .withColumnRenamed("sum(total_price)", "product_sales")
)

# Total sales per store, computed from the per-product totals before any
# rows are filtered out, then ranked across all stores.
storeTotalSalesDF = (
    storeProductSalesDF
    .groupBy("store_key")
    .sum("product_sales")
    .withColumnRenamed("sum(product_sales)", "store_total_sales")
    .withColumn("store_rank", dense_rank().over(storeRankWindow))
)

# Rank products within their store first, then discard products whose
# sales are not strictly positive.
storeProductSalesDF = (
    storeProductSalesDF
    .withColumn("product_rank", dense_rank().over(productRankWindow))
    .filter(col("product_sales") > 0)
)

# Stores whose dense rank by total sales is within the first 100 ranks.
top100StoresDF = storeTotalSalesDF.filter(col("store_rank") <= 100)

# Attach each selected store's total and rank to its ranked products.
finalDF = storeProductSalesDF.join(top100StoresDF, on="store_key")

finalDF.show()

+---------+--------+--------------------+-------------+------------+-----------------+----------+
|store_key|item_key|           item_name|product_sales|product_rank|store_total_sales|store_rank|
+---------+--------+--------------------+-------------+------------+-----------------+----------+
|    S0010|  I00119|K Cups Original D...|       3021.0|           1|         159409.0|         1|
|    S0010|  I00123|     Honey Packets  |       2565.0|           2|         159409.0|         1|
|    S0010|  I00061|       Red Bull 12oz|       2255.0|           3|         159409.0|         1|
|    S0010|  I00177|M&M Peanut Candy ...|       2240.0|           4|         159409.0|         1|
|    S0010|  I00054|Monster Zero Ultr...|       2160.0|           5|         159409.0|         1|
|    S0010|  I00183| Snickers Bars 1.8oz|       1890.0|           6|         159409.0|         1|
|    S0010|  I00140|Foam Coffee Cups ...|       1885.0|           7|         159409.0|         1|
|    S0010|  I00117|

In [None]:
# Save to BigQuery.
# The session-level 'temporaryGcsBucket' setting names the GCS bucket handed
# to the BigQuery writer for its temporary data.
spark.conf.set('temporaryGcsBucket', temp_bucket)

# Configure the writer: target table <project>.a2.product_sales_by_store,
# replacing any existing contents of that table.
bigquery_writer = (
    finalDF.write
    .format('bigquery')
    .option('table', f'{project_id}.a2.product_sales_by_store')
    .mode("overwrite")
)
bigquery_writer.save()

# Stop the Spark session now that the result has been written.
spark.stop()