In [None]:
from pyspark.sql import SparkSession

# Reuse the active session if one exists; otherwise start one with the
# "local[*]" master setting.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Input file locations.
sales_file_location = "/FileStore/tables/Sales_table.csv"
products_file_location = "/FileStore/tables/Products_table.csv"
sellers_file_location = "/FileStore/tables/Sellers_table.csv"

# Reader settings shared by all three inputs.
file_type = "csv"
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","


def _load_table(location):
    """Read one input file into a DataFrame using the shared reader settings.

    Args:
        location: Path of the file to load.

    Returns:
        A DataFrame read with the configured format, header handling,
        schema inference and separator.
    """
    return (
        spark.read.format(file_type)
        .option("inferSchema", infer_schema)
        .option("header", first_row_is_header)
        .option("sep", delimiter)
        .load(location)
    )


# Same load order as before: products, sales, sellers.
df_pt = _load_table(products_file_location)
df_sales = _load_table(sales_file_location)
df_seller = _load_table(sellers_file_location)


In [None]:
# (a) Output the top 3 most popular products sold among all sellers [2m]

# Equivalent Spark SQL formulation, kept for reference only:
#   df_pt.createOrReplaceTempView("PT")
#   df_sales.createOrReplaceTempView("SALES")
#   result = spark.sql("""
#       SELECT PT.product_name
#       FROM PT INNER JOIN SALES ON PT.product_id = SALES.product_id
#       GROUP BY PT.product_name
#       ORDER BY SUM(SALES.num_of_items_sold) DESC
#       LIMIT 3
#   """)

from pyspark.sql.functions import expr, desc

# Total number of items sold per product name, across every seller.
items_per_product = (
    df_pt.join(df_sales, "product_id")
    .groupBy("product_name")
    .agg(expr("sum(num_of_items_sold)").alias("total_number_of_items"))
)

# Keep the three largest totals and expose only the product name.
top_three = items_per_product.orderBy(desc("total_number_of_items")).limit(3)
result = top_three.select("product_name")

result.show()

# +-------------+
# | product_name|
# +-------------+
# |product_51270|
# |product_18759|
# |product_59652|
# +-------------+


+-------------+
| product_name|
+-------------+
|product_51270|
|product_18759|
|product_59652|
+-------------+



In [None]:
# (b) Output the top most sold product (in terms of quantity) among sellers with seller_id 1 to 10 [2m]
# Your table should have 1 column(s): [product_name]

from pyspark.sql.functions import expr, col, desc

# Only sales made by sellers 1..10 (inclusive) take part in the ranking.
sales_by_sellers_1_to_10 = df_sales.filter(
    (col("seller_id") >= 1) & (col("seller_id") <= 10)
)

# Total the quantity per product across all of those sellers. Grouping by
# seller_id as well would rank (seller, product) pairs instead of products:
# a product sold by several of these sellers would have its quantity split
# over multiple rows and could lose to a product sold by a single seller.
# NOTE(review): output recorded from an earlier run may no longer match;
# re-run the cell to refresh it.
result = (
    df_pt.join(sales_by_sellers_1_to_10, "product_id")
    .groupBy("product_name")
    .agg(expr("sum(num_of_items_sold)").alias("total_quantity"))
    .orderBy(desc("total_quantity"))
    .limit(1)
    .select("product_name")
)

result.show()

# +-------------+
# | product_name|
# +-------------+
# |product_36658|
# +-------------+


+-------------+
| product_name|
+-------------+
|product_36658|
+-------------+



In [None]:
# (c) Compute the combined revenue earned from sellers where seller_id ranges from 1 to 500 inclusive. [3m]
# Your table should have 1 column(s): [total_revenue]
from pyspark.sql.functions import expr, col, sum as _sum

# Restrict the sales to sellers 1..500 (inclusive) before joining, so only
# the relevant rows are matched against the product table.
sales_by_sellers_1_to_500 = df_sales.filter(
    (col("seller_id") >= 1) & (col("seller_id") <= 500)
)

# Revenue of a sale row is num_of_items_sold * price; the answer is a single
# grand total, so one global aggregation is enough. No per-seller grouping or
# sorting is needed, and the cell no longer depends on `desc` having been
# imported by an earlier cell.
result = (
    df_pt.join(sales_by_sellers_1_to_500, "product_id")
    .agg(_sum(col("num_of_items_sold") * col("price")).alias("total_revenue"))
)

result.show()

# +-------------+
# |total_revenue|
# +-------------+
# |    160916699|
# +-------------+


+-------------+
|total_revenue|
+-------------+
|    160916699|
+-------------+



In [None]:
# (d) Among sellers with rating >= 4 who have achieved a combined number of products sold >= 3000, find out the top 10 most expensive product sold by any of the sellers. (If there are multiple products at the same price, please sort them in ascending order of product_id) [8m]
# Your table should have 1 column(s): [product_name]
# To get the full mark, your query should not run for more than 1 min

from pyspark.sql.functions import expr, desc, asc, col, sum as _sum

# Sellers rated >= 4 whose combined number of items sold is >= 3000.
# The result only feeds the join below, so it is intentionally left unsorted.
df_max_num_by_seller = (
    df_sales.join(
        df_seller.filter(col("rating") >= 4),
        on="seller_id",
        how="inner",
    )
    .groupBy("seller_id")
    .agg(expr("sum(num_of_items_sold)").alias("total_num_of_items_per_seller"))
    .filter(col("total_num_of_items_per_seller") >= 3000)
)

# Distinct products sold by any qualifying seller. Reducing to unique
# product_ids before touching the product table keeps the second join small.
# Every qualifying seller originates from df_sales, so an inner join loses
# nothing compared with a left join here.
df_qualifying_product_ids = (
    df_max_num_by_seller.select("seller_id")
    .join(df_sales.select("seller_id", "product_id"), on="seller_id", how="inner")
    .select("product_id")
    .distinct()
)

# product_id is carried through distinct() and the sort so that the tie-break
# on product_id is applied to a column that is actually part of the projection.
# The inner join drops sales whose product_id has no entry in the product
# table; with a left join they would appear with null name and price.
# NOTE(review): the task statement asks for product_name only; price is kept
# in the output as before for verification. Use .select("product_name") as the
# last step to match the required single-column shape.
df_result = (
    df_qualifying_product_ids.join(df_pt, on="product_id", how="inner")
    .select("product_id", "product_name", "price")
    .distinct()
    .orderBy(desc("price"), asc("product_id"))
    .limit(10)
    .select("product_name", "price")
)

df_result.show()

# +------------+-----+
# |product_name|price|
# +------------+-----+
# | product_106|  200|
# | product_117|  200|
# | product_363|  200|
# | product_712|  200|
# | product_843|  200|
# | product_897|  200|
# | product_923|  200|
# |product_1466|  200|
# |product_1507|  200|
# |product_1514|  200|
# +------------+-----+


+------------+-----+
|product_name|price|
+------------+-----+
| product_106|  200|
| product_117|  200|
| product_363|  200|
| product_712|  200|
| product_843|  200|
| product_897|  200|
| product_923|  200|
|product_1466|  200|
|product_1507|  200|
|product_1514|  200|
+------------+-----+

