In [None]:
# findspark locates the local Spark installation and makes pyspark importable;
# init() therefore has to run before the pyspark imports in the next cell.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import DateType, DoubleType

In [None]:
# Start a Spark session named "Sales Analytics", or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Sales Analytics")
    .getOrCreate()
)

In [None]:
# Only these columns of the raw CSV are used by the analysis.
columns = [
    "ORDERNUMBER",
    "QUANTITYORDERED",
    "PRICEEACH",
    "ORDERDATE",
    "PRODUCTLINE",
    "PRODUCTCODE",
]

# Read the CSV using its header row for column names and let Spark infer the types.
sales = spark.read.csv("../data/sales_data_sample.csv", inferSchema=True, header=True)

# select() accepts column names directly, so the list is unpacked as-is.
sales = sales.select(*columns)

In [None]:
# Rename the raw upper-case CSV headers to snake_case names.
# Each entry is (existing column name, new column name).
rename_lst = [
    ("ORDERNUMBER", "order_number"),
    ("QUANTITYORDERED", "quantity"),
    ("PRICEEACH", "price"),
    ("ORDERDATE", "order_date"),
    ("PRODUCTLINE", "product_line"),
    ("PRODUCTCODE", "product_code"),
]

# Apply the renames one pair at a time; each call returns a new DataFrame.
for name_pair in rename_lst:
    sales = sales.withColumnRenamed(*name_pair)

#### Print the schema and the first few rows of the DataFrame

In [None]:
# Print the column names and inferred types, then preview the first 10 rows.
sales.printSchema()
sales.show(n=10)

#### Data Cleaning
- Handle any missing or inconsistent data in the dataset
- Convert the order_date column to a DateType

In [None]:
# Data cleaning:
#   1. drop every row that has a null in any column;
#   2. parse order_date with the pattern "M/d/yyyy H:mm" into a date value.
parsed_order_date = to_date(col("order_date"), "M/d/yyyy H:mm")
sales = sales.dropna(how="any").withColumn("order_date", parsed_order_date)

In [None]:
# Preview the cleaned DataFrame (show() prints the first 20 rows by default).
sales.show()

#### Exploratory Data Analysis (EDA)
- Compute and print the total number of transactions.
- Find and print the top 5 products with the highest total sales (quantity * price).
- Calculate and print the average quantity sold per transaction.

In [None]:
# Total number of transactions: every distinct order_number counts once.
(
    sales
    .select("order_number")
    .distinct()
    .count()
)

In [None]:
# Top 5 products by total sales, where a row's sales value is quantity * price.
# `sum` is the pyspark.sql.functions aggregate brought in by the star import,
# not the Python builtin.
line_total = col("quantity") * col("price")
(
    sales.groupBy("product_code")
    .agg(sum(line_total).alias("Total Sales"))
    .orderBy("Total Sales", ascending=False)
    .limit(5)
    .show()
)

In [None]:
# Average quantity sold per transaction.
# A transaction is identified by order_number (the same key used for the
# transaction count), so the quantities are first added up per order and the
# per-order totals are then averaged. Grouping by product_code would instead
# give an average per product.
# NOTE(review): presumably an order spans several rows (one per product) — confirm.
# `sum` / `avg` are the pyspark.sql.functions aggregates from the star import.
(
    sales.groupBy("order_number")
    .agg(sum(col("quantity")).alias("order_quantity"))
    .agg(avg(col("order_quantity")).alias("Average Quantity"))
    .show()
)

#### Time-based Analysis
- Extract the year and month from the order date (`order_date`).
- Compute and print the total sales for each month.
- Identify and print the month with the highest sales

In [None]:
# Extract the year and month from order_date into their own columns.
# withColumn() already names the new columns, so the date column is passed to
# year() / month() without any alias.
sales = (
    sales.withColumn("year", year(col("order_date")))
    .withColumn("month", month(col("order_date")))
)

In [None]:
# Total sales (quantity * price) for each (year, month), listed chronologically.
# NOTE(review): the columns were created as "year"/"month"; "Year"/"Month"
# resolve because Spark column resolution is case-insensitive by default.
monthly_sales = (
    sales.groupBy("Year", "Month")
    .agg(sum(col("quantity") * col("price")).alias("Total Sales In Month"))
    .orderBy("Year", "Month")
)

# show() prints only 20 rows by default; pass the row count so that every
# month is listed even when the data covers more than 20 months.
monthly_sales.show(monthly_sales.count())

In [None]:
# Month with the highest sales: total quantity * price per (year, month),
# sorted from largest to smallest, keeping only the first row.
# NOTE(review): the columns were created as "year"/"month"; "Year"/"Month"
# resolve because Spark column resolution is case-insensitive by default.
monthly_totals = sales.groupBy("Year", "Month").agg(
    sum(col("quantity") * col("price")).alias("Total Sales In Month")
)
monthly_totals.orderBy("Total Sales In Month", ascending=False).limit(1).show()

#### Aggregate Functions
- Calculate the total revenue generated from sales.
- Find the product with the highest average price

In [None]:
# Total revenue: quantity * price summed over every row of the DataFrame.
# `sum` is the pyspark.sql.functions aggregate, not the Python builtin.
revenue_per_row = col("quantity") * col("price")
sales.agg(sum(revenue_per_row).alias("Total Revenue")).show()

In [None]:
# Product with the highest average price: average the price per product_code,
# sort from highest to lowest and keep the first row.
avg_price_by_product = sales.groupBy("product_code").agg(
    avg("price").alias("Average Price")
)
avg_price_by_product.orderBy(col("Average Price").desc()).limit(1).show()

#### Advanced Analysis
- Implement a custom Spark UDF to calculate the discounted price for each product (assume a 10% discount).
- Compute and print the total discounted revenue.


In [None]:
def _apply_discount(price):
    """Return *price* reduced by 10%, or None when *price* is null.

    Spark passes SQL NULL to a Python UDF as None; multiplying None would
    raise a TypeError, so null prices are passed through unchanged.
    """
    if price is None:
        return None
    # float() keeps the result compatible with DoubleType for any numeric input.
    return float(price) * 0.9


# UDF that computes the discounted price (10% off) as a double.
price_with_discount = udf(_apply_discount, DoubleType())

# Add the discounted price next to the original price column.
sales = sales.withColumn("price_with_discount", price_with_discount(col("price")))

In [None]:
# Total revenue at the original price next to total revenue at the discounted price.
# `sum` is the pyspark.sql.functions aggregate, not the Python builtin.
quantity = col("quantity")
sales.agg(
    sum(col("price") * quantity).alias("Total Revenue"),
    sum(col("price_with_discount") * quantity).alias("Total Revenue with discount"),
).show()

In [None]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()