In [0]:

# Load the two gold-layer tables; `daily_sales` and `top_products` are reused
# by the plotting cells further down the notebook.
daily_sales = spark.read.table("retail_gold_daily_sales")
top_products = spark.read.table("retail_gold_top_products")

# Print each table's schema under its own heading (daily sales first).
for _heading, _frame in (
    ("📌 Daily Sales Schema:", daily_sales),
    ("📌 Top Products Schema:", top_products),
):
    print(_heading)
    _frame.printSchema()

# Render a 5-row preview of each table, in the same order as the schemas.
for _frame in (daily_sales, top_products):
    display(_frame.limit(5))


In [0]:
%sql
-- Top 10 countries by revenue: sums Revenue per Country over
-- retail_gold_daily_sales and returns the ten largest totals, highest first.
SELECT
  Country,
  SUM(Revenue) AS TotalRevenue
FROM
  retail_gold_daily_sales
GROUP BY
  Country
ORDER BY
  TotalRevenue DESC
LIMIT 10;


In [0]:
%sql
-- Best-selling products: the ten rows of retail_gold_top_products with the
-- largest TotalQuantity, highest first.
SELECT
  ProductCode,
  ProductDesc,
  TotalQuantity
FROM
  retail_gold_top_products
ORDER BY
  TotalQuantity DESC
LIMIT 10;


In [0]:
import matplotlib.pyplot as plt

# Keep only the rows whose Country is "Germany", sort them by Date, and bring
# the result to the driver as a pandas DataFrame for plotting.
is_germany = daily_sales["Country"] == "Germany"
germany_sales = daily_sales.where(is_germany).sort("Date").toPandas()

# Line chart of Revenue against Date, with a marker on every data point.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(germany_sales["Date"], germany_sales["Revenue"], marker="o")
ax.set_title("Daily Revenue - Germany")
ax.set_xlabel("Date")
ax.set_ylabel("Revenue (€)")
plt.xticks(rotation=45)  # tilt the x tick labels so they do not overlap
plt.show()


In [0]:
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, sum as _sum, date_trunc, round, to_date
# NOTE(review): this import rebinds `round` to the Spark column function,
# shadowing Python's built-in `round` for every cell that runs afterwards.
# `date_trunc`, `round` and `to_date` are not used in this cell; `col` and
# `_sum` are also used by later cells.

# Sum Revenue over all countries for each Date, then sort chronologically and
# convert to pandas for plotting.
revenue_by_day = daily_sales.groupBy("Date").agg(
    _sum("Revenue").alias("TotalRevenue")
)
all_sales = revenue_by_day.orderBy("Date").toPandas()

# Line chart of the per-day total with a dashed, semi-transparent grid.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(all_sales["Date"], all_sales["TotalRevenue"], marker="o", color="navy")
ax.set_title("Total Daily Revenue (All Countries)", fontsize=14)
ax.set_xlabel("Date")
ax.set_ylabel("Revenue (€)")
plt.xticks(rotation=45)
ax.grid(True, linestyle="--", alpha=0.6)
plt.show()


In [0]:
# Total Revenue per Country, keeping the ten largest totals (highest first).
# Uses `col` and `_sum`, which are imported in an earlier cell.
revenue_by_country = daily_sales.groupBy("Country").agg(
    _sum("Revenue").alias("TotalRevenue")
)
top_countries = (
    revenue_by_country
    .orderBy(col("TotalRevenue").desc())
    .limit(10)
    .toPandas()
)

# Horizontal bar chart; the y axis is flipped so the first (largest) row is
# drawn at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_countries["Country"], top_countries["TotalRevenue"], color="teal")
ax.set_title("Top 10 Countries by Total Revenue", fontsize=14)
ax.set_xlabel("Revenue (€)")
ax.invert_yaxis()  # Largest on top
plt.show()


In [0]:
# Histogram of the Revenue column of daily_sales.
# Only that one column is selected before converting to pandas.
revenue_dist = daily_sales.select("Revenue").toPandas()
revenue_values = revenue_dist["Revenue"].dropna()  # nulls are excluded from the histogram

fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(revenue_values, bins=30, color="purple", alpha=0.7)
ax.set_title("Distribution of Daily Revenue", fontsize=14)
ax.set_xlabel("Revenue (€)")
ax.set_ylabel("Frequency")
ax.grid(True, alpha=0.4)
plt.show()


In [0]:
# Ten products with the largest TotalQuantity, highest first, as a pandas frame.
# Uses `col`, which is imported in an earlier cell.
by_quantity_desc = col("TotalQuantity").desc()
top_products_pd = (
    top_products
    .orderBy(by_quantity_desc)
    .limit(10)
    .toPandas()
)

# Vertical bar chart, one bar per product description.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(top_products_pd["ProductDesc"], top_products_pd["TotalQuantity"], color="orange")
ax.set_title("Top 10 Products by Quantity Sold", fontsize=14)
ax.set_xlabel("Product")
ax.set_ylabel("Quantity Sold")
# Steep rotation with right-aligned labels for the tick text.
plt.xticks(rotation=75, ha="right")
plt.show()
