In [45]:
# Importiamo le librerie necessarie
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType, TimestampType
from pyspark.sql.functions import col, year, sum, to_timestamp, count, expr, max, count_distinct
from pyspark.sql import Window
from pyspark.sql.dataframe import DataFrame
import numpy as np

# Reuse the active SparkSession if one exists; otherwise start a new one
# registered under the application name "PoliSalesAnalysis".
ss = (
    SparkSession.builder
    .appName("PoliSalesAnalysis")
    .getOrCreate()
)

# Input/output locations.
# `jupyter` selects between the cluster (HDFS) locations and a local checkout.
jupyter = False
if jupyter:
    input_prefix = "/user/s339450/esami/20240912/"
    output_prefix = "/user/s339450/esami/20240912/out/"
else:
    # Forward slashes are used on purpose: they are accepted on Windows as well,
    # whereas backslash separators would be treated as part of the file name on
    # Linux/macOS and make the local run Windows-only.
    input_prefix = "./data/"
    output_prefix = "./out/"

# Input files
catalouges_path = f"{input_prefix}Catalouges.txt"
users_path = f"{input_prefix}Users.txt"
purchases_path = f"{input_prefix}Purchases.txt"

# One output folder per exercise point
output_folder_1 = f"{output_prefix}1/"
output_folder_2 = f"{output_prefix}2/"

In [46]:
# Catalogue file: headerless CSV whose four fields are all read as strings
# (each declared with nullable=False).
catalouges_schema = StructType([
    StructField(field_name, StringType(), False)
    for field_name in ("ItemID", "Name", "Category", "StillinProduction")
])

catalouges: DataFrame = (
    ss.read
    .format("csv")
    .schema(catalouges_schema)
    .option("header", False)
    .load(catalouges_path)
)

# Quick visual check of the loaded rows
catalouges.show()

# Users file: headerless CSV whose five fields are all read as strings
# (each declared with nullable=False).
users_schema = StructType([
    StructField(field_name, StringType(), False)
    for field_name in ("UserID", "Name", "Surname", "City", "Country")
])

users: DataFrame = ss.read.csv(users_path, schema=users_schema, header=False)

# Quick visual check of the loaded rows
users.show()

# Purchases file: headerless CSV. The timestamp is first read as a plain string
# and converted to a real timestamp right after loading.
purchases_schema = StructType([
    StructField("SaleTimestamp", StringType(), False),
    StructField("UserID", StringType(), False),
    StructField("ItemID", StringType(), False),
    # The fourth field is numeric (DoubleType). It used to be labelled "Country",
    # which looks like a copy-paste from users_schema and would also clash with
    # users.Country if the two frames were ever joined.
    # NOTE(review): "SalePrice" is the assumed intended name -- confirm against
    # the dataset specification.
    StructField("SalePrice", DoubleType(), False)
])

purchases_raw: DataFrame = ss.read.load(purchases_path,
    format="csv",
    header=False,
    schema=purchases_schema)

# Parse the string column with the explicit pattern "yyyy/MM/dd-HH:mm:ss" so that
# date functions such as year() can be applied to it in the queries below.
purchases: DataFrame = purchases_raw.withColumn(
    "SaleTimestamp",
    to_timestamp(col("SaleTimestamp"), "yyyy/MM/dd-HH:mm:ss")
)

# Quick visual check of the loaded rows
purchases.show()

+-------+------------+--------------+-----------------+
| ItemID|        Name|      Category|StillinProduction|
+-------+------------+--------------+-----------------+
|Item123| SmartphoneX|   Electronics|             True|
|Item234|     LaptopY|   Electronics|            False|
|Item345|    BlenderZ|HomeAppliances|             True|
|Item456| T-ShirtBlue|      Clothing|             True|
|Item567|RunningShoes|      Clothing|            False|
|Item678|     BookABC|         Books|             True|
|Item789| GameConsole|   Electronics|             True|
|Item890|  SmartWatch|   Electronics|             True|
+-------+------------+--------------+-----------------+

+------+-------+--------+-----------+--------+
|UserID|   Name| Surname|       City| Country|
+------+-------+--------+-----------+--------+
|User01|   John|     Doe|   New York|     USA|
|User02|   Jane|   Smith|     London|      UK|
|User03|  Paolo|   Garza|      Turin|   Italy|
|User04|    Mei|      Li|    Beijing|   China

# Point 1

In [47]:
# Point 1: for each of the years 2022 and 2023, keep the user(s) whose number of
# purchases equals the maximum reached by any user in that year.

# Window covering all (UserID, Year) rows belonging to the same year
window_spec = Window.partitionBy("Year")

response1 = (
    purchases
    # Only purchases made in 2022 or 2023
    .where(year(col("SaleTimestamp")).isin(2022, 2023))
    # Number of purchases per user and year
    .groupBy(col("UserID"), year(col("SaleTimestamp")).alias("Year"))
    .agg(count("*").alias("TotalPurchases"))
    # Attach the per-year maximum to every row, then keep the rows that reach it
    .withColumn("MaxPurchaseForEachYear", max("TotalPurchases").over(window_spec))
    .where(col("TotalPurchases") == col("MaxPurchaseForEachYear"))
    .select("UserID")
)

# Show the result
response1.show()

# Persist the result as CSV, replacing any previous output
response1.write.mode("overwrite").csv(output_folder_1)

+------+
|UserID|
+------+
|User04|
|User02|
|User03|
|User05|
+------+



# Point 2

In [57]:
# Point 2: for each (year, category) pair in 2022-2023, keep the item(s) bought
# by the largest number of distinct users.

# Window covering all items of the same category within the same year
window_spec = Window.partitionBy("Year", "Category")

response2 = (
    purchases
    # Bring in the category of every purchased item
    .join(catalouges, "ItemID")
    # Only purchases made in 2022 or 2023
    .where(year(col("SaleTimestamp")).isin(2022, 2023))
    # Distinct buyers per item (and its category) in each year
    .groupBy(col("ItemID"), col("Category"), year(col("SaleTimestamp")).alias("Year"))
    .agg(count_distinct(col("UserID")).alias("DistinctUserIdForYear"))
    # Attach the per-(year, category) maximum, then keep the rows that reach it
    .withColumn("MaxDistinctUserIDForEachYear", max(col("DistinctUserIdForYear")).over(window_spec))
    .where(col("DistinctUserIdForYear") == col("MaxDistinctUserIDForEachYear"))
    .select("Category", "ItemID")
)

# Show the result
response2.show()

# Persist the result as CSV, replacing any previous output
response2.write.mode("overwrite").csv(output_folder_2)

+-----------+-------+
|   Category| ItemID|
+-----------+-------+
|   Clothing|Item456|
|      Books|Item678|
|Electronics|Item890|
|Electronics|Item789|
+-----------+-------+

