In [2]:
# Importiamo le librerie necessarie
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import col, year, sum
from pyspark.sql.dataframe import DataFrame
import numpy as np


# Get the active SparkSession, or create one named after this analysis
session_builder = SparkSession.builder.appName("PoliSalesAnalysis")
ss = session_builder.getOrCreate()

# Input and output locations.
# `jupyter` selects the absolute /user/... folders when True and the relative
# local folders ("data/", "out/") when False.
jupyter = False
input_prefix, output_prefix = (
    ("/user/s339450/esami/20240912/", "/user/s339450/esami/20240912/out/")
    if jupyter
    else ("data/", "out/")
)

# Input files
products_path = input_prefix + "products.txt"
prices_path = input_prefix + "prices.txt"
sales_path = input_prefix + "sales.txt"

# Output folder of part 1: products with decreased sales
output_folder_1 = output_prefix + "path_to_output_folder_1"
# Output folder of part 2: best-selling product(s) per year
output_folder_2 = output_prefix + "path_to_output_folder_2"



In [3]:
# Explicit schemas for the three header-less input files (all fields nullable)

# Products.txt: one row per product
products_schema = (
    StructType()
    .add("ProductID", StringType(), True)
    .add("Name", StringType(), True)
    .add("Category", StringType(), True)
)

# Prices.txt: price of a product over a date interval
prices_schema = (
    StructType()
    .add("ProductID", StringType(), True)
    .add("StartingDate", DateType(), True)
    .add("EndingDate", DateType(), True)
    .add("Price", IntegerType(), True)
)

# Sales.txt: units of a product sold on a date
sales_schema = (
    StructType()
    .add("ProductID", StringType(), True)
    .add("Date", DateType(), True)
    .add("NumberOfProductsSold", IntegerType(), True)
)

def _read_headerless_csv(path: str, schema: StructType) -> DataFrame:
    """Load the header-less CSV file at `path` into a DataFrame using `schema`."""
    return ss.read.load(path, format="csv", header=False, schema=schema)


# One DataFrame per input file
products: DataFrame = _read_headerless_csv(products_path, products_schema)
prices: DataFrame = _read_headerless_csv(prices_path, prices_schema)
sales: DataFrame = _read_headerless_csv(sales_path, sales_schema)

# Debug: preview every input DataFrame, then display the number of sales rows
for frame in (products, prices, sales):
    frame.show()
sales.count()

+---------+------------+-----------+
|ProductID|        Name|   Category|
+---------+------------+-----------+
|       P1|  Television|Electronics|
|       P2|Refrigerator| Appliances|
|       P3|      Laptop|Electronics|
|       P4|        Sofa|  Furniture|
|       P5|   Microwave| Appliances|
+---------+------------+-----------+

+---------+------------+----------+-----+
|ProductID|StartingDate|EndingDate|Price|
+---------+------------+----------+-----+
|       P1|  2019-01-01|2019-12-31|  500|
|       P1|  2021-01-01|2021-12-31|  450|
|       P2|  2019-01-01|2019-12-31|  800|
|       P2|  2021-01-01|2021-12-31|  750|
|       P3|  2019-01-01|2019-12-31| 1000|
|       P3|  2021-01-01|2021-12-31|  950|
|       P4|  2019-01-01|2019-12-31|  300|
|       P4|  2021-01-01|2021-12-31|  320|
|       P5|  2019-01-01|2019-12-31|  200|
|       P5|  2021-01-01|2021-12-31|  190|
+---------+------------+----------+-----+

+---------+----------+--------------------+
|ProductID|      Date|NumberOfPro

22

# Punto 1

In [4]:
# Part 1: products whose total units sold in 2021 are lower than in 2019.

# Name Spark gives to the column produced by the dict-style aggregation below.
# It is referenced with its exact casing so the selections keep working even
# when the session runs with spark.sql.caseSensitive=true.
total_sold_col = "sum(NumberOfProductsSold)"

# Keep only the sales of the two years being compared (this is a filter on
# sales; no join with products is involved).
sales_filtered: DataFrame = sales.filter(year(col("Date")).isin(2019, 2021))

# Total units sold per product and per year
sales_for_year = sales_filtered.groupBy(
    "ProductID", year(col("Date")).alias("Year")
).agg({"NumberOfProductsSold": "sum"})

# One DataFrame per year, with the total renamed after the year it refers to
sales_2019 = sales_for_year.filter(col("Year") == 2019).select(
    "ProductID", col(total_sold_col).alias("NumberOfProductsSold2019")
)
sales_2021 = sales_for_year.filter(col("Year") == 2021).select(
    "ProductID", col(total_sold_col).alias("NumberOfProductsSold2021")
)

# NOTE(review): the inner join keeps only products that have sales in both
# years, so a product sold in 2019 but never in 2021 is not reported. Confirm
# this matches the requirement.
decreased_sales = sales_2019.join(sales_2021, "ProductID").filter(
    col("NumberOfProductsSold2019") > col("NumberOfProductsSold2021")
)

In [5]:
# Save only the product identifiers of part 1 as CSV, replacing any previous output
decreased_product_ids = decreased_sales.select("ProductID")
decreased_product_ids.write.mode("overwrite").csv(output_folder_1)

# Punto 2

In [6]:
# Part 2, step 1: total units sold for every (product, year) pair.
# The dict-style aggregation names the result column "sum(NumberOfProductsSold)".
sales_by_product_year = sales.groupBy("ProductID", year(col("Date")).alias("Year"))
sales_for_product_and_year = sales_by_product_year.agg({"NumberOfProductsSold": "sum"})
sales_for_product_and_year.show()

+---------+----+-------------------------+
|ProductID|Year|sum(NumberOfProductsSold)|
+---------+----+-------------------------+
|       P1|2019|                       15|
|       P3|2021|                       65|
|       P5|2021|                       44|
|       P5|2020|                       28|
|       P2|2019|                       35|
|       P4|2021|                       35|
|       P1|2021|                       12|
|       P2|2021|                       30|
|       P4|2019|                       25|
|       P5|2019|                       40|
|       P3|2019|                       55|
+---------+----+-------------------------+



In [27]:
# RDD view of the per-product/per-year totals.
# NOTE(review): in this notebook it is only referenced by the commented-out
# RDD-based cells. No collect() is issued on it here: the result would be
# discarded while still pulling every row to the driver.
sales_for_product_and_year_rdd = sales_for_product_and_year.rdd

# Part 2, step 2: highest per-product total within each year.
# The result column is named "max(sum(NumberOfProductsSold))".
max_per_year_df = sales_for_product_and_year.groupBy(col('Year')).agg({"sum(NumberOfProductsSold)": "max"})
max_per_year_df.show()

+----+------------------------------+
|Year|max(sum(NumberOfProductsSold))|
+----+------------------------------+
|2019|                            55|
|2020|                            28|
|2021|                            65|
+----+------------------------------+



In [None]:
# max_per_year = sales_for_product_and_year_rdd.map(lambda x: (x['Year'], x['sum(NumberOfProductsSold)'])).reduceByKey(lambda v1, v2: max(v1, v2))
# max_per_year.collect()

[(2019, 55), (2021, 65), (2020, 28)]

In [None]:
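# Map each element of max_per_year to a pair with
#   key   = (year, max count per year)
#   value = None
# so it can be joined with the per-product totals keyed the same way.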
# yearMaxNone = max_per_year.map(lambda p: (p, None))
# maxPidPerYear = sales_for_product_and_year_rdd.map(lambda p: ( (p['Year'], p['sum(NumberOfProductsSold)']), p['ProductID'])).join(yearMaxNone)
# maxPidPerYear.collect()

[((2021, 65), ('P3', None)),
 ((2020, 28), ('P5', None)),
 ((2019, 55), ('P3', None))]

In [28]:
# max_per_year_df: DataFrame = ss.createDataFrame(max_per_year, ['Year', 'NumberOfProductsSoldMax'])
# Part 2, step 3: keep the (product, year) rows whose total equals that year's
# maximum. Products tied on the maximum are all kept.
same_year = sales_for_product_and_year['Year'] == max_per_year_df['Year']
is_year_max = (
    sales_for_product_and_year['sum(NumberOfProductsSold)']
    == max_per_year_df['max(sum(NumberOfProductsSold))']
)

# A semi join returns only the left-side columns of the matching rows
pid_max_for_year = sales_for_product_and_year.join(
    max_per_year_df,
    same_year & is_year_max,
    "semi",
)
pid_max_for_year.show()

+---------+----+-------------------------+
|ProductID|Year|sum(NumberOfProductsSold)|
+---------+----+-------------------------+
|       P3|2021|                       65|
|       P5|2020|                       28|
|       P3|2019|                       55|
+---------+----+-------------------------+



In [32]:
# Save the best-selling product(s) of each year as CSV, replacing any previous output
best_sellers = pid_max_for_year.select("ProductID", "Year")
best_sellers.write.mode("overwrite").csv(output_folder_2)