In [20]:
# Importiamo le librerie necessarie
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType, TimestampType
from pyspark.sql.functions import col, year, sum, to_timestamp, count, expr, max, count_distinct, expr, when
from pyspark.sql import Window
from pyspark.sql.dataframe import DataFrame
import numpy as np

# df_updated = df.withColumns({
#     "Country": when((col("Country") == "America") & (col("Population") > 10000), "North America").otherwise(col("Country")),
#     "Region": when((col("Country") == "America") & (col("Population") > 10000), "NA").otherwise(col("Region"))
# })

# df_updated = df.withColumn(
#     "Country",
#     when(col("Country") == "America", 
#          when(col("Population") > 10000, "North America")
#          .when(col("Population") > 5000, "Central America")
#          .otherwise("South America"))
#     .otherwise(col("Country"))
# )

# df_updated = df.withColumn(
#     "Country",
#     expr("CASE WHEN Country = 'America' AND Population > 10000 THEN 'North America' ELSE Country END")
# )

# CASE 
#     WHEN Country = 'America' AND Population > 10000 THEN 'North America' 
#     WHEN Country = 'America' AND Population > 5000 THEN 'Central America'
#     WHEN Country = 'America' THEN 'South America' 
#     ELSE Country
# END


# Obtain the SparkSession used by every cell below; getOrCreate() returns the
# already-active session when there is one instead of building a second one.
ss: SparkSession = (
    SparkSession.builder
    .appName("PoliSalesAnalysis")
    .getOrCreate()
)

# Input and output locations.
# `jupyter` switches between the absolute paths used on the Jupyter server
# (presumably the cluster home directory -- confirm) and a local checkout that
# keeps the data next to the notebook.
jupyter = False
if jupyter:
    input_prefix = "/user/s339450/esami/20240912/"
    output_prefix = "/user/s339450/esami/20240912/out/"
else:
    # Forward slashes resolve on Windows as well as on POSIX systems; the
    # earlier ".\\data\\" form was only a valid relative path on Windows
    # (elsewhere the backslashes become part of the file name).
    input_prefix = "./data/"
    output_prefix = "./out/"

# Input files, one per table.
customers_path = f"{input_prefix}Customers.txt"
customer_watched_path = f"{input_prefix}CustomerWatched.txt"
tv_series_path = f"{input_prefix}TvSeries.txt"
episodes_path = f"{input_prefix}Episodes.txt"

# Output folders, one per exercise.
output_folder_1 = f"{output_prefix}1/"
output_folder_2 = f"{output_prefix}2/"

In [21]:
# Customers table: headerless, comma-separated file whose five columns are all
# declared as non-nullable strings.
customers_schema = StructType([
    StructField(field_name, StringType(), False)
    for field_name in ("CID", "Name", "Surname", "City", "Country")
])

customers: DataFrame = (
    ss.read
    .format("csv")
    .schema(customers_schema)
    .option("header", False)
    .option("sep", ",")
    .load(customers_path)
)

customers.show()

# TV series table: headerless, comma-separated file with three columns, all
# declared as non-nullable strings.
tv_series_schema = StructType([
    StructField(field_name, StringType(), False)
    for field_name in ("SID", "Title", "Genre")
])

tv_series: DataFrame = (
    ss.read
    .format("csv")
    .schema(tv_series_schema)
    .option("header", False)
    .option("sep", ",")
    .load(tv_series_path)
)

tv_series.show()

# Episodes table: headerless, comma-separated file. OriginalAirDate is read as
# a plain string first and parsed afterwards.
episodes_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("SID", StringType()),
        ("SeasonNumber", IntegerType()),
        ("EpisodeNumber", IntegerType()),
        ("Title", StringType()),
        ("OriginalAirDate", StringType()),
    )
])

episodes: DataFrame = (
    ss.read
    .format("csv")
    .schema(episodes_schema)
    .option("header", False)
    .option("sep", ",")
    .load(episodes_path)
    # Timestamp conversion: replace the string column with a parsed timestamp.
    # Adapt the pattern if the file uses a different date layout.
    .withColumn("OriginalAirDate", to_timestamp(col("OriginalAirDate"), "yyyy/MM/dd"))
)

episodes.show()

# Viewing log: headerless, comma-separated file. StartTimestamp is read as a
# plain string first and parsed afterwards.
customer_watched_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("CID", StringType()),
        ("StartTimestamp", StringType()),
        ("SID", StringType()),
        ("SeasonNumber", IntegerType()),
        ("EpisodeNumber", IntegerType()),
    )
])

customer_watched: DataFrame = (
    ss.read
    .format("csv")
    .schema(customer_watched_schema)
    .option("header", False)
    .option("sep", ",")
    .load(customer_watched_path)
    # Timestamp conversion: replace the string column with a parsed timestamp.
    # Adapt the pattern if the file uses a different timestamp layout.
    .withColumn("StartTimestamp", to_timestamp(col("StartTimestamp"), "yyyy/MM/dd-HH:mm"))
)

customer_watched.show()

+-----+------+--------+--------+-------+
|  CID|  Name| Surname|    City|Country|
+-----+------+--------+--------+-------+
|CID10|  John|  Bianco|   Turin|  Italy|
|CID20|  Emma|   Smith|New York|    USA|
|CID30| James| Johnson|  London|     UK|
|CID40| Linda|Martinez|  Madrid|  Spain|
|CID50|Robert|  Garcia|   Paris| France|
+-----+------+--------+--------+-------+

+-----+---------------+-------+
|  SID|          Title|  Genre|
+-----+---------------+-------+
|SID15|        Friends| Comedy|
|SID20|     The Office| Comedy|
|SID25|Game of Thrones|Fantasy|
|SID30|Stranger Things| Sci-Fi|
|SID35|   Breaking Bad|  Drama|
+-----+---------------+-------+

+-----+------------+-------------+--------------------+-------------------+
|  SID|SeasonNumber|EpisodeNumber|               Title|    OriginalAirDate|
+-----+------------+-------------+--------------------+-------------------+
|SID15|           1|            1|               Pilot|1994-09-22 00:00:00|
|SID15|           1|            2|The

# Punto 1

In [22]:
# Average number of episodes per season for each Comedy TV series.
#
# Result columns are named with explicit aliases. Renaming Spark's
# auto-generated names ("count(1)", "(sum(NumEpisodes) / count(1))") via
# withColumnRenamed is a silent no-op whenever the generated name differs,
# which would leave the columns unnamed as intended without any error.
#
# NOTE: `sum` and `count` here are the pyspark.sql.functions aggregates
# imported at the top of the notebook, not the Python built-ins.
response1 = (
    episodes.join(tv_series, "SID")
    .filter(col("Genre") == "Comedy")
    # Step 1: number of episodes in every (series, season) pair.
    .groupBy(col("SID"), col("SeasonNumber"))
    .agg(count("*").alias("NumEpisodes"))
    # Step 2: total episodes divided by the number of seasons, per series.
    .groupBy(col("SID"))
    .agg(
        (sum(col("NumEpisodes")) / count("*")).alias(
            "Average number of episodes per season for the TV series SID"
        )
    )
)

response1.show()

+-----+-----------------------------------------------------------+
|  SID|Average number of episodes per season for the TV series SID|
+-----+-----------------------------------------------------------+
|SID20|                                                        2.0|
|SID15|                                                        1.5|
+-----+-----------------------------------------------------------+



# Punto 2

In [32]:
# Number of DISTINCT episodes each customer watched, per (series, season).
#
# customer_watched holds one row per viewing (CID, StartTimestamp, episode), so
# the same episode may appear several times for one customer. Counting rows
# with count("*") would inflate the figure on re-watches and break the later
# equality comparison against the season's episode count; counting distinct
# EpisodeNumber values does not.
#
# The column is named with an explicit alias rather than by renaming Spark's
# auto-generated aggregate name, which would silently do nothing if that name
# differed.
response2 = (
    customer_watched
    .groupBy("CID", "SID", "SeasonNumber")
    .agg(count_distinct(col("EpisodeNumber")).alias("TotalNumberOfEpisodes"))
)

response2.show()

# Number of episodes in every (series, season) pair of the catalogue.
#
# The column is named with an explicit alias: renaming Spark's auto-generated
# "count(1)" through withColumnRenamed is a silent no-op if the generated name
# differs, which would leave the column without the expected name.
response3 = (
    episodes
    .groupBy(col("SID"), col("SeasonNumber"))
    .agg(count("*").alias("TotalNumberOfEpisodes"))
)

response3.show()

# Inner equi-join on series, season AND episode count: a response2 row survives
# only when its TotalNumberOfEpisodes equals the value response3 reports for
# the same (SID, SeasonNumber).
response4 = response2.join(
    response3,
    on=["SID", "SeasonNumber", "TotalNumberOfEpisodes"],
    how="inner",
)

response4.show()

+-----+-----+------------+---------------------+
|  CID|  SID|SeasonNumber|TotalNumberOfEpisodes|
+-----+-----+------------+---------------------+
|CID10|SID20|           1|                    1|
|CID10|SID15|           1|                    2|
|CID50|SID30|           1|                    1|
|CID30|SID35|           1|                    1|
|CID50|SID25|           1|                    1|
|CID20|SID15|           2|                    1|
|CID40|SID20|           1|                    1|
+-----+-----+------------+---------------------+

+-----+------------+---------------------+
|  SID|SeasonNumber|TotalNumberOfEpisodes|
+-----+------------+---------------------+
|SID15|           1|                    2|
|SID20|           1|                    2|
|SID35|           1|                    1|
|SID30|           1|                    1|
|SID15|           2|                    1|
|SID25|           1|                    1|
+-----+------------+---------------------+

+-----+------------+---------