In [69]:
# Importiamo le librerie necessarie
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType, TimestampType
from pyspark.sql.functions import col, year, sum, to_timestamp, count, expr, max, count_distinct, expr, when, avg, min
from pyspark.sql import Window
from pyspark.sql.dataframe import DataFrame
import numpy as np

# df_updated = df.withColumns({
#     "Country": when((col("Country") == "America") & (col("Population") > 10000), "North America").otherwise(col("Country")),
#     "Region": when((col("Country") == "America") & (col("Population") > 10000), "NA").otherwise(col("Region"))
# })

# df_updated = df.withColumn(
#     "Country",
#     when(col("Country") == "America", 
#          when(col("Population") > 10000, "North America")
#          .when(col("Population") > 5000, "Central America")
#          .otherwise("South America"))
#     .otherwise(col("Country"))
# )

# df_updated = df.withColumn(
#     "Country",
#     expr("CASE WHEN Country = 'America' AND Population > 10000 THEN 'North America' ELSE Country END")
# )

# CASE 
#     WHEN Country = 'America' AND Population > 10000 THEN 'North America' 
#     WHEN Country = 'America' AND Population > 5000 THEN 'Central America'
#     WHEN Country = 'America' THEN 'South America' 
#     ELSE Country
# END


# Supponiamo che SparkSession sia già stato creato
ss: SparkSession = SparkSession.builder.appName("PoliSalesAnalysis").getOrCreate()

# Variabili per i percorsi di input e output
# Percorsi dei file di input e output
jupyter = False
if jupyter:
    input_prefix = "/user/s339450/esami/20240912/"
    output_prefix= "/user/s339450/esami/20240912/out/"
else:
    input_prefix = ".\\data\\"
    output_prefix= ".\\out\\"

invitations_path = f"{input_prefix}Invitations.txt"
meetings_path = f"{input_prefix}Meetings.txt"
users_path = f"{input_prefix}Users.txt"
output_folder_1 = f"{output_prefix}1/"
output_folder_2 = f"{output_prefix}2/"

In [70]:
invitations_schema = StructType([
    StructField("MID", StringType(), False),
    StructField("UID", StringType(), False),
    StructField("Accepted", StringType(), False)
])

invitations: DataFrame = ss.read.load(invitations_path,
    format="csv",
    header=False,
    schema=invitations_schema,
    sep=",")

invitations.show()

meetings_schema = StructType([
    StructField("MID", StringType(), False),
    StructField("Title", StringType(), False),
    StructField("StartTime", StringType(), False),
    StructField("Duration", IntegerType(), False),
    StructField("OrganizerUID", StringType(), False)
])

meetings: DataFrame = ss.read.load(meetings_path,
    format="csv",
    header=False,
    schema=meetings_schema,
    sep=",")

# Conversione del timestamp
meetings = meetings.withColumn(
    "StartTime",
    to_timestamp(col("StartTime"), "yyyy/MM/dd-HH:mm:ss")  # Adatta il formato del timestamp
)

meetings.show()

users_schema = StructType([
    StructField("UID", StringType(), False),
    StructField("Name", StringType(), False),
    StructField("Surname", StringType(), False),
    StructField("DateOfBirth", StringType(), False),
    StructField("PricingPlan", StringType(), False)
])

users: DataFrame = ss.read.load(users_path,
    format="csv",
    header=False,
    schema=users_schema,
    sep=",")

# Conversione del timestamp
users = users.withColumn(
    "DateOfBirth",
    to_timestamp(col("DateOfBirth"), "yyyy/MM/dd")  # Adatta il formato del timestamp
)

users.show()

+-------+--------+--------+
|    MID|     UID|Accepted|
+-------+--------+--------+
|MID1034|User1000|     Yes|
|MID1034|User1001|      No|
|MID1035|User1002| Unknown|
|MID1036|User1003|     Yes|
|MID1037|User1004|      No|
|MID1038|User1000|     Yes|
|MID1038|User1002|     Yes|
+-------+--------+--------+

+-------+--------------------+-------------------+--------+------------+
|    MID|               Title|          StartTime|Duration|OrganizerUID|
+-------+--------------------+-------------------+--------+------------+
|MID1034|Polito project ki...|2023-02-07 20:40:00|      90|    User1000|
|MID1035|  Marketing Strategy|2023-03-15 10:00:00|      60|    User1001|
|MID1036|           Tech Sync|2023-04-01 15:30:00|      45|    User1002|
|MID1037|Annual General Me...|2023-05-10 09:00:00|     120|    User1003|
|MID1038|      Product Launch|2023-06-20 14:00:00|      75|    User1004|
|MID1039|Polito project ki...|2023-02-07 20:40:00|      90|    User1000|
+-------+--------------------+----

# Punto 1

In [71]:
response1 = (
    meetings
    .join(users, col("OrganizerUID") == col("UID"))
    .filter(col("PricingPlan") == "Business")
    .groupBy(["UID"]).agg(avg(col("Duration")), max(col("Duration")), min(col("Duration")))
)

response1.show()

+--------+-------------+-------------+-------------+
|     UID|avg(Duration)|max(Duration)|min(Duration)|
+--------+-------------+-------------+-------------+
|User1000|         90.0|           90|           90|
|User1003|        120.0|          120|          120|
+--------+-------------+-------------+-------------+



In [72]:
response1.write.mode("overwrite").csv(output_folder_1)

# Punto 2

In [None]:
response2 = (
    meetings
    .join(users, col("OrganizerUID") == col("UID"))
    .filter(col("PricingPlan") == "Business")
    .select(col("MID"), col("OrganizerUID"))
    .join(invitations, on="MID", how="left")
    .groupBy("MID")
    .agg(count("*"))
    .withColumnRenamed("count(1)", "numberOfInvitations")
    .withColumn(
        "TypeOfMeeting",
        expr(
            """
            CASE 
                WHEN numberOfInvitations < 5 THEN 'Small' 
                WHEN numberOfInvitations >= 5 AND numberOfInvitations < 20 THEN 'Medium'
                WHEN numberOfInvitations >= 20 THEN 'Big' 
                ELSE 'None'
            END
            """
        )
    )
)

response2.show()

+-------+-------------------+-------------+
|    MID|numberOfInvitations|TypeOfMeeting|
+-------+-------------------+-------------+
|MID1034|                  2|        Small|
|MID1039|                  1|        Small|
|MID1037|                  1|        Small|
+-------+-------------------+-------------+

