In [21]:
# Importiamo le librerie necessarie
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType, TimestampType
from pyspark.sql.functions import col, year, sum, to_timestamp, count, expr, max, count_distinct, expr, when, avg, min, month, floor, lag, desc
from pyspark.sql import Window
from pyspark.sql.dataframe import DataFrame
import numpy as np

# df_updated = df.withColumns({
#     "Country": when((col("Country") == "America") & (col("Population") > 10000), "North America").otherwise(col("Country")),
#     "Region": when((col("Country") == "America") & (col("Population") > 10000), "NA").otherwise(col("Region"))
# })

# df_updated = df.withColumn(
#     "Country",
#     when(col("Country") == "America", 
#          when(col("Population") > 10000, "North America")
#          .when(col("Population") > 5000, "Central America")
#          .otherwise("South America"))
#     .otherwise(col("Country"))
# )

# df_updated = df.withColumn(
#     "Country",
#     expr("CASE WHEN Country = 'America' AND Population > 10000 THEN 'North America' ELSE Country END")
# )

# CASE 
#     WHEN Country = 'America' AND Population > 10000 THEN 'North America' 
#     WHEN Country = 'America' AND Population > 5000 THEN 'Central America'
#     WHEN Country = 'America' THEN 'South America' 
#     ELSE Country
# END

# response2 = (
#     monthly_water_consumption
#     .withColumn("Year", year(col("Month")))
#     .groupBy(col("HID"), col("Year"))
#     .agg(sum("M3").alias("AnnualM3"))
#     .withColumn("PreviousAnnualM3", lag("AnnualM3").over(
#         Window
#         .partitionBy("HID")
#         .orderBy(col("Year"))
#     ))
#     .filter(col("PreviousAnnualM3") > col("AnnualM3"))
# )

# .withColumn(
#     "HighNumberOfCitiesForCountry",
#     when(col("HighNumberOfCitiesForCountry").isNull(), 0)
#     .otherwise(col("HighNumberOfCitiesForCountry"))
# )
    


# Obtain the SparkSession: getOrCreate() returns the active session when one
# already exists, otherwise it builds a new one with this application name.
ss: SparkSession = SparkSession.builder.appName("PoliSalesAnalysis").getOrCreate()

# Input/output locations.
# `jupyter` selects between the absolute /user/... paths and the local
# relative (Windows-style) paths.
jupyter = False
input_prefix, output_prefix = (
    ("/user/s339450/esami/20240912/", "/user/s339450/esami/20240912/out/")
    if jupyter
    else (".\\data\\", ".\\out\\")
)

# Input files.
applied_patches_path = input_prefix + "AppliedPatches.txt"
patches_path = input_prefix + "Patches.txt"
servers_path = input_prefix + "servers.txt"

# Output folders, one per exercise point.
output_folder_1 = output_prefix + "1/"
output_folder_2 = output_prefix + "2/"

In [22]:
# AppliedPatches.txt: headerless, comma-separated; columns PID, SID, ApplicationDate.
# Every column is read as a string; the date is parsed right after loading.
applied_patches_schema = StructType([
    StructField(name, StringType(), False)
    for name in ("PID", "SID", "ApplicationDate")
])

applied_patches: DataFrame = (
    ss.read
    .format("csv")
    .schema(applied_patches_schema)
    .option("header", False)
    .option("sep", ",")
    .load(applied_patches_path)
    # Parse the "yyyy/MM/dd" text into a timestamp (adapt the pattern if the file format differs).
    .withColumn("ApplicationDate", to_timestamp(col("ApplicationDate"), "yyyy/MM/dd"))
)

applied_patches.show()

# Patches.txt: headerless, comma-separated; columns PID, ReleaseDate, OperatingSystem.
patches_schema = StructType([
    StructField(name, StringType(), False)
    for name in ("PID", "ReleaseDate", "OperatingSystem")
])

patches: DataFrame = (
    ss.read
    .format("csv")
    .schema(patches_schema)
    .option("header", False)
    .option("sep", ",")
    .load(patches_path)
    # Parse the "yyyy/MM/dd" text into a timestamp (adapt the pattern if the file format differs).
    .withColumn("ReleaseDate", to_timestamp(col("ReleaseDate"), "yyyy/MM/dd"))
)

patches.show()

# servers.txt: headerless, comma-separated; columns SID, OperatingSystem, Model.
servers_schema = StructType([
    StructField(name, StringType(), False)
    for name in ("SID", "OperatingSystem", "Model")
])

servers: DataFrame = (
    ss.read
    .format("csv")
    .schema(servers_schema)
    .option("header", False)
    .option("sep", ",")
    .load(servers_path)
)

servers.show()

+---------+---+-------------------+
|      PID|SID|    ApplicationDate|
+---------+---+-------------------+
|PIDW10_22|S10|2022-02-21 00:00:00|
|PIDW10_23|S10|2022-03-10 00:00:00|
|PIDW10_24|S11|2022-04-11 00:00:00|
|PIDW10_25|S14|2022-07-01 00:00:00|
|PIDW10_26|S13|2022-08-20 00:00:00|
|PIDW10_27|S12|2022-09-15 00:00:00|
|PIDW10_22|S11|2022-02-21 00:00:00|
|PIDW10_22|S13|2022-02-21 00:00:00|
|PIDW10_22|S14|2022-02-21 00:00:00|
+---------+---+-------------------+

+---------+-------------------+---------------+
|      PID|        ReleaseDate|OperatingSystem|
+---------+-------------------+---------------+
|PIDW10_22|2022-02-21 00:00:00|        Ubuntu6|
|PIDW10_23|2022-02-20 00:00:00|        Ubuntu6|
|PIDW10_24|2022-03-15 00:00:00|      Windows10|
|PIDW10_25|2022-06-10 00:00:00|      Windows10|
|PIDW10_26|2022-07-01 00:00:00|        RedHat8|
|PIDW10_27|2022-08-05 00:00:00|        Ubuntu6|
+---------+-------------------+---------------+

+---+---------------+---------------+
|SID|Operati

# Punto 1

In [23]:
# Point 1: PIDs of Ubuntu6 patches that have more than 2 application rows whose
# ApplicationDate equals the patch ReleaseDate.
response1 = (
    applied_patches
    # Inner join on PID brings ReleaseDate and OperatingSystem next to each application.
    .join(patches, on="PID")
    .filter(
        (col("OperatingSystem") == "Ubuntu6")
        & (col("ReleaseDate") == col("ApplicationDate"))
    )
    .groupBy("PID")
    # Explicit alias: do not rely on Spark's auto-generated aggregate column name.
    .agg(count("*").alias("NumSameDayApplications"))
    # NOTE(review): the threshold 2 is hard-coded — confirm it matches the requirement.
    .filter(col("NumSameDayApplications") > 2)
    .select("PID")
)

response1.show()

+---------+
|      PID|
+---------+
|PIDW10_22|
+---------+



# Punto 2

In [36]:
# Point 2: for every server in `servers`, the number of months (out of 12) in which
# no patch application is recorded for that server.
# NOTE(review): month() discards the year, so applications from different years
# fall into the same month bucket — confirm the input covers a single year.
response2 = (
    applied_patches
    .withColumn("Month", month(col("ApplicationDate")))
    .groupBy("SID")
    # Explicit aliases: do not rely on Spark's auto-generated column names.
    .agg(count_distinct("Month").alias("NumMonthApplied"))
    .withColumn("NumMonthNotApplied", 12 - col("NumMonthApplied"))
    # Right join keeps every server, including those that never appear in applied_patches.
    .join(servers, "SID", "right")
    # Servers without any application have a null count: all 12 months are patch-free.
    .fillna(12, subset=["NumMonthNotApplied"])
    .select("SID", "NumMonthNotApplied")
)

response2.show()

+---+------------------+
|SID|NumMonthNotApplied|
+---+------------------+
|S10|                10|
|S11|                10|
|S12|                11|
|S13|                10|
|S14|                10|
|S15|                12|
+---+------------------+

