In [28]:
# Importiamo le librerie necessarie
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType, TimestampType, NullType
from pyspark.sql.functions import col, year, sum, to_timestamp, count, expr, max, count_distinct, expr, when, avg, min, month, floor, lag, desc
from pyspark.sql import Window
from pyspark.sql.dataframe import DataFrame
import numpy as np

# df_updated = df.withColumns({
#     "Country": when((col("Country") == "America") & (col("Population") > 10000), "North America").otherwise(col("Country")),
#     "Region": when((col("Country") == "America") & (col("Population") > 10000), "NA").otherwise(col("Region"))
# })

# df_updated = df.withColumn(
#     "Country",
#     when(col("Country") == "America", 
#          when(col("Population") > 10000, "North America")
#          .when(col("Population") > 5000, "Central America")
#          .otherwise("South America"))
#     .otherwise(col("Country"))
# )

# df_updated = df.withColumn(
#     "Country",
#     expr("CASE WHEN Country = 'America' AND Population > 10000 THEN 'North America' ELSE Country END")
# )

# CASE 
#     WHEN Country = 'America' AND Population > 10000 THEN 'North America' 
#     WHEN Country = 'America' AND Population > 5000 THEN 'Central America'
#     WHEN Country = 'America' THEN 'South America' 
#     ELSE Country
# END

# response2 = (
#     monthly_water_consumption
#     .withColumn("Year", year(col("Month")))
#     .groupBy(col("HID"), col("Year"))
#     .agg(sum("M3").alias("AnnualM3"))
#     .withColumn("PreviousAnnualM3", lag("AnnualM3").over(
#         Window
#         .partitionBy("HID")
#         .orderBy(col("Year"))
#     ))
#     .filter(col("PreviousAnnualM3") > col("AnnualM3"))
# )

# .withColumn(
#     "HighNumberOfCitiesForCountry",
#     when(col("HighNumberOfCitiesForCountry").isNull(), 0)
#     .otherwise(col("HighNumberOfCitiesForCountry"))
# )
    


# Reuse the active SparkSession if there is one, otherwise create it.
ss: SparkSession = (
    SparkSession.builder
    .appName("PoliSalesAnalysis")
    .getOrCreate()
)

# Input/output locations.
# `jupyter` picks the path prefixes: True selects the absolute /user/... folders,
# False selects the relative .\data\ and .\out\ folders.
jupyter = False
input_prefix, output_prefix = (
    ("/user/s339450/esami/20240912/", "/user/s339450/esami/20240912/out/")
    if jupyter
    else (".\\data\\", ".\\out\\")
)

# Concrete paths derived from the prefixes above.
patches_path = input_prefix + "Patches.txt"
output_folder_1 = output_prefix + "1/"
output_folder_2 = output_prefix + "2/"


In [29]:
# Patches.txt is a header-less CSV with four fields, all read as strings first.
patches_schema = StructType([
    StructField(field_name, StringType(), False)
    for field_name in ("PID", "Date", "ApplicationName", "BriefDescription")
])

# Read the file and, in the same chain, turn the textual Date (yyyy/MM/dd)
# into a timestamp so that year()/month() can be applied to it later on.
patches: DataFrame = (
    ss.read
    .format("csv")
    .schema(patches_schema)
    .option("header", False)
    .option("sep", ",")
    .load(patches_path)
    .withColumn("Date", to_timestamp(col("Date"), "yyyy/MM/dd"))
)

patches.show()



+-------+-------------------+---------------+--------------------+
|    PID|               Date|ApplicationName|    BriefDescription|
+-------+-------------------+---------------+--------------------+
|PID7000|2017-01-05 00:00:00|     Windows 10|      Security patch|
|PID7001|2017-01-12 00:00:00|     Windows 10|             Bug fix|
|PID7002|2017-01-20 00:00:00|   Ubuntu 18.04|  Performance update|
|PID7003|2017-01-25 00:00:00|     Windows 10|       Kernel update|
|PID7004|2017-02-10 00:00:00|     Windows 10|      Security patch|
|PID7005|2017-02-15 00:00:00|   Ubuntu 18.04|             Bug fix|
|PID7006|2017-02-20 00:00:00|   Ubuntu 18.04|       Kernel update|
|PID7050|2017-02-10 00:00:00|     Windows 10|     Security patch2|
|PID7007|2017-03-02 00:00:00|     Windows 10|       Driver update|
|PID7008|2017-03-08 00:00:00|   Ubuntu 18.04|      Security patch|
|PID7009|2017-03-15 00:00:00|   Ubuntu 18.04| System optimization|
|PID7010|2017-03-22 00:00:00|   Ubuntu 18.04|             Bug 

# Punto 1

In [36]:
# Point 1: for every month of 2017 tell whether "Windows 10" (W) or
# "Ubuntu 18.04" (U) received more patches; months with a tie are left out.
#
# The two counts are computed side by side with conditional aggregation, so a
# month in which only one of the two applications was patched is still
# reported (the other application simply counts 0). Pairing the rows with
# lag() dropped those months, because a lone row has no neighbour to compare to.
#
# Output columns are kept as before: every row describes Windows 10
# (NumPatches) against Ubuntu 18.04 (NumPatchesOtherSoftware).
response1 = (
    patches
    .filter(
        col("ApplicationName").isin("Windows 10", "Ubuntu 18.04")
        & (year(col("Date")) == 2017)
    )
    .withColumn("Month", month(col("Date")))
    .groupBy("Month")
    .agg(
        # `sum` here is pyspark.sql.functions.sum (it shadows the builtin).
        sum(when(col("ApplicationName") == "Windows 10", 1).otherwise(0))
        .alias("NumPatches"),
        sum(when(col("ApplicationName") == "Ubuntu 18.04", 1).otherwise(0))
        .alias("NumPatchesOtherSoftware"),
    )
    # Ties are not part of the answer.
    .filter(col("NumPatches") != col("NumPatchesOtherSoftware"))
    .withColumn(
        "Result",
        when(col("NumPatches") > col("NumPatchesOtherSoftware"), "W").otherwise("U")
    )
    # Constant label, retained only so the column layout matches the previous output.
    .withColumn("ApplicationName", expr("'Windows 10'"))
    .select("ApplicationName", "Month", "NumPatches", "NumPatchesOtherSoftware", "Result")
    .sort("Month")
)

response1.show()

+---------------+-----+----------+-----------------------+------+
|ApplicationName|Month|NumPatches|NumPatchesOtherSoftware|Result|
+---------------+-----+----------+-----------------------+------+
|     Windows 10|    1|         3|                      1|     W|
|     Windows 10|    3|         1|                      3|     U|
+---------------+-----+----------+-----------------------+------+



In [41]:
# Point 2: (application, month) pairs of 2017 such that the application had at
# least 2 patches both in that month and in the month right after it.

# Range frame over the Month value: the current month and Month + 1, per application.
this_and_next_month = (
    Window
    .partitionBy("ApplicationName")
    .orderBy("Month")
    .rangeBetween(0, 1)
)

response2 = (
    patches
    .filter(year(col("Date")) == 2017)
    .withColumn("Month", month(col("Date")))
    .groupBy("ApplicationName", "Month")
    .agg(count("*").alias("NumPatches"))
    .withColumn(
        "MinNumPatchesFromThisMonthToTheNext",
        min("NumPatches").over(this_and_next_month)
    )
    # Months without patches have no row at all. If the following month is
    # missing (or the current month is December) the frame holds only the
    # current row and the minimum above would just be the month's own count,
    # so the frame must be checked to really contain two months.
    .withColumn("MonthsInWindow", count("*").over(this_and_next_month))
    .filter(
        (col("MonthsInWindow") == 2)
        & (col("MinNumPatchesFromThisMonthToTheNext") >= 2)
    )
    .drop("MonthsInWindow")
    # Sort last: an ordering applied before the window step is not guaranteed to survive it.
    .sort("ApplicationName", "Month")
)

response2.show()

+---------------+-----+----------+-----------------------------------+
|ApplicationName|Month|NumPatches|MinNumPatchesFromThisMonthToTheNext|
+---------------+-----+----------+-----------------------------------+
|   Ubuntu 18.04|    2|         2|                                  2|
|     Windows 10|    1|         3|                                  2|
+---------------+-----+----------+-----------------------------------+

