In [43]:
# Importiamo le librerie necessarie
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType, TimestampType
from pyspark.sql.functions import col, year, sum, to_timestamp, count, expr, max, count_distinct, expr, when, avg, min, month, floor, lag
from pyspark.sql import Window
from pyspark.sql.dataframe import DataFrame
import numpy as np

# df_updated = df.withColumns({
#     "Country": when((col("Country") == "America") & (col("Population") > 10000), "North America").otherwise(col("Country")),
#     "Region": when((col("Country") == "America") & (col("Population") > 10000), "NA").otherwise(col("Region"))
# })

# df_updated = df.withColumn(
#     "Country",
#     when(col("Country") == "America", 
#          when(col("Population") > 10000, "North America")
#          .when(col("Population") > 5000, "Central America")
#          .otherwise("South America"))
#     .otherwise(col("Country"))
# )

# df_updated = df.withColumn(
#     "Country",
#     expr("CASE WHEN Country = 'America' AND Population > 10000 THEN 'North America' ELSE Country END")
# )

# CASE 
#     WHEN Country = 'America' AND Population > 10000 THEN 'North America' 
#     WHEN Country = 'America' AND Population > 5000 THEN 'Central America'
#     WHEN Country = 'America' THEN 'South America' 
#     ELSE Country
# END

# response2 = (
#     monthly_water_consumption
#     .withColumn("Year", year(col("Month")))
#     .groupBy(col("HID"), col("Year"))
#     .agg(sum("M3").alias("AnnualM3"))
#     .withColumn("PreviousAnnualM3", lag("AnnualM3").over(
#         Window
#         .partitionBy("HID")
#         .orderBy(col("Year"))
#     ))
#     .filter(col("PreviousAnnualM3") > col("AnnualM3"))
# )

# .withColumn(
#     "HighNumberOfCitiesForCountry",
#     when(col("HighNumberOfCitiesForCountry").isNull(), 0)
#     .otherwise(col("HighNumberOfCitiesForCountry"))
# )
    


# Obtain the SparkSession: getOrCreate() returns the active session if one
# exists, otherwise it builds a new one with this application name.
ss: SparkSession = (
    SparkSession.builder
    .appName("PoliSalesAnalysis")
    .getOrCreate()
)

# Input/output location prefixes.
# When `jupyter` is True the absolute "/user/s339450/..." paths are used
# (presumably the cluster home directory -- TODO confirm); otherwise the
# Windows-style relative ".\data\" and ".\out\" folders are used.
jupyter = False
input_prefix, output_prefix = (
    ("/user/s339450/esami/20240912/", "/user/s339450/esami/20240912/out/")
    if jupyter
    else (".\\data\\", ".\\out\\")
)

# Full paths of the two input files and of the per-question output folders.
houses_path = input_prefix + "Houses.txt"
daily_power_consumption_path = input_prefix + "DailyPowerConsumption.txt"
output_folder_1 = output_prefix + "1/"
output_folder_2 = output_prefix + "2/"

In [44]:
# Explicit schema for the header-less, comma-separated houses file:
# HouseID, City, Country, SizeSQM.
houses_schema = StructType(
    [
        StructField("HouseID", StringType(), False),
        StructField("City", StringType(), False),
        StructField("Country", StringType(), False),
        StructField("SizeSQM", IntegerType(), False),
    ]
)

# Read the file as CSV with the schema above (no header row, "," separator).
houses: DataFrame = (
    ss.read
    .format("csv")
    .option("header", False)
    .option("sep", ",")
    .schema(houses_schema)
    .load(houses_path)
)

# Preview the loaded rows.
houses.show()

# Explicit schema for the header-less, comma-separated consumption file.
# "Date" is read as plain text first and parsed right after loading.
daily_power_consumption_schema = StructType(
    [
        StructField("HouseID", StringType(), False),
        StructField("Date", StringType(), False),
        StructField("kWh", IntegerType(), False),
    ]
)

# Read the file as CSV and, in the same expression, replace the textual
# "Date" column with a timestamp parsed using the "yyyy/MM/dd" pattern.
daily_power_consumption: DataFrame = (
    ss.read
    .format("csv")
    .option("header", False)
    .option("sep", ",")
    .schema(daily_power_consumption_schema)
    .load(daily_power_consumption_path)
    .withColumn("Date", to_timestamp(col("Date"), "yyyy/MM/dd"))
)

# Preview the loaded rows.
daily_power_consumption.show()

+--------+-----------+-----------+-------+
| HouseID|       City|    Country|SizeSQM|
+--------+-----------+-----------+-------+
|House101|   New York|        USA|    150|
|House102|      Turin|      Italy|    120|
|House103|      Paris|     France|     80|
|House104|      Tokyo|      Japan|    100|
|House105|     Berlin|    Germany|     90|
|House106|     Sydney|  Australia|    200|
|House107|     London|         UK|    110|
|House108|   New York|        USA|    180|
|House109|     Madrid|      Spain|    130|
|House110|  Amsterdam|Netherlands|    140|
|House111|Los Angeles|        USA|    170|
|House112|Los Angeles|        USA|    170|
+--------+-----------+-----------+-------+

+--------+-------------------+---+
| HouseID|               Date|kWh|
+--------+-------------------+---+
|House101|2022-12-21 00:00:00| 15|
|House101|2022-12-22 00:00:00| 18|
|House102|2022-12-21 00:00:00| 12|
|House102|2022-12-22 00:00:00| 13|
|House103|2022-12-21 00:00:00| 10|
|House103|2022-12-22 00:00:00| 

# Punto 1

In [45]:
# Countries that have at least one house whose average daily consumption
# over the 2022 readings is greater than 10 kWh.
country_with_high_power_consumption = (
    daily_power_consumption
    # Keep only the readings taken in 2022.
    .filter(year(col("Date")) == 2022)
    # Attach the house attributes (inner join: readings of unknown houses are dropped).
    .join(houses, on="HouseID")
    # Average daily consumption per house; the aggregate is aliased directly
    # instead of renaming Spark's auto-generated "avg(kWh)" column afterwards.
    .groupBy("HouseID", "Country")
    .agg(avg("kWh").alias("DailyKWh"))
    .filter(col("DailyKWh") > 10)
    # One row per country; replaces the undocumented groupBy(...).agg({}) trick.
    .select("Country")
    .distinct()
)

# country_with_high_power_consumption.show()

# Answer to point 1: countries that appear in `houses` but NOT in
# `country_with_high_power_consumption`.
response1 = (
    houses
    # One row per country; replaces the undocumented groupBy(...).agg({}) trick.
    .select("Country")
    .distinct()
    # Left anti join: keep only the countries with no match on the right side.
    .join(
        country_with_high_power_consumption,
        on="Country",
        how="left_anti",
    )
)

response1.show()

+-------+
|Country|
+-------+
|Germany|
| France|
+-------+



# Punto 2

In [59]:
# For each country, count the cities that have more than one house whose
# average daily consumption over the 2022 readings is greater than 10 kWh.
# Countries with no such city are absent from this intermediate result.
response2 = (
    daily_power_consumption
    # Keep only the readings taken in 2022.
    .filter(year(col("Date")) == 2022)
    .join(houses, on="HouseID")
    # Average daily consumption per house. Aggregates are aliased directly
    # rather than renaming Spark's auto-generated "avg(kWh)" / "count(1)" columns.
    .groupBy("HouseID", "City", "Country")
    .agg(avg("kWh").alias("DailyKWh"))
    .filter(col("DailyKWh") > 10)
    # Number of high-consumption houses per (city, country).
    .groupBy("City", "Country")
    .agg(count("*").alias("HighHouseForCityCount"))
    .filter(col("HighHouseForCityCount") > 1)
    # Number of qualifying cities per country.
    .groupBy("Country")
    .agg(count("*").alias("HighNumberOfCitiesForCountry"))
)

# Answer to point 2: every country found in `houses`, with its number of
# qualifying cities (0 when the country has none).
response2 = (
    houses
    # One row per country; replaces the undocumented groupBy(...).agg({}) trick.
    .select("Country")
    .distinct()
    # Left join keeps countries that have no row in the per-country counts.
    .join(
        response2,
        on="Country",
        how="left",
    )
    # Countries without a match get a NULL count: replace it with 0.
    # (The previous when(col(...), 0) used the numeric column itself as the
    # WHEN condition instead of testing it for NULL.)
    .fillna(0, subset=["HighNumberOfCitiesForCountry"])
)

response2.show()

+-----------+----------------------------+
|    Country|HighNumberOfCitiesForCountry|
+-----------+----------------------------+
|    Germany|                           0|
|     France|                           0|
|      Italy|                           0|
|      Spain|                           0|
|        USA|                           2|
|         UK|                           0|
|      Japan|                           0|
|  Australia|                           0|
|Netherlands|                           0|
+-----------+----------------------------+

