In [17]:
# Importiamo le librerie necessarie
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType, TimestampType
from pyspark.sql.functions import col, year, sum, to_timestamp, count, expr, max, count_distinct, expr, when, avg, min, month, floor, lag, desc
from pyspark.sql import Window
from pyspark.sql.dataframe import DataFrame
import numpy as np

# df_updated = df.withColumns({
#     "Country": when((col("Country") == "America") & (col("Population") > 10000), "North America").otherwise(col("Country")),
#     "Region": when((col("Country") == "America") & (col("Population") > 10000), "NA").otherwise(col("Region"))
# })

# df_updated = df.withColumn(
#     "Country",
#     when(col("Country") == "America", 
#          when(col("Population") > 10000, "North America")
#          .when(col("Population") > 5000, "Central America")
#          .otherwise("South America"))
#     .otherwise(col("Country"))
# )

# df_updated = df.withColumn(
#     "Country",
#     expr("CASE WHEN Country = 'America' AND Population > 10000 THEN 'North America' ELSE Country END")
# )

# CASE 
#     WHEN Country = 'America' AND Population > 10000 THEN 'North America' 
#     WHEN Country = 'America' AND Population > 5000 THEN 'Central America'
#     WHEN Country = 'America' THEN 'South America' 
#     ELSE Country
# END

# response2 = (
#     monthly_water_consumption
#     .withColumn("Year", year(col("Month")))
#     .groupBy(col("HID"), col("Year"))
#     .agg(sum("M3").alias("AnnualM3"))
#     .withColumn("PreviousAnnualM3", lag("AnnualM3").over(
#         Window
#         .partitionBy("HID")
#         .orderBy(col("Year"))
#     ))
#     .filter(col("PreviousAnnualM3") > col("AnnualM3"))
# )

# .withColumn(
#     "HighNumberOfCitiesForCountry",
#     when(col("HighNumberOfCitiesForCountry").isNull(), 0)
#     .otherwise(col("HighNumberOfCitiesForCountry"))
# )
    


# Create the Spark session for this notebook, or reuse the one that already exists.
ss: SparkSession = SparkSession.builder.appName("PoliSalesAnalysis").getOrCreate()

# Input/output locations.
# `jupyter` selects the prefix pair: the /user/... paths when True,
# the local Windows-style relative folders when False.
jupyter = False
input_prefix, output_prefix = (
    ("/user/s339450/esami/20240912/", "/user/s339450/esami/20240912/out/")
    if jupyter
    else (".\\data\\", ".\\out\\")
)

# Input files.
companies_path = input_prefix + "Companies.txt"
daily_power_consumption_path = input_prefix + "DailyPowerConsumption.txt"
data_centers_path = input_prefix + "DataCenters.txt"

# One output folder per exercise point.
output_folder_1 = output_prefix + "1/"
output_folder_2 = output_prefix + "2/"

In [2]:
# Explicit schema for Companies.txt: three string columns, declared non-nullable.
companies_schema = (
    StructType()
    .add("CodC", StringType(), False)
    .add("CompanyName", StringType(), False)
    .add("Headquarters-Country", StringType(), False)
)

# Headerless, comma-separated file read with the schema above.
companies: DataFrame = (
    ss.read
    .format("csv")
    .schema(companies_schema)
    .options(header=False, sep=",")
    .load(companies_path)
)

companies.show()

# Explicit schema for DataCenters.txt: five string columns, declared non-nullable.
data_centers_schema = (
    StructType()
    .add("CodDC", StringType(), False)
    .add("CodC", StringType(), False)
    .add("City", StringType(), False)
    .add("Country", StringType(), False)
    .add("Continent", StringType(), False)
)

# Headerless, comma-separated file read with the schema above.
data_centers: DataFrame = (
    ss.read
    .format("csv")
    .schema(data_centers_schema)
    .options(header=False, sep=",")
    .load(data_centers_path)
)

data_centers.show()

# Explicit schema for DailyPowerConsumption.txt. "Date" is read as a plain
# string and parsed right after loading; "kWh" is read as an integer.
daily_power_consumption_schema = (
    StructType()
    .add("CodDC", StringType(), False)
    .add("Date", StringType(), False)
    .add("kWh", IntegerType(), False)
)

# Load the headerless, comma-separated file and replace the textual "Date"
# column with a timestamp parsed using the yyyy/MM/dd pattern
# (adjust the pattern if the file uses a different date layout).
daily_power_consumption: DataFrame = (
    ss.read
    .format("csv")
    .schema(daily_power_consumption_schema)
    .options(header=False, sep=",")
    .load(daily_power_consumption_path)
    .withColumn("Date", to_timestamp(col("Date"), "yyyy/MM/dd"))
)

daily_power_consumption.show()

+----+-----------+--------------------+
|CodC|CompanyName|Headquarters-Country|
+----+-----------+--------------------+
| C12| Databricks|United States of ...|
| C13|     Google|United States of ...|
| C14|  Microsoft|United States of ...|
| C15|     Amazon|United States of ...|
| C16|    Alibaba|               China|
| C17|   Facebook|United States of ...|
| C18|    Twitter|United States of ...|
| C19|      Apple|United States of ...|
| C20|     Oracle|United States of ...|
| C21|        IBM|United States of ...|
+----+-----------+--------------------+

+-----+----+-------------+-------+-------------+
|CodDC|CodC|         City|Country|    Continent|
+-----+----+-------------+-------+-------------+
| DC21| C12|         Nice| France|       Europe|
| DC22| C13|Mountain View|    USA|North America|
| DC23| C14|      Redmond|    USA|North America|
| DC24| C15|      Seattle|    USA|North America|
| DC25| C16|     Hangzhou|  China|         Asia|
| DC26| C17|   Menlo Park|    USA|North America

# Punto 1

In [11]:
# Point 1: dates on which at least 80% of the consumption records
# report more than 1000 kWh.

# Total number of consumption records per date.
# The aggregate is aliased directly instead of renaming Spark's auto-generated
# "count(1)" column: withColumnRenamed silently does nothing when the source
# name does not match, which would only surface later as a missing column.
response1 = (
    daily_power_consumption
    .groupBy("Date")
    .agg(count("*").alias("TotNumDC"))
)

# Number of records per date whose consumption exceeds 1000 kWh.
response2 = (
    daily_power_consumption
    .filter("kWh > 1000")
    .groupBy("Date")
    .agg(count("*").alias("NumDC"))
)

# Keep the dates where the high-consumption share is at least 80%.
# The inner join drops dates with no record above 1000 kWh; their share
# would be 0, so they could never pass the filter anyway.
# Column names are spelled exactly as defined above so the query does not
# depend on case-insensitive column resolution.
response3 = (
    response2
    .join(response1, "Date")
    .filter(col("NumDC") / col("TotNumDC") >= 0.8)
    .select("Date")
)

response3.show()

+-------------------+
|               Date|
+-------------------+
|2021-01-12 00:00:00|
|2020-01-13 00:00:00|
|2021-01-15 00:00:00|
+-------------------+



# Punto 2

In [36]:
# Point 2: the continent with the highest average 2021 consumption per data
# center, together with the continent hosting the most data centers.
#
# Both intermediate results stay as one-row DataFrames (limit(1)) instead of
# being collected to the driver with head() and rebuilt with createDataFrame:
# this avoids two eager jobs plus a Python round-trip, and an empty input now
# yields an empty result instead of failing on createDataFrame([None, None]).
# NOTE(review): as with head(), ties for the top position are broken
# arbitrarily; confirm whether all tied continents should be returned.

# Continent with the highest (total 2021 kWh / distinct data centers that
# have 2021 records). Aggregates are aliased explicitly rather than referenced
# through Spark's auto-generated column names.
response1 = (
    daily_power_consumption
    .filter(year(col("Date")) == 2021)
    .join(data_centers, "CodDC")
    .groupBy("Continent")
    .agg(
        sum(col("kWh")).alias("TotKWh"),
        count_distinct(col("CodDC")).alias("NumDCs"),
    )
    .withColumn("AvgKWh", col("TotKWh") / col("NumDCs"))
    .sort(desc("AvgKWh"))
    .select("Continent")
    .limit(1)
)

# Continent with the largest number of data centers.
response2 = (
    data_centers
    .groupBy("Continent")
    .agg(count("*").alias("NumOfDCs"))
    .sort(desc("NumOfDCs"))
    .select("Continent")
    .limit(1)
)

# Both inputs expose a single "Continent" column, so a positional union is safe.
df_final: DataFrame = response1.union(response2)

# distinct() collapses the two rows into one when both criteria select the
# same continent.
df_final.distinct().show()



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Big data processing and analytics\.venv\Lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Big data processing and analytics\.venv\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\marco\AppData\Local\Programs\Python\Python311\Lib\socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 