In [27]:
# Importiamo le librerie necessarie
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType, TimestampType
from pyspark.sql.functions import col, year, sum, to_timestamp, count, expr, max, count_distinct, expr, when, avg, min, month, floor, lag, desc
from pyspark.sql import Window
from pyspark.sql.dataframe import DataFrame
import numpy as np

# df_updated = df.withColumns({
#     "Country": when((col("Country") == "America") & (col("Population") > 10000), "North America").otherwise(col("Country")),
#     "Region": when((col("Country") == "America") & (col("Population") > 10000), "NA").otherwise(col("Region"))
# })

# df_updated = df.withColumn(
#     "Country",
#     when(col("Country") == "America", 
#          when(col("Population") > 10000, "North America")
#          .when(col("Population") > 5000, "Central America")
#          .otherwise("South America"))
#     .otherwise(col("Country"))
# )

# df_updated = df.withColumn(
#     "Country",
#     expr("CASE WHEN Country = 'America' AND Population > 10000 THEN 'North America' ELSE Country END")
# )

# CASE 
#     WHEN Country = 'America' AND Population > 10000 THEN 'North America' 
#     WHEN Country = 'America' AND Population > 5000 THEN 'Central America'
#     WHEN Country = 'America' THEN 'South America' 
#     ELSE Country
# END

# response2 = (
#     monthly_water_consumption
#     .withColumn("Year", year(col("Month")))
#     .groupBy(col("HID"), col("Year"))
#     .agg(sum("M3").alias("AnnualM3"))
#     .withColumn("PreviousAnnualM3", lag("AnnualM3").over(
#         Window
#         .partitionBy("HID")
#         .orderBy(col("Year"))
#     ))
#     .filter(col("PreviousAnnualM3") > col("AnnualM3"))
# )

# .withColumn(
#     "HighNumberOfCitiesForCountry",
#     when(col("HighNumberOfCitiesForCountry").isNull(), 0)
#     .otherwise(col("HighNumberOfCitiesForCountry"))
# )
    


# Create the Spark session, or reuse the one already active in this kernel
# (getOrCreate does both).
ss: SparkSession = SparkSession.builder.appName("PoliSalesAnalysis").getOrCreate()

# Input/output locations.
# Set `jupyter` to True to use the absolute /user/... paths (presumably the
# remote Jupyter/HDFS environment -- confirm); leave it False to work with
# folders relative to the current working directory.
jupyter = False
if jupyter:
    input_prefix = "/user/s339450/esami/20240912/"
    output_prefix = "/user/s339450/esami/20240912/out/"
else:
    # Forward slashes keep the relative paths usable on Linux/macOS too;
    # Hadoop's path handling accepts them on Windows as well.
    input_prefix = "./data/"
    output_prefix = "./out/"

# Input files.
failures_path = f"{input_prefix}Failures.txt"
production_plans_path = f"{input_prefix}ProductionPlans.txt"
robots_path = f"{input_prefix}Robots.txt"

# Output folders, one per exercise point.
output_folder_1 = f"{output_prefix}1/"
output_folder_2 = f"{output_prefix}2/"

In [28]:
def _read_headerless_csv(path: str, schema: StructType) -> DataFrame:
    """Read a comma-separated file without a header row using an explicit schema.

    Args:
        path: Location of the file to read.
        schema: Column names and types to apply to the file.

    Returns:
        A DataFrame with the columns declared in ``schema``.
    """
    return ss.read.load(
        path,
        format="csv",
        header=False,
        schema=schema,
        sep=",",
    )


# --- Failures: one row per failure event (RID, FailureTypeCode, Date, Time) ---
# Date and Time are read as plain strings and parsed explicitly below.
failures_schema = StructType([
    StructField("RID", StringType(), False),
    StructField("FailureTypeCode", StringType(), False),
    StructField("Date", StringType(), False),
    StructField("Time", StringType(), False),
])

# Parse both columns in a single projection. The time-of-day column is read
# through its declared name ("Time"), so the lookup does not rely on Spark's
# case-insensitive column resolution; it is still exposed as "time", which is
# the name the original conversion produced.
failures: DataFrame = _read_headerless_csv(failures_path, failures_schema).select(
    "RID",
    "FailureTypeCode",
    # "yyyy/MM/dd" -> timestamp at midnight of that day.
    to_timestamp(col("Date"), "yyyy/MM/dd").alias("Date"),
    # "HH:mm:ss" -> timestamp; with no date in the pattern the date part
    # shows up as 1970-01-01 in the output.
    to_timestamp(col("Time"), "HH:mm:ss").alias("time"),
)

failures.show()

# --- Production plants: PlantID, City, Country ---
production_plans_schema = StructType([
    StructField("PlantID", StringType(), False),
    StructField("City", StringType(), False),
    StructField("Country", StringType(), False),
])

production_plans: DataFrame = _read_headerless_csv(
    production_plans_path, production_plans_schema
)

production_plans.show()

# --- Robots: RID, the PlantID the robot belongs to, and its IP ---
robots_schema = StructType([
    StructField("RID", StringType(), False),
    StructField("PlantID", StringType(), False),
    StructField("IP", StringType(), False)
])

robots: DataFrame = _read_headerless_csv(robots_path, robots_schema)

robots.show()

+---+---------------+-------------------+-------------------+
|RID|FailureTypeCode|               Date|               time|
+---+---------------+-------------------+-------------------+
|R15|       FCode122|2020-05-01 00:00:00|1970-01-01 06:40:51|
|R15|       FCode122|2020-05-02 00:00:00|1970-01-01 07:00:00|
|R16|       FCode200|2020-06-15 00:00:00|1970-01-01 12:30:25|
|R17|       FCode122|2020-07-20 00:00:00|1970-01-01 14:10:10|
|R18|       FCode300|2020-08-25 00:00:00|1970-01-01 18:50:05|
|R19|       FCode122|2020-09-30 00:00:00|1970-01-01 20:15:30|
|R19|       FCode122|2020-09-30 00:00:00|1970-01-01 20:15:30|
|R20|       FCode122|2020-09-30 00:00:00|1970-01-01 20:15:30|
+---+---------------+-------------------+-------------------+

+-------+-------+-------+
|PlantID|   City|Country|
+-------+-------+-------+
|   PID1|  Turin|  Italy|
|   PID2| Munich|Germany|
|   PID3|Detroit|    USA|
|   PID4|  Tokyo|  Japan|
|   PID5|  Paris| France|
+-------+-------+-------+

+---+-------+-------

# Punto 1

In [29]:
# Point 1: IDs of the plants that have at least one robot with more than one
# failure recorded in 2020.
response1 = (
    failures
    # Keep only the failures whose date falls in 2020.
    .filter(year(col("Date")) == 2020)
    # Attach the plant each failing robot belongs to.
    .join(robots, "RID")
    # Count the 2020 failures of every robot; the count gets an explicit name
    # instead of relying on the auto-generated "count(1)" column.
    .groupBy("RID", "PlantID")
    .agg(count("*").alias("NumFailures"))
    .filter(col("NumFailures") > 1)
    # A plant may have several qualifying robots: report each plant once.
    .select("PlantID")
    .distinct()
)

response1.show()

+-------+
|PlantID|
+-------+
|   PID1|
|   PID5|
+-------+



# Punto 2

In [34]:
# Point 2: for every plant listed in production_plans, the number of its robots
# that had more than one failure in 2020 (0 when the plant has none).
response2 = (
    failures
    # Keep only the failures whose date falls in 2020.
    .filter(year(col("Date")) == 2020)
    # Attach the plant each failing robot belongs to.
    .join(robots, "RID")
    # Failures per robot in 2020, explicitly named.
    .groupBy("RID", "PlantID")
    .agg(count("*").alias("NumFailures"))
    .filter(col("NumFailures") > 1)
    # Each remaining row is one qualifying robot: count them per plant.
    .groupBy("PlantID")
    .agg(count("*").alias("NumOfRobots"))
    # Right join so plants without any qualifying robot are kept...
    .join(production_plans, "PlantID", "right")
    # ...and their missing count is reported as 0 instead of null.
    .fillna(0, subset=["NumOfRobots"])
    .select("PlantID", "NumOfRobots")
)

response2.show()


+-------+-----------+
|PlantID|NumOfRobots|
+-------+-----------+
|   PID1|          1|
|   PID2|          0|
|   PID3|          0|
|   PID4|          0|
|   PID5|          1|
+-------+-----------+

