In [23]:
spark.stop()

In [1]:
#Campania

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pyproj

# Create a SparkSession
spark = SparkSession.builder.appName("timeseries").config('spark.sql.shuffle.partitions',500).config('spark.driver.maxResultSize', '10G') .config("spark.driver.memory", "32g").config("spark.executor.memory", "16g").config("spark.task.maxFailures", "10").config("spark.executor.instances", "16").config("spark.local.dir", "/afs/enea.it/por/user/nafis/PFS/tmp").getOrCreate()


# Load the data from the CSV file


schema = StructType(
   [StructField('x', DoubleType(), True),
    StructField('y', DoubleType(), True),
    StructField('z', DoubleType(), True),
    StructField('time', TimestampType(), True),
    StructField('c_PM25', DoubleType(), True),
    StructField('c_PM10', DoubleType(), True),
    StructField('c_O3', DoubleType(), True),
    StructField('c_NO2', DoubleType(), True),
    StructField('geometry', StringType(), True),
    StructField('index_right', IntegerType(), True),
    StructField('OBJECTID', IntegerType(), True),
    StructField('COD_REG', IntegerType(), True),
    StructField('COD_PRO', IntegerType(), True),
    StructField('NOME', StringType(), True),
    StructField('SIGLA', StringType(), True),
    StructField('Shape_Leng', DoubleType(), True),
    StructField('Shape_Area', DoubleType(), True),
   ]
  )


df = spark.read.format("csv").load("/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/data_chunk/campania_prov/", header=True, schema=schema)

# Select the columns in the desired order
df = df.select( 'x', 'y', 'z', 'time', 'c_PM25', 'c_PM10', 'c_O3', 'c_NO2', 'geometry', 'NOME')


df = df.withColumn("x", df["x"].cast(DoubleType())) \
    .withColumn("y", df["y"].cast(DoubleType())) \
    .withColumn("z", df["z"].cast(DoubleType())) \
    .withColumn("time", df["time"].cast(TimestampType())) \
    .withColumn("c_PM25", df["c_PM25"].cast(DoubleType())) \
    .withColumn("c_PM10", df["c_PM10"].cast(DoubleType())) \
    .withColumn("c_O3", df["c_O3"].cast(DoubleType())) \
    .withColumn("c_NO2", df["c_NO2"].cast(DoubleType())) \
    .withColumn("geometry", df["geometry"].cast(StringType()))\
    .withColumn("NOME", df["NOME"].cast(StringType()))

df = df.withColumnRenamed("c_PM25", "c_PM2_5")

df.show()

df.printSchema()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/06/26 22:38:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/26 22:38:03 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/06/26 22:38:06 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


[Stage 0:>                                                          (0 + 1) / 1]

+--------+---------+----+-------------------+------------------+------------------+------------------+------------------+--------------------+-------+
|       x|        y|   z|               time|           c_PM2_5|            c_PM10|              c_O3|             c_NO2|            geometry|   NOME|
+--------+---------+----+-------------------+------------------+------------------+------------------+------------------+--------------------+-------+
|902000.0|4576000.0|20.0|2021-05-21 00:00:00|2.5389232999999995|         2.9607124|         102.89293|         2.4104354|POINT (902000 457...|Caserta|
|902000.0|4576000.0|20.0|2021-05-21 01:00:00|         2.5921166|         2.9856164| 98.94353000000001|2.6902790000000003|POINT (902000 457...|Caserta|
|902000.0|4576000.0|20.0|2021-05-21 02:00:00|2.5095107999999997|2.8129014999999997|102.21184000000001|2.9086206000000003|POINT (902000 457...|Caserta|
|902000.0|4576000.0|20.0|2021-05-21 03:00:00|         2.4757087|         2.7655613|          1

                                                                                

In [2]:
from pyspark.sql import functions as F
import pandas as pd

data = df.withColumnRenamed("time", "date_time")


# Convert the "date_time" column to a datetime object
data = data.withColumn("date_time", F.to_timestamp("date_time"))

# Extract the date and time values and store them in separate columns
data = data.withColumn("date", F.date_trunc("day", "date_time"))
data = data.withColumn("time", F.date_trunc("hour", "date_time"))

data = data.withColumnRenamed("date_time", "original_date_time")

# data.show()

In [3]:
from pyspark.sql.functions import col, when

# Create the season column based on the month of the date_column
data = data.withColumn(
    "season",
    when(col("original_date_time").between("2019-01-01 00:00:00", "2019-03-19 23:00:00") | col("original_date_time").between("2019-12-21 00:00:00", "2020-03-19 23:00:00") | col("original_date_time").between("2020-12-21 00:00:00", "2021-03-19 23:00:00") | col("original_date_time").between("2021-12-21 00:00:00", "2022-03-19 23:00:00") | col("original_date_time").between("2022-12-21 00:00:00", "2023-03-19 23:00:00"), "Winter")
    .when(col("original_date_time").between("2019-03-20 00:00:00", "2019-06-20 23:00:00") | col("original_date_time").between("2020-03-20 00:00:00", "2020-06-20 23:00:00") | col("original_date_time").between("2021-03-20 00:00:00", "2021-06-20 23:00:00") | col("original_date_time").between("2022-03-20 00:00:00", "2022-06-20 23:00:00") | col("original_date_time").between("2023-03-20 00:00:00", "2023-06-20 23:00:00"), "Spring")
    .when(col("original_date_time").between("2019-06-21 00:00:00", "2019-09-21 23:00:00") | col("original_date_time").between("2020-06-21 00:00:00", "2020-09-21 23:00:00") | col("original_date_time").between("2021-06-21 00:00:00", "2021-09-21 23:00:00") | col("original_date_time").between("2022-06-21 00:00:00", "2022-09-21 23:00:00") | col("original_date_time").between("2023-06-21 00:00:00", "2023-09-21 23:00:00"), "Summer")
    .when(col("original_date_time").between("2019-09-22 00:00:00", "2019-12-20 23:00:00") | col("original_date_time").between("2020-09-22 00:00:00", "2020-12-20 23:00:00") | col("original_date_time").between("2021-09-22 00:00:00", "2021-12-20 23:00:00") | col("original_date_time").between("2022-09-22 00:00:00", "2022-12-20 23:00:00") | col("original_date_time").between("2023-09-22 00:00:00", "2023-12-20 23:00:00"), "Autumn")
)


# data.show()

In [4]:
dataset = data.select(col("c_PM2_5"), col("c_PM10"), col("c_O3"), col("c_NO2"))
dataset_stat = dataset.describe()
dataset_stat.show()



+-------+-----------------+------------------+------------------+------------------+
|summary|          c_PM2_5|            c_PM10|              c_O3|             c_NO2|
+-------+-----------------+------------------+------------------+------------------+
|  count|         26497200|          26497200|          26497200|          26497200|
|   mean|7.253877547295835| 9.881282553262267|  72.5298536256201|5.9930447874230035|
| stddev|7.244904460091311| 8.722953120667986|20.476166461887257|11.353874650098401|
|    min|     0.0040953453|       0.004988223|      9.947703E-31|        0.03262893|
|    max|        407.13245|424.41900000000004|          474.4772|         286.42676|
+-------+-----------------+------------------+------------------+------------------+



                                                                                

In [5]:
data_PM25 = data.select(col("original_date_time"),col("c_PM2_5"), col("x"), col("y"), col("NOME"))
data_PM25.show()

+-------------------+------------------+--------+---------+-------+
| original_date_time|           c_PM2_5|       x|        y|   NOME|
+-------------------+------------------+--------+---------+-------+
|2021-05-21 00:00:00|2.5389232999999995|902000.0|4576000.0|Caserta|
|2021-05-21 01:00:00|         2.5921166|902000.0|4576000.0|Caserta|
|2021-05-21 02:00:00|2.5095107999999997|902000.0|4576000.0|Caserta|
|2021-05-21 03:00:00|         2.4757087|902000.0|4576000.0|Caserta|
|2021-05-21 04:00:00|2.4431244999999997|902000.0|4576000.0|Caserta|
|2021-05-21 05:00:00|          2.602668|902000.0|4576000.0|Caserta|
|2021-05-21 06:00:00|         2.9900506|902000.0|4576000.0|Caserta|
|2021-05-21 07:00:00|         3.1773403|902000.0|4576000.0|Caserta|
|2021-05-21 08:00:00|2.9301939999999997|902000.0|4576000.0|Caserta|
|2021-05-21 09:00:00|2.4819592999999998|902000.0|4576000.0|Caserta|
|2021-05-21 10:00:00|1.9674328999999997|902000.0|4576000.0|Caserta|
|2021-05-21 11:00:00|         1.7692596|902000.0

In [6]:
data_PM10 = data.select(col("original_date_time"),col("c_PM10"), col("x"), col("y"), col("NOME"))
data_PM10.show()

+-------------------+------------------+--------+---------+-------+
| original_date_time|            c_PM10|       x|        y|   NOME|
+-------------------+------------------+--------+---------+-------+
|2021-05-21 00:00:00|         2.9607124|902000.0|4576000.0|Caserta|
|2021-05-21 01:00:00|         2.9856164|902000.0|4576000.0|Caserta|
|2021-05-21 02:00:00|2.8129014999999997|902000.0|4576000.0|Caserta|
|2021-05-21 03:00:00|         2.7655613|902000.0|4576000.0|Caserta|
|2021-05-21 04:00:00|          2.733494|902000.0|4576000.0|Caserta|
|2021-05-21 05:00:00|         2.9043045|902000.0|4576000.0|Caserta|
|2021-05-21 06:00:00|3.3243476999999997|902000.0|4576000.0|Caserta|
|2021-05-21 07:00:00|          3.527025|902000.0|4576000.0|Caserta|
|2021-05-21 08:00:00|3.2793129999999997|902000.0|4576000.0|Caserta|
|2021-05-21 09:00:00|         2.7998161|902000.0|4576000.0|Caserta|
|2021-05-21 10:00:00|         2.3240533|902000.0|4576000.0|Caserta|
|2021-05-21 11:00:00|         2.1361778|902000.0

In [7]:
data_NO2 = data.select(col("original_date_time"),col("c_NO2"), col("x"), col("y"), col("NOME"))
data_NO2.show()

+-------------------+------------------+--------+---------+-------+
| original_date_time|             c_NO2|       x|        y|   NOME|
+-------------------+------------------+--------+---------+-------+
|2021-05-21 00:00:00|         2.4104354|902000.0|4576000.0|Caserta|
|2021-05-21 01:00:00|2.6902790000000003|902000.0|4576000.0|Caserta|
|2021-05-21 02:00:00|2.9086206000000003|902000.0|4576000.0|Caserta|
|2021-05-21 03:00:00|         3.2857485|902000.0|4576000.0|Caserta|
|2021-05-21 04:00:00|            2.8583|902000.0|4576000.0|Caserta|
|2021-05-21 05:00:00|         3.1135564|902000.0|4576000.0|Caserta|
|2021-05-21 06:00:00|          4.102874|902000.0|4576000.0|Caserta|
|2021-05-21 07:00:00|          4.456797|902000.0|4576000.0|Caserta|
|2021-05-21 08:00:00|          3.982049|902000.0|4576000.0|Caserta|
|2021-05-21 09:00:00|         2.0289207|902000.0|4576000.0|Caserta|
|2021-05-21 10:00:00|        0.98944813|902000.0|4576000.0|Caserta|
|2021-05-21 11:00:00|        0.75310564|902000.0

In [5]:
data_O3 = data.select(col("original_date_time"),col("c_O3"), col("x"), col("y"), col("NOME"))
data_O3.show()

+-------------------+------------------+--------+---------+-------+
| original_date_time|              c_O3|       x|        y|   NOME|
+-------------------+------------------+--------+---------+-------+
|2021-05-21 00:00:00|         102.89293|902000.0|4576000.0|Caserta|
|2021-05-21 01:00:00| 98.94353000000001|902000.0|4576000.0|Caserta|
|2021-05-21 02:00:00|102.21184000000001|902000.0|4576000.0|Caserta|
|2021-05-21 03:00:00|          102.6552|902000.0|4576000.0|Caserta|
|2021-05-21 04:00:00|          99.22969|902000.0|4576000.0|Caserta|
|2021-05-21 05:00:00|         91.063896|902000.0|4576000.0|Caserta|
|2021-05-21 06:00:00|          86.42286|902000.0|4576000.0|Caserta|
|2021-05-21 07:00:00| 92.17261500000001|902000.0|4576000.0|Caserta|
|2021-05-21 08:00:00|         103.37425|902000.0|4576000.0|Caserta|
|2021-05-21 09:00:00|         112.91657|902000.0|4576000.0|Caserta|
|2021-05-21 10:00:00|         117.71718|902000.0|4576000.0|Caserta|
|2021-05-21 11:00:00|         117.36697|902000.0

# Prophet Model

In [6]:
import io, os, sys, setuptools, tokenize
from pyspark.sql.types import StructType,StructField,StringType,TimestampType,ArrayType,DoubleType
from pyspark.sql.functions import current_date
from pyspark.sql.functions import pandas_udf, PandasUDFType
import matplotlib.pyplot as plt
from prophet import Prophet
from datetime import datetime
import pandas as pd
from time import time

In [10]:
# select the "first_name" and "last_name" columns and get distinct values
unique_coordinates = data.select(col("x"), col("y"), col("NOME")).distinct()

# show the unique values
unique_coordinates.show(60)



+---------+---------+---------+
|        x|        y|     NOME|
+---------+---------+---------+
| 978000.0|4520000.0|  Salerno|
| 982000.0|4524000.0|  Salerno|
|1046000.0|4472000.0|  Salerno|
|1034000.0|4564000.0| Avellino|
| 970000.0|4560000.0|Benevento|
|1022000.0|4496000.0|  Salerno|
|1026000.0|4512000.0|  Salerno|
|1026000.0|4532000.0| Avellino|
| 966000.0|4532000.0|   Napoli|
| 978000.0|4564000.0|Benevento|
| 970000.0|4524000.0|  Salerno|
| 974000.0|4520000.0|  Salerno|
| 990000.0|4520000.0|  Salerno|
| 942000.0|4596000.0|  Caserta|
| 926000.0|4540000.0|   Napoli|
|1002000.0|4512000.0|  Salerno|
|1050000.0|4512000.0|  Salerno|
| 958000.0|4568000.0|Benevento|
| 914000.0|4592000.0|  Caserta|
|1002000.0|4540000.0| Avellino|
|1014000.0|4484000.0|  Salerno|
|1078000.0|4480000.0|  Salerno|
|1018000.0|4572000.0| Avellino|
| 958000.0|4592000.0|Benevento|
| 966000.0|4564000.0|Benevento|
| 990000.0|4540000.0| Avellino|
|1018000.0|4544000.0| Avellino|
|1050000.0|4560000.0| Avellino|
|1010000

                                                                                

PM2.5

In [17]:
from pyspark.sql.functions import *
from time import time
import matplotlib.pyplot as plt
import os

data_PM25_prophet = data_PM25.groupBy("NOME", "original_date_time").agg(avg("c_PM2_5").alias("avg_c_PM25"))
data_PM25_prophet.show()

data_PM25_prophet_u = data_PM25_prophet.withColumnRenamed("avg_c_PM25", "y").withColumnRenamed("original_date_time", "ds")

data_PM25_prophet = data_PM25_prophet_u.withColumn("ds", to_timestamp("ds", "yyyy-MM-dd HH:mm:ss")) \
                                       .withColumn("ds", from_utc_timestamp("ds", "Europe/Rome"))

data_PM25_prophet.printSchema()

def train_and_forecast(group):
    
    print(f"Processing group: {group['NOME'].iloc[0]} with {len(group)} data points")
    
    # Initiate the model
    
    # Convert "ds" column to pandas datetime format and set timezone to Europe/Rome
    group["ds"] = pd.to_datetime(group["ds"])
    
    m = Prophet(seasonality_mode="additive",
                    yearly_seasonality=True,
                    weekly_seasonality=True,
                    daily_seasonality=True,
                    interval_width=0.95)
    
    # Fit the model
    m.fit(group[group['ds'] <= cutoff_date])
    
    # Make predictions
    forecast = m.predict(group)

    # Create the directory if it does not exist
    path = '/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/PM25/'

    if not os.path.exists(path):
        os.makedirs(path)
        
    try:
        fig1 = m.plot(forecast)
        fig1.suptitle(f"PM2.5 Concentration Forecast for {group['NOME'].iloc[0]}")
        fig1.legend(['Actual', 'Forecasted'], loc='upper right')
        fig1.subplots_adjust(top=0.9)
      
        fig2 = m.plot_components(forecast)
        fig2.suptitle(f"PM2.5 Concentration Component Forecast for {group['NOME'].iloc[0]}")
        fig2.subplots_adjust(top=0.9)

        #Save the plots as images using Matplotlib's savefig method
        fig1.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/PM25/{group['NOME'].iloc[0]}_prophet_forecast.png")
        fig2.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/PM25/{group['NOME'].iloc[0]}_prophet_components.png")
    
    except Exception as e:
        print(f"Error generating plots for group {group['NOME'].iloc[0]}: {str(e)}")
    
    f_pd = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper','trend', 'yearly', 'weekly', 'daily']].set_index("ds")
 
    group_pd = group[['ds', 'NOME', 'y']].set_index("ds")
    
    result_pd = f_pd.join( group_pd, how = "left")
    result_pd.reset_index(level=0, inplace=True)
    
    result_pd['NOME'] = group['NOME'].iloc[0]
    
    #Return the forecasted results
    return result_pd[['ds', 'NOME', 'y', 'yhat', 'yhat_upper', 'yhat_lower','trend', 'yearly', 'weekly', 'daily']] 

# Define the result schema
result_schema =StructType([
    StructField('ds',TimestampType()),
    StructField('NOME', StringType()),
    StructField('y', DoubleType()),
    StructField('yhat',FloatType()),
    StructField('yhat_upper',FloatType()),
    StructField('yhat_lower',FloatType()),
    StructField('trend',FloatType()),
    StructField('yearly',FloatType()),
    StructField('weekly',FloatType()),
    StructField('daily',FloatType())
  ])

# create train and test data based on cutoff date
cutoff_date = "2021-06-30 23:00:00"

data_PM25_prophet_train = data_PM25_prophet.filter(col('ds') <= cutoff_date)
data_PM25_prophet_test = data_PM25_prophet.filter((col('ds') > cutoff_date))

max_date = data_PM25_prophet_test.agg(max('ds')).collect()[0][0]

period = data_PM25_prophet_test.count()

# Start time
start_time = time()
# Train and forecast by ticker 
spark_forecast_PM25 = data_PM25_prophet.groupBy("NOME").applyInPandas(train_and_forecast, schema=result_schema)
# Processing time
print('The time used for the Spark forecast is ', time()-start_time)

spark_forecast_PM25.show()


                                                                                

+---------+-------------------+------------------+
|     NOME| original_date_time|        avg_c_PM25|
+---------+-------------------+------------------+
|   Napoli|2022-01-24 05:00:00|1.9080459330136983|
|  Salerno|2022-01-24 05:00:00|1.2905052128205123|
| Avellino|2021-05-21 15:00:00|2.5817250942196543|
|  Caserta|2019-01-24 20:00:00| 5.925371732121215|
| Avellino|2019-01-24 01:00:00|2.6685952549132943|
|  Salerno|2021-11-01 00:00:00|1.8152010012499995|
|  Salerno|2019-02-01 19:00:00|3.1785722868589743|
|Benevento|2022-01-08 13:00:00| 3.817865104651163|
|   Napoli|2021-06-11 17:00:00| 5.092540517808218|
|   Napoli|2021-12-06 16:00:00| 4.307729954794521|
|  Salerno|2021-12-30 20:00:00| 4.245515649679492|
| Avellino|2019-11-03 04:00:00|1.8226615982658958|
|  Caserta|2019-02-01 05:00:00|1.5999065662424254|
| Avellino|2021-11-01 02:00:00| 2.165750578439308|
|Benevento|2020-11-21 02:00:00| 2.454471720155038|
|Benevento|2020-11-21 06:00:00|3.3558390837209306|
| Avellino|2021-12-30 12:00:00|

                                                                                

The time used for the Spark forecast is  0.5624914169311523


Processing group: Benevento with 29862 data points                  (0 + 1) / 1]
14:30:43 - cmdstanpy - INFO - Chain [1] start processing
14:30:55 - cmdstanpy - INFO - Chain [1] done processing


+-------------------+---------+------------------+---------+----------+----------+----------+---------+------------+-----------+
|                 ds|     NOME|                 y|     yhat|yhat_upper|yhat_lower|     trend|   yearly|      weekly|      daily|
+-------------------+---------+------------------+---------+----------+----------+----------+---------+------------+-----------+
|2019-01-01 01:00:00|Benevento| 2.169198713953488| 5.738675| 14.395861|-2.7751412|0.44086233|3.5849354| -0.10496267|  1.8178399|
|2019-01-01 02:00:00|Benevento| 2.157824851937985| 5.394815| 14.095106|-3.3022373|0.44883448|3.5882342| -0.08308938|  1.4408357|
|2019-01-01 03:00:00|Benevento|2.0612024728682186|5.0022283| 12.891183|-3.1864626|0.45680666|3.5915477|-0.062140923|  1.0160148|
|2019-01-01 04:00:00|Benevento|1.9697721821705418|4.6698785|  12.81242|-4.2470865| 0.4647788| 3.594876|-0.042122923|  0.6523467|
|2019-01-01 05:00:00|Benevento|2.0178646961240316|4.4730763| 13.041539|-4.6251855|  0.472751| 3.5

Processing group: Salerno with 29862 data points
                                                                                

In [18]:
from pyspark.sql.functions import col
from pyspark.sql.functions import pow, sqrt

# Calculate MAE
mae_PM25 = spark_forecast_PM25.select(col("NOME"), abs(col("y") - col("yhat")).alias("error")) \
                    .groupBy("NOME") \
                    .agg({"error": "mean"}) \
                    .withColumnRenamed("avg(error)", "MAE")

# Calculate MSE
mse_PM25 = spark_forecast_PM25.select(col("NOME"), pow(col("y") - col("yhat"), 2).alias("error")) \
                    .groupBy("NOME") \
                    .agg({"error": "mean"}) \
                    .withColumnRenamed("avg(error)", "MSE")

# Calculate RMSE
rmse_PM25 = spark_forecast_PM25.select(col("NOME"), pow(col("y") - col("yhat"), 2).alias("error")) \
                     .groupBy("NOME") \
                     .agg({"error": "mean"}) \
                     .withColumnRenamed("avg(error)", "MSE") \
                     .withColumn("RMSE", sqrt(col("MSE")).alias("RMSE"))

# Show the results
mae_PM25.show()
mse_PM25.show()
rmse_PM25.show()

14:31:13 - cmdstanpy - INFO - Chain [1] start processing        (55 + 58) / 113]
Processing group: Benevento with 29862 data points                  (0 + 2) / 2]
Processing group: Avellino with 29862 data points
14:31:25 - cmdstanpy - INFO - Chain [1] done processing
14:31:28 - cmdstanpy - INFO - Chain [1] start processing14:31:28 - cmdstanpy - INFO - Chain [1] start processing

Optimization terminated abnormally. Falling back to Newton.
Optimization terminated abnormally. Falling back to Newton.
14:31:30 - cmdstanpy - INFO - Chain [1] start processing
14:31:30 - cmdstanpy - INFO - Chain [1] start processing
14:37:05 - cmdstanpy - INFO - Chain [1] done processing             (0 + 2) / 2]
Processing group: Caserta with 29862 data points
14:37:17 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
14:37:26 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
14:37:36 - cmdstanpy - INFO - Chain [1] start processing
14:37:54 - c

+---------+------------------+
|     NOME|               MAE|
+---------+------------------+
|Benevento|4.1179194255826905|
|  Salerno|2.6367373284868822|
| Avellino|3.3334214264284214|
|   Napoli| 6.991005137972531|
|  Caserta|4.3487020108438825|
+---------+------------------+



14:40:38 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
Processing group: Avellino with 29862 data points
14:40:45 - cmdstanpy - INFO - Chain [1] start processing
14:40:50 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
14:41:00 - cmdstanpy - INFO - Chain [1] start processing
14:41:01 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
14:41:12 - cmdstanpy - INFO - Chain [1] start processing
14:41:14 - cmdstanpy - INFO - Chain [1] done processing
14:41:23 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Napoli with 29862 data points
14:41:32 - cmdstanpy - INFO - Chain [1] start processing
14:41:46 - cmdstanpy - INFO - Chain [1] done processing
                                                                                

+---------+------------------+
|     NOME|               MSE|
+---------+------------------+
|Benevento| 21.34154225299777|
|  Salerno|12.351801258840519|
| Avellino|16.141038288808694|
|   Napoli| 90.68375816241436|
|  Caserta|  34.5930508905164|
+---------+------------------+



Processing group: Avellino with 29862 data points
14:42:06 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
14:42:06 - cmdstanpy - INFO - Chain [1] start processing
14:42:21 - cmdstanpy - INFO - Chain [1] done processing
14:42:24 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
14:42:30 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
14:42:33 - cmdstanpy - INFO - Chain [1] start processing
14:42:48 - cmdstanpy - INFO - Chain [1] done processing
14:42:48 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
14:42:58 - cmdstanpy - INFO - Chain [1] start processing            (1 + 1) / 2]
14:43:13 - cmdstanpy - INFO - Chain [1] done processing


+---------+------------------+-----------------+
|     NOME|               MSE|             RMSE|
+---------+------------------+-----------------+
|Benevento| 21.34154225299777|4.619690709668535|
|  Salerno|12.351801258840519| 3.51451294759893|
| Avellino|16.141038288808694|4.017591105228193|
|   Napoli| 90.68375816241436| 9.52280201213983|
|  Caserta|  34.5930508905164|5.881585746252145|
+---------+------------------+-----------------+



                                                                                

In [11]:
#For Long and Short Term Forecast

from pyspark.sql.functions import *
from time import time
import matplotlib.pyplot as plt
import os

data_PM25_prophet = data_PM25.groupBy("NOME", "original_date_time").agg(avg("c_PM2_5").alias("avg_c_PM25"))
data_PM25_prophet.show()

data_PM25_prophet_u = data_PM25_prophet.withColumnRenamed("avg_c_PM25", "y").withColumnRenamed("original_date_time", "ds")

data_PM25_prophet = data_PM25_prophet_u.withColumn("ds", to_timestamp("ds", "yyyy-MM-dd HH:mm:ss")) \
                                       .withColumn("ds", from_utc_timestamp("ds", "Europe/Rome"))

data_PM25_prophet.printSchema()

def train_and_forecast(group):
    
    print(f"Processing group: {group['NOME'].iloc[0]} with {len(group)} data points")
    
    # Initiate the model
    
    # Convert "ds" column to pandas datetime format and set timezone to Europe/Rome
    group["ds"] = pd.to_datetime(group["ds"])
    
    m = Prophet(seasonality_mode="additive",
                    yearly_seasonality=True,
                    weekly_seasonality=True,
                    daily_seasonality=True,
                    interval_width=0.95)
    
    # Fit the model
    m.fit(group[group['ds'] <= cutoff_date])
    
    # Make predictions
    forecast = m.predict(group)

    # Create the directory if it does not exist
    path = '/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/PM25/'

    if not os.path.exists(path):
        os.makedirs(path)
    
   # try:
    #    fig1 = m.plot(forecast)
     #   fig1.suptitle(f"PM2.5 Concentration Forecast for {group['NOME'].iloc[0]}")
     #   fig1.legend(['Actual', 'Forecasted'], loc='upper right')
     #   fig1.subplots_adjust(top=0.9)
        
      #  fig2 = m.plot_components(forecast)
       # fig2.suptitle(f"PM2.5 Concentration Component Forecast for {group['NOME'].iloc[0]}")
       # fig2.subplots_adjust(top=0.9)

        # Save the plots as images using Matplotlib's savefig method
        #fig1.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet/PM25/{group['NOME'].iloc[0]}_prophet_forecast.png")
        #fig2.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet/PM25/{group['NOME'].iloc[0]}_prophet_components.png")
    
    #except Exception as e:
     #   print(f"Error generating plots for group {group['NOME'].iloc[0]}: {str(e)}")
    
    f_pd = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper','trend', 'yearly', 'weekly', 'daily']].set_index("ds")
 
    group_pd = group[['ds', 'NOME', 'y']].set_index("ds")
    
    result_pd = f_pd.join( group_pd, how = "left")
    result_pd.reset_index(level=0, inplace=True)
    
    result_pd['NOME'] = group['NOME'].iloc[0]
    
    #Return the forecasted results
    return result_pd[['ds', 'NOME', 'y', 'yhat', 'yhat_upper', 'yhat_lower','trend', 'yearly', 'weekly', 'daily']] 

# Define the result schema
result_schema =StructType([
    StructField('ds',TimestampType()),
    StructField('NOME', StringType()),
    StructField('y', DoubleType()),
    StructField('yhat',FloatType()),
    StructField('yhat_upper',FloatType()),
    StructField('yhat_lower',FloatType()),
    StructField('trend',FloatType()),
    StructField('yearly',FloatType()),
    StructField('weekly',FloatType()),
    StructField('daily',FloatType())
  ])

# create train and test data based on cutoff date
cutoff_date = "2021-06-30 23:00:00"

data_PM25_prophet_train = data_PM25_prophet.filter(col('ds') <= cutoff_date)
data_PM25_prophet_test = data_PM25_prophet.filter((col('ds') > cutoff_date))

max_date = data_PM25_prophet_test.agg(max('ds')).collect()[0][0]

period = data_PM25_prophet_test.count()

# Start time
start_time = time()
# Train and forecast by ticker 
spark_forecast_PM25 = data_PM25_prophet.groupBy("NOME").applyInPandas(train_and_forecast, schema=result_schema)
# Processing time
print('The time used for the Spark forecast is ', time()-start_time)

#spark_forecast_PM25.show()


                                                                                

+---------+-------------------+------------------+
|     NOME| original_date_time|        avg_c_PM25|
+---------+-------------------+------------------+
|   Napoli|2022-01-24 05:00:00|1.9080459330136983|
|  Salerno|2022-01-24 05:00:00|1.2905052128205123|
| Avellino|2021-05-21 15:00:00|2.5817250942196543|
|  Caserta|2019-01-24 20:00:00| 5.925371732121215|
| Avellino|2019-01-24 01:00:00|2.6685952549132943|
|  Salerno|2021-11-01 00:00:00|1.8152010012499995|
|  Salerno|2019-02-01 19:00:00|3.1785722868589743|
|Benevento|2022-01-08 13:00:00| 3.817865104651163|
|   Napoli|2021-06-11 17:00:00| 5.092540517808218|
|   Napoli|2021-12-06 16:00:00| 4.307729954794521|
|  Salerno|2021-12-30 20:00:00| 4.245515649679492|
| Avellino|2019-11-03 04:00:00|1.8226615982658958|
|  Caserta|2019-02-01 05:00:00|1.5999065662424254|
| Avellino|2021-11-01 02:00:00| 2.165750578439308|
|Benevento|2020-11-21 02:00:00| 2.454471720155038|
|Benevento|2020-11-21 06:00:00|3.3558390837209306|
| Avellino|2021-12-30 12:00:00|

                                                                                

The time used for the Spark forecast is  0.26251983642578125


In [12]:
from pyspark.sql.functions import col, pow, sqrt

# Define cutoff dates
cutoff_dates = {
    "5 Days": "2021-07-05 23:00:00",
    "30 Days": "2021-07-30 23:00:00",
    "90 Days": "2021-09-28 23:00:00",
    "180 Days": "2021-12-27 23:00:00",
    "365 Days": "2022-06-29 23:00:00"
}

# Define a list to store the evaluation results
results = []

# Compute evaluation metrics for each time period and NOME
for period, cutoff in cutoff_dates.items():
    for nome in spark_forecast_PM25.select("NOME").distinct().rdd.flatMap(lambda x: x).collect():
        # Filter the forecast data based on the cutoff date and NOME
        forecast_data_PM25 = spark_forecast_PM25.filter((col("ds") <= cutoff) & (col("NOME") == nome))

        # Calculate MAE
        mae = forecast_data_PM25.select(abs(col("y") - col("yhat")).alias("error")) \
                             .agg({"error": "mean"}) \
                             .withColumnRenamed("avg(error)", "MAE") \
                             .collect()[0]["MAE"]

        # Calculate MSE
        mse = forecast_data_PM25.select(pow(col("y") - col("yhat"), 2).alias("error")) \
                             .agg({"error": "mean"}) \
                             .withColumnRenamed("avg(error)", "MSE") \
                             .collect()[0]["MSE"]

        # Calculate RMSE
        rmse = forecast_data_PM25.select(pow(col("y") - col("yhat"), 2).alias("error")) \
                              .agg({"error": "mean"}) \
                              .withColumnRenamed("avg(error)", "MSE") \
                              .withColumn("RMSE", sqrt(col("MSE")).alias("RMSE")) \
                              .collect()[0]["RMSE"]

        # Calculate coverage
        coverage = forecast_data_PM25.filter((col("y") >= col("yhat_lower")) & (col("y") <= col("yhat_upper"))) \
                                  .count() / forecast_data_PM25.count() * 100

        # Add the results to the list
        results.append((period, nome, mse, rmse, mae, coverage))

# Create a Spark DataFrame from the results list
result_df_PM25 = spark.createDataFrame(results, ["Time", "NOME", "MSE", "RMSE", "MAE", "Coverage"])

# Show the results
result_df_PM25.show(100)

Processing group: Avellino with 29862 data points                   (0 + 2) / 2]
Processing group: Benevento with 29862 data points
07:36:27 - cmdstanpy - INFO - Chain [1] start processing
07:36:27 - cmdstanpy - INFO - Chain [1] start processing
07:36:42 - cmdstanpy - INFO - Chain [1] done processing
07:36:44 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
07:36:50 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
07:36:52 - cmdstanpy - INFO - Chain [1] start processing
07:37:04 - cmdstanpy - INFO - Chain [1] done processing             (0 + 2) / 2]
07:37:05 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
07:37:12 - cmdstanpy - INFO - Chain [1] start processing
07:37:26 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points
07:38:02 - cmdstanpy - INFO - Chain [1] start pro

07:50:09 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
07:50:09 - cmdstanpy - INFO - Chain [1] start processing
07:50:24 - cmdstanpy - INFO - Chain [1] done processing
07:50:26 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
07:50:32 - cmdstanpy - INFO - Chain [1] start processing
07:50:34 - cmdstanpy - INFO - Chain [1] start processing
07:50:43 - cmdstanpy - INFO - Chain [1] done processing
07:50:45 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
07:50:52 - cmdstanpy - INFO - Chain [1] start processing
07:51:05 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points
07:51:38 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
07:51:38 - cmdstanpy - INFO - Chain [1] start processing
07:51:49 - cmdstanpy - INFO - Chain [1] done

08:04:04 - cmdstanpy - INFO - Chain [1] done processing
08:04:05 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
08:04:12 - cmdstanpy - INFO - Chain [1] start processing
08:04:13 - cmdstanpy - INFO - Chain [1] start processing
08:04:22 - cmdstanpy - INFO - Chain [1] done processing
08:04:24 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
08:04:30 - cmdstanpy - INFO - Chain [1] start processing
08:04:44 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
08:05:18 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
08:05:18 - cmdstanpy - INFO - Chain [1] start processing
08:05:33 - cmdstanpy - INFO - Chain [1] done processing
08:05:35 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
08:05:41 - cmdstanpy - INFO -

08:17:42 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
08:17:45 - cmdstanpy - INFO - Chain [1] start processing
08:17:54 - cmdstanpy - INFO - Chain [1] done processing
08:17:57 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
08:18:05 - cmdstanpy - INFO - Chain [1] start processing
08:18:18 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points
08:18:53 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
08:18:53 - cmdstanpy - INFO - Chain [1] start processing
08:19:08 - cmdstanpy - INFO - Chain [1] done processing
08:19:11 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
08:19:16 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
08:19:19 - cmdstanpy - INFO - Chain [1] start processing
08:19:30 - cmdstanp

08:32:05 - cmdstanpy - INFO - Chain [1] done processing
08:32:05 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
08:32:13 - cmdstanpy - INFO - Chain [1] start processing
08:32:27 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Benevento with 29862 data points=>                (5 + 2) / 7]
Processing group: Avellino with 29862 data points
08:33:03 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
08:33:03 - cmdstanpy - INFO - Chain [1] start processing
08:33:17 - cmdstanpy - INFO - Chain [1] done processing
08:33:19 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
08:33:25 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
08:33:27 - cmdstanpy - INFO - Chain [1] start processing
08:33:39 - cmdstanpy - INFO - Chain [1] done processing
08:33:40 - cmdstanpy - INFO - Chain [1] done 

08:45:50 - cmdstanpy - INFO - Chain [1] start processing            (1 + 1) / 2]
08:46:04 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Benevento with 29862 data points
08:46:38 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
08:46:38 - cmdstanpy - INFO - Chain [1] start processing
08:46:54 - cmdstanpy - INFO - Chain [1] done processing
08:46:55 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
08:47:02 - cmdstanpy - INFO - Chain [1] start processing
08:47:03 - cmdstanpy - INFO - Chain [1] start processing
08:47:15 - cmdstanpy - INFO - Chain [1] done processing
08:47:17 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
08:47:23 - cmdstanpy - INFO - Chain [1] start processing
08:47:37 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
08

09:14:11 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
09:14:11 - cmdstanpy - INFO - Chain [1] start processing
09:14:23 - cmdstanpy - INFO - Chain [1] done processing
09:14:25 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
09:14:31 - cmdstanpy - INFO - Chain [1] start processing
09:14:33 - cmdstanpy - INFO - Chain [1] start processing
09:14:44 - cmdstanpy - INFO - Chain [1] done processing
09:14:45 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
09:14:53 - cmdstanpy - INFO - Chain [1] start processing
09:15:06 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points
09:15:40 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
09:15:40 - cmdstanpy - INFO - Chain [1] start processing
09:15:57 - cmdstanpy - INFO - Chain [1] done

09:28:04 - cmdstanpy - INFO - Chain [1] done processing
09:28:07 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
09:28:12 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
09:28:14 - cmdstanpy - INFO - Chain [1] start processing
09:28:25 - cmdstanpy - INFO - Chain [1] done processing
09:28:26 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
09:28:34 - cmdstanpy - INFO - Chain [1] start processing
09:28:49 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Benevento with 29862 data points
09:29:22 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
09:29:22 - cmdstanpy - INFO - Chain [1] start processing
09:29:35 - cmdstanpy - INFO - Chain [1] done processing
09:29:39 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
09:29:43 - cmdstanpy

Processing group: Salerno with 29862 data points
09:41:49 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
09:41:52 - cmdstanpy - INFO - Chain [1] start processing
09:42:02 - cmdstanpy - INFO - Chain [1] done processing
09:42:04 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
09:42:12 - cmdstanpy - INFO - Chain [1] start processing
09:42:28 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Benevento with 29862 data points=>                (5 + 2) / 7]
Processing group: Avellino with 29862 data points
09:43:07 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
09:43:07 - cmdstanpy - INFO - Chain [1] start processing
09:43:19 - cmdstanpy - INFO - Chain [1] done processing
09:43:22 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
09:43:27 - cmdstanpy - INFO - Chain [1] start proces

09:55:31 - cmdstanpy - INFO - Chain [1] start processing
09:55:42 - cmdstanpy - INFO - Chain [1] done processing
09:55:43 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
09:55:51 - cmdstanpy - INFO - Chain [1] start processing
09:56:04 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points
09:56:37 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
09:56:37 - cmdstanpy - INFO - Chain [1] start processing
09:56:52 - cmdstanpy - INFO - Chain [1] done processing
09:56:56 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
09:57:00 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
09:57:04 - cmdstanpy - INFO - Chain [1] start processing
09:57:15 - cmdstanpy - INFO - Chain [1] done processing
09:57:17 - cmdstanpy - INFO - Chain [1] done processing
Processing gr

10:09:38 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
10:09:46 - cmdstanpy - INFO - Chain [1] start processing
10:10:00 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points
10:10:36 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
10:10:36 - cmdstanpy - INFO - Chain [1] start processing
10:10:52 - cmdstanpy - INFO - Chain [1] done processing
10:10:54 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
10:11:00 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
10:11:02 - cmdstanpy - INFO - Chain [1] start processing
10:11:11 - cmdstanpy - INFO - Chain [1] done processing
10:11:12 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
10:11:19 - cmdstanpy - INFO - Chain [1] start

10:37:51 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points
10:38:26 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
10:38:26 - cmdstanpy - INFO - Chain [1] start processing
10:38:38 - cmdstanpy - INFO - Chain [1] done processing
10:38:40 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
10:38:46 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
10:38:48 - cmdstanpy - INFO - Chain [1] start processing
10:38:58 - cmdstanpy - INFO - Chain [1] done processing
10:38:58 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
10:39:06 - cmdstanpy - INFO - Chain [1] start processing            (1 + 1) / 2]
10:39:19 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points
10:39:53 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
10:3

10:52:04 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
10:52:04 - cmdstanpy - INFO - Chain [1] start processing
10:52:20 - cmdstanpy - INFO - Chain [1] done processing
10:52:22 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
10:52:28 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
10:52:30 - cmdstanpy - INFO - Chain [1] start processing
10:52:40 - cmdstanpy - INFO - Chain [1] done processing
10:52:42 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
10:52:50 - cmdstanpy - INFO - Chain [1] start processing
10:53:03 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points
10:53:35 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
10:53:35 - cmdstanpy - INFO - Chain [1] start processing
10:53:50 - cmdstanpy - INFO - Chain [1] done

+--------+---------+------------------+------------------+------------------+-----------------+
|    Time|     NOME|               MSE|              RMSE|               MAE|         Coverage|
+--------+---------+------------------+------------------+------------------+-----------------+
|  5 Days|Benevento|18.698557277982307| 4.324182845114474|3.2076624461469576|95.02332089552239|
|  5 Days|  Salerno|10.100288828998792| 3.178095157322825| 2.351939575044667|94.86473880597015|
|  5 Days| Avellino|13.312248485004357|3.6485954126217335| 2.629176152557534|94.90671641791046|
|  5 Days|   Napoli| 74.21461316826144| 8.614790372856524| 6.029997574461622|95.10261194029852|
|  5 Days|  Caserta|30.630310644577804| 5.534465705429731| 4.010426380050726|94.64085820895522|
| 30 Days|Benevento|18.376847937208662| 4.286822592224766|  3.17850779722537|95.19963702359347|
| 30 Days|  Salerno|10.073425511994486| 3.173866019855672|2.3563783997724164|94.94555353901997|
| 30 Days| Avellino|13.118110531523675| 

In [None]:
result_df_PM25_pd = result_df_PM25.pandas_api()
result_df_PM25_pd

PM10

In [19]:
from pyspark.sql.functions import *
from time import time
import matplotlib.pyplot as plt
import os

data_PM10_prophet = data_PM10.groupBy("NOME", "original_date_time").agg(avg("c_PM10").alias("avg_c_PM10"))
data_PM10_prophet.show()

data_PM10_prophet_u = data_PM10_prophet.withColumnRenamed("avg_c_PM10", "y").withColumnRenamed("original_date_time", "ds")
data_PM10_prophet = data_PM10_prophet_u.withColumn("ds", to_timestamp("ds", "yyyy-MM-dd HH:mm:ss")) \
                                       .withColumn("ds", from_utc_timestamp("ds", "Europe/Rome"))

data_PM10_prophet.printSchema()


def train_and_forecast(group):
    
    print(f"Processing group: {group['NOME'].iloc[0]} with {len(group)} data points")
    
    # Initiate the model
    
    # Convert "ds" column to pandas datetime format and set timezone to Europe/Rome
    group["ds"] = pd.to_datetime(group["ds"])

    m = Prophet(seasonality_mode="additive",
                    yearly_seasonality=True,
                    weekly_seasonality=True,
                    daily_seasonality=True,
                    interval_width=0.95)
    #m.add_country_holidays(country_name="IT")
    
    # Fit the model
    m.fit(group[group['ds'] <= cutoff_date])
    
    
    # Make predictions
    forecast = m.predict(group)

    # Create the directory if it does not exist
    path = '/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/PM10/'

    if not os.path.exists(path):
        os.makedirs(path)
    
    try:
        fig1 = m.plot(forecast)
        fig1.suptitle(f"PM10 Concentration Forecast for {group['NOME'].iloc[0]}")
        fig1.legend(['Actual', 'Forecasted'], loc='upper right')
        fig1.subplots_adjust(top=0.9)
        
        fig2 = m.plot_components(forecast)
        fig2.suptitle(f"PM10 Concentration Component Forecast for {group['NOME'].iloc[0]}")
        fig2.subplots_adjust(top=0.9)

        # Save the plots as images using Matplotlib's savefig method
        fig1.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/PM10/{group['NOME'].iloc[0]}_prophet_forecast.png")
        fig2.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/PM10/{group['NOME'].iloc[0]}_prophet_components.png")
    except Exception as e:
        print(f"Error generating plots for group {group['NOME'].iloc[0]}: {str(e)}")
    
    
    f_pd = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper','trend', 'yearly', 'weekly', 'daily']].set_index("ds")
 
    group_pd = group[['ds', 'NOME', 'y']].set_index("ds")
    
    result_pd = f_pd.join( group_pd, how = "left")
    result_pd.reset_index(level=0, inplace=True)
    
    result_pd['NOME'] = group['NOME'].iloc[0]
    
    #Return the forecasted results
    return result_pd[['ds', 'NOME', 'y', 'yhat', 'yhat_upper', 'yhat_lower','trend', 'yearly', 'weekly', 'daily']] 

# Define the restult schema
result_schema =StructType([
    StructField('ds',TimestampType()),
    StructField('NOME', StringType()),
    StructField('y', DoubleType()),
    StructField('yhat',FloatType()),
    StructField('yhat_upper',FloatType()),
    StructField('yhat_lower',FloatType()),
    StructField('trend',FloatType()),
    StructField('yearly',FloatType()),
    StructField('weekly',FloatType()),
    StructField('daily',FloatType())
  ])

# create train and test data based on cutoff date
cutoff_date = "2021-06-30 23:00:00"

data_PM10_prophet_train = data_PM10_prophet.filter(col('ds') <= cutoff_date)
data_PM10_prophet_test = data_PM10_prophet.filter((col('ds') > cutoff_date))

max_date = data_PM10_prophet_test.agg(max('ds')).collect()[0][0]

period = data_PM10_prophet_test.count()

# Start time
start_time = time()
# Train and forecast by ticker 
spark_forecast_PM10 = data_PM10_prophet.groupBy("NOME").applyInPandas(train_and_forecast, schema=result_schema)
# Processing time
print('The time used for the Spark forecast is ', time()-start_time)

spark_forecast_PM10.show()

                                                                                

+---------+-------------------+------------------+
|     NOME| original_date_time|        avg_c_PM10|
+---------+-------------------+------------------+
|   Napoli|2022-01-24 05:00:00|  2.19421393561644|
|  Salerno|2022-01-24 05:00:00| 1.951264651057693|
| Avellino|2021-05-21 15:00:00|2.9013350687861266|
|  Caserta|2019-01-24 20:00:00| 7.254402715757577|
| Avellino|2019-01-24 01:00:00|3.4991190641618517|
|  Salerno|2021-11-01 00:00:00|2.1814328497435893|
|  Salerno|2019-02-01 19:00:00|  5.94134689775641|
|Benevento|2022-01-08 13:00:00| 5.350933993798452|
|   Napoli|2021-06-11 17:00:00|5.7778270589041085|
|   Napoli|2021-12-06 16:00:00| 5.087223741095889|
|  Salerno|2021-12-30 20:00:00| 4.955298324038458|
| Avellino|2019-11-03 04:00:00| 3.701815800578033|
|  Caserta|2019-02-01 05:00:00| 5.011559084242426|
| Avellino|2021-11-01 02:00:00|2.3479436712138733|
|Benevento|2020-11-21 02:00:00| 4.134004267441858|
|Benevento|2020-11-21 06:00:00| 6.151227803100771|
| Avellino|2021-12-30 12:00:00|

                                                                                

The time used for the Spark forecast is  0.04328560829162598


Processing group: Benevento with 29862 data points>             (85 + 28) / 113]
14:43:35 - cmdstanpy - INFO - Chain [1] start processing            (0 + 1) / 1]
14:43:49 - cmdstanpy - INFO - Chain [1] done processing


+-------------------+---------+------------------+---------+----------+----------+---------+---------+-----------+-----------+
|                 ds|     NOME|                 y|     yhat|yhat_upper|yhat_lower|    trend|   yearly|     weekly|      daily|
+-------------------+---------+------------------+---------+----------+----------+---------+---------+-----------+-----------+
|2019-01-01 01:00:00|Benevento| 6.449259526356591| 7.895175|   19.7369|-3.3848724|2.1233678|3.7960894|0.123790376|  1.8519274|
|2019-01-01 02:00:00|Benevento| 6.512501393023254| 7.545446| 17.923363|-3.6291122|2.1319215|3.7946844| 0.14671066|  1.4721295|
|2019-01-01 03:00:00|Benevento| 6.333786593023259|7.1371117| 18.484673| -4.349808| 2.140475|3.7933025| 0.16871823|  1.0346159|
|2019-01-01 04:00:00|Benevento|5.8128533441860455| 6.768056| 17.984974| -4.086628|2.1490285|3.7919436|  0.1897584|  0.6373255|
|2019-01-01 05:00:00|Benevento| 5.554195050387596| 6.526956| 17.954464|-5.3307576| 2.157582| 3.790608| 0.209776

Processing group: Salerno with 29862 data points
                                                                                

In [20]:
from pyspark.sql.functions import col
from pyspark.sql.functions import pow, sqrt
from prophet.diagnostics import performance_metrics


# Calculate MAE
mae_PM10 = spark_forecast_PM10.select(col("NOME"), abs(col("y") - col("yhat")).alias("error")) \
                    .groupBy("NOME") \
                    .agg({"error": "mean"}) \
                    .withColumnRenamed("avg(error)", "MAE")


# Calculate MSE
mse_PM10 = spark_forecast_PM10.select(col("NOME"), pow(col("y") - col("yhat"), 2).alias("error")) \
                    .groupBy("NOME") \
                    .agg({"error": "mean"}) \
                    .withColumnRenamed("avg(error)", "MSE")

# Calculate RMSE
rmse_PM10 = spark_forecast_PM10.select(col("NOME"), pow(col("y") - col("yhat"), 2).alias("error")) \
                     .groupBy("NOME") \
                     .agg({"error": "mean"}) \
                     .withColumnRenamed("avg(error)", "MSE") \
                     .withColumn("RMSE", sqrt(col("MSE")).alias("RMSE"))

# Show the results
mae_PM10.show()
mse_PM10.show()
rmse_PM10.show()

14:43:59 - cmdstanpy - INFO - Chain [1] start processing        (58 + 55) / 113]
Processing group: Benevento with 29862 data points==>           (88 + 25) / 113]
14:44:02 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
Processing group: Avellino with 29862 data points
14:44:14 - cmdstanpy - INFO - Chain [1] done processing
14:44:21 - cmdstanpy - INFO - Chain [1] start processing
Optimization terminated abnormally. Falling back to Newton.
14:44:23 - cmdstanpy - INFO - Chain [1] start processing
14:44:23 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
14:44:33 - cmdstanpy - INFO - Chain [1] start processing
14:44:46 - cmdstanpy - INFO - Chain [1] done processing
14:49:00 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Caserta with 29862 data points
14:49:15 - cmdstanpy - INFO - Chain [1] start processing
14:49:25 - cmdstanpy - INFO - Chain [1] done processing
Processing group

+---------+------------------+
|     NOME|               MAE|
+---------+------------------+
|Benevento| 4.158243410603779|
|  Salerno|3.4816548071620828|
| Avellino|4.5388499573042145|
|   Napoli| 7.064490407417741|
|  Caserta| 5.072139471964799|
+---------+------------------+



14:50:02 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
Processing group: Avellino with 29862 data points
14:50:12 - cmdstanpy - INFO - Chain [1] start processing
14:50:19 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
14:50:27 - cmdstanpy - INFO - Chain [1] done processing
14:50:28 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
14:50:37 - cmdstanpy - INFO - Chain [1] start processing
14:50:43 - cmdstanpy - INFO - Chain [1] done processing
14:50:52 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
14:51:03 - cmdstanpy - INFO - Chain [1] start processing
14:51:15 - cmdstanpy - INFO - Chain [1] done processing
                                                                                

+---------+------------------+
|     NOME|               MSE|
+---------+------------------+
|Benevento| 37.82701124247217|
|  Salerno| 31.22884880618204|
| Avellino|30.921080037702307|
|   Napoli|  97.4481522058115|
|  Caserta| 52.57019428466397|
+---------+------------------+



Processing group: Avellino with 29862 data points===>           (89 + 24) / 113]
Processing group: Benevento with 29862 data points
14:51:29 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
14:51:29 - cmdstanpy - INFO - Chain [1] start processing
14:51:46 - cmdstanpy - INFO - Chain [1] done processing
14:51:46 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
Processing group: Salerno with 29862 data points
14:51:55 - cmdstanpy - INFO - Chain [1] start processing
14:51:56 - cmdstanpy - INFO - Chain [1] start processing
14:52:10 - cmdstanpy - INFO - Chain [1] done processing
14:52:14 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
14:52:21 - cmdstanpy - INFO - Chain [1] start processing
14:52:34 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]


+---------+------------------+------------------+
|     NOME|               MSE|              RMSE|
+---------+------------------+------------------+
|Benevento| 37.82701124247217|  6.15036675674485|
|  Salerno| 31.22884880618204|5.5882778032397455|
| Avellino|30.921080037702307| 5.560672624575403|
|   Napoli|  97.4481522058115| 9.871583064828634|
|  Caserta| 52.57019428466397| 7.250530620903822|
+---------+------------------+------------------+



                                                                                

In [13]:
#For Long and Short Term Forecast

from pyspark.sql.functions import *
from time import time
import matplotlib.pyplot as plt
import os

data_PM10_prophet = data_PM10.groupBy("NOME", "original_date_time").agg(avg("c_PM10").alias("avg_c_PM10"))
data_PM10_prophet.show()

data_PM10_prophet_u = data_PM10_prophet.withColumnRenamed("avg_c_PM10", "y").withColumnRenamed("original_date_time", "ds")
data_PM10_prophet = data_PM10_prophet_u.withColumn("ds", to_timestamp("ds", "yyyy-MM-dd HH:mm:ss")) \
                                       .withColumn("ds", from_utc_timestamp("ds", "Europe/Rome"))

#data_PM10_prophet.printSchema()


def train_and_forecast(group):
    
    print(f"Processing group: {group['NOME'].iloc[0]} with {len(group)} data points")
    
    # Initiate the model
    
    # Convert "ds" column to pandas datetime format and set timezone to Europe/Rome
    group["ds"] = pd.to_datetime(group["ds"])

    m = Prophet(seasonality_mode="additive",
                    yearly_seasonality=True,
                    weekly_seasonality=True,
                    daily_seasonality=True,
                    interval_width=0.95)
    #m.add_country_holidays(country_name="IT")
    
    # Fit the model
    m.fit(group[group['ds'] <= cutoff_date])
    
    
    # Make predictions
    forecast = m.predict(group)

    # Create the directory if it does not exist
    path = '/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/PM10/'

    if not os.path.exists(path):
        os.makedirs(path)
    
    #try:
     #   fig1 = m.plot(forecast)
      #  fig1.suptitle(f"PM10 Concentration Forecast for {group['NOME'].iloc[0]}")
       # fig1.legend(['Actual', 'Forecasted'], loc='upper right')
       # fig1.subplots_adjust(top=0.9)
        
       # fig2 = m.plot_components(forecast)
       # fig2.suptitle(f"PM10 Concentration Component Forecast for {group['NOME'].iloc[0]}")
       # fig2.subplots_adjust(top=0.9)

        # Save the plots as images using Matplotlib's savefig method
        #fig1.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet/PM10/{group['NOME'].iloc[0]}_prophet_forecast.png")
       # fig2.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet/PM10/{group['NOME'].iloc[0]}_prophet_components.png")
   # except Exception as e:
   #     print(f"Error generating plots for group {group['NOME'].iloc[0]}: {str(e)}")
    
    
    f_pd = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper','trend', 'yearly', 'weekly', 'daily']].set_index("ds")
 
    group_pd = group[['ds', 'NOME', 'y']].set_index("ds")
    
    result_pd = f_pd.join( group_pd, how = "left")
    result_pd.reset_index(level=0, inplace=True)
    
    result_pd['NOME'] = group['NOME'].iloc[0]
    
    #Return the forecasted results
    return result_pd[['ds', 'NOME', 'y', 'yhat', 'yhat_upper', 'yhat_lower','trend', 'yearly', 'weekly', 'daily']] 

# Define the restult schema
result_schema =StructType([
    StructField('ds',TimestampType()),
    StructField('NOME', StringType()),
    StructField('y', DoubleType()),
    StructField('yhat',FloatType()),
    StructField('yhat_upper',FloatType()),
    StructField('yhat_lower',FloatType()),
    StructField('trend',FloatType()),
    StructField('yearly',FloatType()),
    StructField('weekly',FloatType()),
    StructField('daily',FloatType())
  ])

# create train and test data based on cutoff date
cutoff_date = "2021-06-30 23:00:00"

data_PM10_prophet_train = data_PM10_prophet.filter(col('ds') <= cutoff_date)
data_PM10_prophet_test = data_PM10_prophet.filter((col('ds') > cutoff_date))

max_date = data_PM10_prophet_test.agg(max('ds')).collect()[0][0]

period = data_PM10_prophet_test.count()

# Start time
start_time = time()
# Train and forecast by ticker 
spark_forecast_PM10 = data_PM10_prophet.groupBy("NOME").applyInPandas(train_and_forecast, schema=result_schema)
# Processing time
print('The time used for the Spark forecast is ', time()-start_time)

#spark_forecast_PM10.show()

                                                                                

+---------+-------------------+------------------+
|     NOME| original_date_time|        avg_c_PM10|
+---------+-------------------+------------------+
|   Napoli|2022-01-24 05:00:00|  2.19421393561644|
|  Salerno|2022-01-24 05:00:00| 1.951264651057693|
| Avellino|2021-05-21 15:00:00|2.9013350687861266|
|  Caserta|2019-01-24 20:00:00| 7.254402715757577|
| Avellino|2019-01-24 01:00:00|3.4991190641618517|
|  Salerno|2021-11-01 00:00:00|2.1814328497435893|
|  Salerno|2019-02-01 19:00:00|  5.94134689775641|
|Benevento|2022-01-08 13:00:00| 5.350933993798452|
|   Napoli|2021-06-11 17:00:00|5.7778270589041085|
|   Napoli|2021-12-06 16:00:00| 5.087223741095889|
|  Salerno|2021-12-30 20:00:00| 4.955298324038458|
| Avellino|2019-11-03 04:00:00| 3.701815800578033|
|  Caserta|2019-02-01 05:00:00| 5.011559084242426|
| Avellino|2021-11-01 02:00:00|2.3479436712138733|
|Benevento|2020-11-21 02:00:00| 4.134004267441858|
|Benevento|2020-11-21 06:00:00| 6.151227803100771|
| Avellino|2021-12-30 12:00:00|



The time used for the Spark forecast is  0.03779244422912598


                                                                                

In [14]:
from pyspark.sql.functions import col, pow, sqrt

# Define cutoff dates
cutoff_dates = {
    "5 Days": "2021-07-05 23:00:00",
    "30 Days": "2021-07-30 23:00:00",
    "90 Days": "2021-09-28 23:00:00",
    "180 Days": "2021-12-27 23:00:00",
    "365 Days": "2022-06-29 23:00:00"
}

# Define a list to store the evaluation results
results = []

# Compute evaluation metrics for each time period and NOME
for period, cutoff in cutoff_dates.items():
    for nome in spark_forecast_PM10.select("NOME").distinct().rdd.flatMap(lambda x: x).collect():
        # Filter the forecast data based on the cutoff date and NOME
        forecast_data_PM10 = spark_forecast_PM10.filter((col("ds") <= cutoff) & (col("NOME") == nome))

        # Calculate MAE
        mae = forecast_data_PM10.select(abs(col("y") - col("yhat")).alias("error")) \
                             .agg({"error": "mean"}) \
                             .withColumnRenamed("avg(error)", "MAE") \
                             .collect()[0]["MAE"]

        # Calculate MSE
        mse = forecast_data_PM10.select(pow(col("y") - col("yhat"), 2).alias("error")) \
                             .agg({"error": "mean"}) \
                             .withColumnRenamed("avg(error)", "MSE") \
                             .collect()[0]["MSE"]

        # Calculate RMSE
        rmse = forecast_data_PM10.select(pow(col("y") - col("yhat"), 2).alias("error")) \
                              .agg({"error": "mean"}) \
                              .withColumnRenamed("avg(error)", "MSE") \
                              .withColumn("RMSE", sqrt(col("MSE")).alias("RMSE")) \
                              .collect()[0]["RMSE"]

        # Calculate coverage
        coverage = forecast_data_PM10.filter((col("y") >= col("yhat_lower")) & (col("y") <= col("yhat_upper"))) \
                                  .count() / forecast_data_PM10.count() * 100

        # Add the results to the list
        results.append((period, nome, mse, rmse, mae, coverage))

# Create a Spark DataFrame from the results list
result_df_PM10 = spark.createDataFrame(results, ["Time", "NOME", "MSE", "RMSE", "MAE", "Coverage"])

# Show the results
result_df_PM10.show(100)

Processing group: Benevento with 29862 data points                  (0 + 2) / 2]
Processing group: Avellino with 29862 data points
10:57:02 - cmdstanpy - INFO - Chain [1] start processing
10:57:02 - cmdstanpy - INFO - Chain [1] start processing
10:57:16 - cmdstanpy - INFO - Chain [1] done processing
10:57:16 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
10:57:23 - cmdstanpy - INFO - Chain [1] start processing
10:57:24 - cmdstanpy - INFO - Chain [1] start processing
10:57:39 - cmdstanpy - INFO - Chain [1] done processing             (0 + 2) / 2]
10:57:39 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
10:57:47 - cmdstanpy - INFO - Chain [1] start processing            (1 + 1) / 2]
10:57:58 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Benevento with 29862 data points
10:58:32 - cmdstanpy - INFO - Chain [1] start processing  

11:23:56 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
Processing group: Salerno with 29862 data points
11:24:02 - cmdstanpy - INFO - Chain [1] start processing
11:24:03 - cmdstanpy - INFO - Chain [1] start processing
11:24:15 - cmdstanpy - INFO - Chain [1] done processing
11:24:16 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
11:24:23 - cmdstanpy - INFO - Chain [1] start processing            (1 + 1) / 2]
11:24:34 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Benevento with 29862 data points==>               (5 + 2) / 7]
Processing group: Avellino with 29862 data points
11:25:12 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
11:25:12 - cmdstanpy - INFO - Chain [1] start processing
11:25:28 - cmdstanpy - INFO - Chain [1] done processing
11:25:29 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
Proce

11:37:23 - cmdstanpy - INFO - Chain [1] start processing
11:37:35 - cmdstanpy - INFO - Chain [1] done processing
11:37:39 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
11:37:43 - cmdstanpy - INFO - Chain [1] start processing
11:37:55 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
11:38:29 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
11:38:29 - cmdstanpy - INFO - Chain [1] start processing
11:38:46 - cmdstanpy - INFO - Chain [1] done processing
11:38:47 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
Processing group: Salerno with 29862 data points
11:38:54 - cmdstanpy - INFO - Chain [1] start processing
11:38:55 - cmdstanpy - INFO - Chain [1] start processing
11:39:06 - cmdstanpy - INFO - Chain [1] done processing
11:39:09 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Nap

11:50:54 - cmdstanpy - INFO - Chain [1] start processing
11:51:05 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
11:51:42 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
11:51:42 - cmdstanpy - INFO - Chain [1] start processing
11:51:59 - cmdstanpy - INFO - Chain [1] done processing
11:52:00 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
Processing group: Salerno with 29862 data points
11:52:07 - cmdstanpy - INFO - Chain [1] start processing
11:52:07 - cmdstanpy - INFO - Chain [1] start processing
11:52:20 - cmdstanpy - INFO - Chain [1] done processing
11:52:23 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
11:52:28 - cmdstanpy - INFO - Chain [1] start processing
11:52:39 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
11

12:18:01 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
12:18:01 - cmdstanpy - INFO - Chain [1] start processing
12:18:17 - cmdstanpy - INFO - Chain [1] done processing
12:18:19 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
Processing group: Salerno with 29862 data points
12:18:25 - cmdstanpy - INFO - Chain [1] start processing
12:18:27 - cmdstanpy - INFO - Chain [1] start processing
12:18:38 - cmdstanpy - INFO - Chain [1] done processing
12:18:40 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
12:18:46 - cmdstanpy - INFO - Chain [1] start processing
12:18:56 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Benevento with 29862 data points==>               (5 + 2) / 7]
Processing group: Avellino with 29862 data points
12:19:31 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
12:19:31 - cmdstanpy - INFO 

12:31:35 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
12:31:42 - cmdstanpy - INFO - Chain [1] start processing
12:31:42 - cmdstanpy - INFO - Chain [1] start processing
12:31:54 - cmdstanpy - INFO - Chain [1] done processing
12:31:55 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
12:32:03 - cmdstanpy - INFO - Chain [1] start processing
12:32:13 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points
12:32:47 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
12:32:47 - cmdstanpy - INFO - Chain [1] start processing
12:33:02 - cmdstanpy - INFO - Chain [1] done processing
12:33:03 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
12:33:10 - cmdstanpy - INFO 

12:44:52 - cmdstanpy - INFO - Chain [1] start processing
12:45:02 - cmdstanpy - INFO - Chain [1] done processing
12:45:05 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
12:45:09 - cmdstanpy - INFO - Chain [1] start processing
12:45:22 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
12:45:57 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
12:45:57 - cmdstanpy - INFO - Chain [1] start processing
12:46:10 - cmdstanpy - INFO - Chain [1] done processing
12:46:11 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
12:46:18 - cmdstanpy - INFO - Chain [1] start processing
12:46:18 - cmdstanpy - INFO - Chain [1] start processing
12:46:30 - cmdstanpy - INFO - Chain [1] done processing
12:46:32 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Nap

Processing group: Napoli with 29862 data points
12:58:24 - cmdstanpy - INFO - Chain [1] start processing
12:58:35 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
12:59:10 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
12:59:10 - cmdstanpy - INFO - Chain [1] start processing
12:59:26 - cmdstanpy - INFO - Chain [1] done processing
12:59:28 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
12:59:34 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
12:59:36 - cmdstanpy - INFO - Chain [1] start processing
12:59:44 - cmdstanpy - INFO - Chain [1] done processing
12:59:48 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
12:59:52 - cmdstanpy - INFO - Chain [1] start processing
13:00:05 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Proc

Processing group: Avellino with 29862 data points
13:12:07 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
13:12:07 - cmdstanpy - INFO - Chain [1] start processing
13:12:22 - cmdstanpy - INFO - Chain [1] done processing
13:12:23 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
Processing group: Salerno with 29862 data points
13:12:29 - cmdstanpy - INFO - Chain [1] start processing
13:12:30 - cmdstanpy - INFO - Chain [1] start processing
13:12:39 - cmdstanpy - INFO - Chain [1] done processing
13:12:42 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
13:12:47 - cmdstanpy - INFO - Chain [1] start processing
13:12:58 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Benevento with 29862 data points==>               (5 + 2) / 7]
Processing group: Avellino with 29862 data points
13:13:31 - cmdstanpy - INFO - Chain [1] start processing   

Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
13:38:54 - cmdstanpy - INFO - Chain [1] start processing
13:38:54 - cmdstanpy - INFO - Chain [1] start processing
13:39:08 - cmdstanpy - INFO - Chain [1] done processing
13:39:08 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
13:39:15 - cmdstanpy - INFO - Chain [1] start processing            (1 + 1) / 2]
13:39:27 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points
13:40:00 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
13:40:00 - cmdstanpy - INFO - Chain [1] start processing
13:40:17 - cmdstanpy - INFO - Chain [1] done processing
13:40:17 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
Processing group: Salerno with 29862 data points
13:40:25 - cmdstanpy - INFO - Chain [1] start processing
13:40:25 - cmdstanpy - INFO - Chain 

13:52:14 - cmdstanpy - INFO - Chain [1] done processing
13:52:15 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
13:52:22 - cmdstanpy - INFO - Chain [1] start processing            (1 + 1) / 2]
13:52:34 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Benevento with 29862 data points
13:53:10 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
13:53:10 - cmdstanpy - INFO - Chain [1] start processing
13:53:22 - cmdstanpy - INFO - Chain [1] done processing
13:53:23 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
Processing group: Salerno with 29862 data points
13:53:29 - cmdstanpy - INFO - Chain [1] start processing
13:53:31 - cmdstanpy - INFO - Chain [1] start processing
13:53:39 - cmdstanpy - INFO - Chain [1] done processing
13:53:41 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
13:53:47 - cmdstanpy - INFO - 

14:05:41 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Benevento with 29862 data points==>               (5 + 2) / 7]
Processing group: Avellino with 29862 data points
14:06:17 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
14:06:17 - cmdstanpy - INFO - Chain [1] start processing
14:06:33 - cmdstanpy - INFO - Chain [1] done processing
14:06:34 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
Processing group: Salerno with 29862 data points
14:06:41 - cmdstanpy - INFO - Chain [1] start processing
14:06:42 - cmdstanpy - INFO - Chain [1] start processing
14:06:52 - cmdstanpy - INFO - Chain [1] done processing
14:06:55 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
14:07:00 - cmdstanpy - INFO - Chain [1] start processing
14:07:11 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
                            

+--------+---------+------------------+------------------+------------------+-----------------+
|    Time|     NOME|               MSE|              RMSE|               MAE|         Coverage|
+--------+---------+------------------+------------------+------------------+-----------------+
|  5 Days|Benevento|  33.1437854726761| 5.757063962878656|3.9214986767209346|96.56716417910448|
|  5 Days|  Salerno|27.818981700451072| 5.274370265771172|3.2389082039797583| 97.2714552238806|
|  5 Days| Avellino|27.625060951434666| 5.255954808732155| 3.362473053107414|97.08022388059702|
|  5 Days|   Napoli|  90.6881208900541| 9.523031076818667|6.6225859088951005|95.47108208955224|
|  5 Days|  Caserta| 46.66384959965485| 6.831094319335289| 4.773255586930724| 95.8115671641791|
| 30 Days|Benevento|  32.4904415659933| 5.700038733727456|3.8766033526109696| 96.6016333938294|
| 30 Days|  Salerno|27.251442744245967| 5.220291442462381| 3.202910617558943|97.36388384754991|
| 30 Days| Avellino|27.050433235360117| 

In [None]:
result_df_PM10_pd = result_df_PM10.pandas_api()
result_df_PM10_pd

NO2

In [21]:
from pyspark.sql.functions import *
from time import time
import matplotlib.pyplot as plt
import os

data_NO2_prophet = data_NO2.groupBy("NOME", "original_date_time").agg(avg("c_NO2").alias("avg_c_NO2"))
data_NO2_prophet.show()

data_NO2_prophet_u = data_NO2_prophet.withColumnRenamed("avg_c_NO2", "y").withColumnRenamed("original_date_time", "ds")

data_NO2_prophet = data_NO2_prophet_u.withColumn("ds", to_timestamp("ds", "yyyy-MM-dd HH:mm:ss")) \
                                       .withColumn("ds", from_utc_timestamp("ds", "Europe/Rome"))

data_NO2_prophet.printSchema()

def train_and_forecast(group):
    
    print(f"Processing group: {group['NOME'].iloc[0]} with {len(group)} data points")
    
    group["ds"] = pd.to_datetime(group["ds"])

    m = Prophet(seasonality_mode="additive",
                    yearly_seasonality=True,
                    weekly_seasonality=True,
                    daily_seasonality=True,
                    interval_width=0.95)
    
    m.fit(group[group['ds'] <= cutoff_date])
    
    forecast = m.predict(group)
    
    path = '/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/NO2/'

    if not os.path.exists(path):
        os.makedirs(path)
    
    try:
        fig1 = m.plot(forecast)
        fig1.suptitle(f"NO2 Concentration Forecast for {group['NOME'].iloc[0]}")
        fig1.legend(['Actual', 'Forecasted'], loc='upper right')
        fig1.subplots_adjust(top=0.9)
        
        fig2 = m.plot_components(forecast)
        fig2.suptitle(f"NO2 Concentration Component Forecast for {group['NOME'].iloc[0]}")
        fig2.subplots_adjust(top=0.9)

        fig1.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/NO2/{group['NOME'].iloc[0]}_prophet_forecast.png")
        fig2.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/NO2/{group['NOME'].iloc[0]}_prophet_components.png")
    except Exception as e:
        print(f"Error generating plots for group {group['NOME'].iloc[0]}: {str(e)}")
    
    
    f_pd = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper','trend', 'yearly', 'weekly', 'daily']].set_index("ds")
 
    group_pd = group[['ds', 'NOME', 'y']].set_index("ds")
    
    result_pd = f_pd.join( group_pd, how = "left")
    result_pd.reset_index(level=0, inplace=True)
    
    result_pd['NOME'] = group['NOME'].iloc[0]
    
    return result_pd[['ds', 'NOME', 'y', 'yhat', 'yhat_upper', 'yhat_lower','trend', 'yearly', 'weekly', 'daily']] 

result_schema =StructType([
    StructField('ds',TimestampType()),
    StructField('NOME', StringType()),
    StructField('y', DoubleType()),
    StructField('yhat',FloatType()),
    StructField('yhat_upper',FloatType()),
    StructField('yhat_lower',FloatType()),
    StructField('trend',FloatType()),
    StructField('yearly',FloatType()),
    StructField('weekly',FloatType()),
    StructField('daily',FloatType())
  ])

# create train and test data based on cutoff date
cutoff_date = "2021-06-30 23:00:00"

data_NO2_prophet_train = data_NO2_prophet.filter(col('ds') <= cutoff_date)
data_NO2_prophet_test = data_NO2_prophet.filter((col('ds') > cutoff_date))

max_date = data_NO2_prophet_test.agg(max('ds')).collect()[0][0]

period = data_NO2_prophet_test.count()

# Start time
start_time = time()
# Train and forecast by ticker 
spark_forecast_NO2 = data_NO2_prophet.groupBy("NOME").applyInPandas(train_and_forecast, schema=result_schema)
# Processing time
print('The time used for the Spark forecast is ', time()-start_time)

spark_forecast_NO2.show()


                                                                                

+---------+-------------------+------------------+
|     NOME| original_date_time|         avg_c_NO2|
+---------+-------------------+------------------+
|   Napoli|2022-01-24 05:00:00|2.4796764530136994|
|  Salerno|2022-01-24 05:00:00|0.7855931627564102|
| Avellino|2021-05-21 15:00:00|2.2238616741040476|
|  Caserta|2019-01-24 20:00:00|11.365544946666663|
| Avellino|2019-01-24 01:00:00|3.3922221037572227|
|  Salerno|2021-11-01 00:00:00|1.3968887011217943|
|  Salerno|2019-02-01 19:00:00| 4.649749127403843|
|Benevento|2022-01-08 13:00:00|1.3910119166666668|
|   Napoli|2021-06-11 17:00:00|  16.4437593479452|
|   Napoli|2021-12-06 16:00:00|12.150923263013699|
|  Salerno|2021-12-30 20:00:00|3.7652926806089737|
| Avellino|2019-11-03 04:00:00|0.7913184238150286|
|  Caserta|2019-02-01 05:00:00|  2.41413039939394|
| Avellino|2021-11-01 02:00:00| 1.461723894450867|
|Benevento|2020-11-21 02:00:00|1.3438488387596899|
|Benevento|2020-11-21 06:00:00| 1.129760739534884|
| Avellino|2021-12-30 12:00:00|

                                                                                

The time used for the Spark forecast is  0.03812456130981445


Processing group: Benevento with 29862 data points=>            (86 + 27) / 113]
14:52:55 - cmdstanpy - INFO - Chain [1] start processing            (0 + 1) / 1]
14:53:02 - cmdstanpy - INFO - Chain [1] done processing


+-------------------+---------+------------------+---------+----------+-----------+---------+---------+----------+----------+
|                 ds|     NOME|                 y|     yhat|yhat_upper| yhat_lower|    trend|   yearly|    weekly|     daily|
+-------------------+---------+------------------+---------+----------+-----------+---------+---------+----------+----------+
|2019-01-01 01:00:00|Benevento|1.4448653317829456|10.110842| 18.042543|  2.4420707| 3.983498|2.0012298|0.96626616| 3.1598482|
|2019-01-01 02:00:00|Benevento|1.3546963558139544| 9.059948| 16.599417|   1.131604|3.9851723|1.9976169|  0.990464|  2.086695|
|2019-01-01 03:00:00|Benevento| 1.229843376744186| 8.016592|  16.01909|  0.3578356|3.9868464|1.9940137|  1.008158|  1.027574|
|2019-01-01 04:00:00|Benevento| 1.174048741162791| 7.097469|  14.63753|-0.79321045|3.9885209|1.9904202| 1.0197421|0.09878564|
|2019-01-01 05:00:00|Benevento| 1.155459176434108| 6.371493| 13.753676| -1.5162152| 3.990195|1.9868367| 1.0256498|-0.6

Processing group: Salerno with 29862 data points
                                                                                

In [22]:
from pyspark.sql.functions import col
from pyspark.sql.functions import pow, sqrt
from prophet.diagnostics import performance_metrics


# Calculate MAE
mae_NO2 = spark_forecast_NO2.select(col("NOME"), abs(col("y") - col("yhat")).alias("error")) \
                    .groupBy("NOME") \
                    .agg({"error": "mean"}) \
                    .withColumnRenamed("avg(error)", "MAE")

# Calculate MSE
mse_NO2 = spark_forecast_NO2.select(col("NOME"), pow(col("y") - col("yhat"), 2).alias("error")) \
                    .groupBy("NOME") \
                    .agg({"error": "mean"}) \
                    .withColumnRenamed("avg(error)", "MSE")

# Calculate RMSE
rmse_NO2 = spark_forecast_NO2.select(col("NOME"), pow(col("y") - col("yhat"), 2).alias("error")) \
                     .groupBy("NOME") \
                     .agg({"error": "mean"}) \
                     .withColumnRenamed("avg(error)", "MSE") \
                     .withColumn("RMSE", sqrt(col("MSE")).alias("RMSE"))

# Show the results
mae_NO2.show()
mse_NO2.show()
rmse_NO2.show()

14:53:12 - cmdstanpy - INFO - Chain [1] start processing        (67 + 46) / 113]
14:53:14 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
14:53:23 - cmdstanpy - INFO - Chain [1] done processing
14:53:25 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points
Processing group: Salerno with 29862 data points
14:53:35 - cmdstanpy - INFO - Chain [1] start processing
14:53:36 - cmdstanpy - INFO - Chain [1] start processing
14:53:42 - cmdstanpy - INFO - Chain [1] done processing
14:53:45 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points                    (1 + 1) / 2]
14:53:58 - cmdstanpy - INFO - Chain [1] start processing
14:54:04 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
14:54:14 - cmdstanpy - INFO - Chain [1] start processing
14:54:21 - cmdstanpy - INFO - Chain [1] done processing
                                              

+---------+------------------+
|     NOME|               MAE|
+---------+------------------+
|Benevento|2.8688415107425516|
|  Salerno| 1.900808450808384|
| Avellino|2.2757902227786073|
|   Napoli| 10.40138608624887|
|  Caserta| 4.200234812582728|
+---------+------------------+



Processing group: Avellino with 29862 data points==>                (5 + 2) / 7]
Processing group: Benevento with 29862 data points
14:54:36 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
14:54:36 - cmdstanpy - INFO - Chain [1] start processing
14:54:43 - cmdstanpy - INFO - Chain [1] done processing
14:54:46 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
14:54:54 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
14:54:56 - cmdstanpy - INFO - Chain [1] start processing
14:54:59 - cmdstanpy - INFO - Chain [1] done processing
14:55:05 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
14:55:09 - cmdstanpy - INFO - Chain [1] start processing
14:55:17 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
  fig = plt.figure(facecolor='w', figsize=figsize)
                                                         

+---------+------------------+
|     NOME|               MSE|
+---------+------------------+
|Benevento|15.538966892580591|
|  Salerno| 6.658907596624998|
| Avellino| 10.26953190984369|
|   Napoli|185.98135580787326|
|  Caserta|30.660439967329978|
+---------+------------------+



Processing group: Benevento with 29862 data points=>                (5 + 2) / 7]
Processing group: Avellino with 29862 data points
14:55:31 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
14:55:31 - cmdstanpy - INFO - Chain [1] start processing
14:55:37 - cmdstanpy - INFO - Chain [1] done processing
14:55:39 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
14:55:47 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
14:55:50 - cmdstanpy - INFO - Chain [1] start processing
14:55:55 - cmdstanpy - INFO - Chain [1] done processing
14:56:00 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
14:56:06 - cmdstanpy - INFO - Chain [1] start processing
14:56:13 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]


+---------+------------------+------------------+
|     NOME|               MSE|              RMSE|
+---------+------------------+------------------+
|Benevento|15.538966892580591|3.9419496309035447|
|  Salerno| 6.658907596624998| 2.580485922578342|
| Avellino| 10.26953190984369|3.2046110387757967|
|   Napoli|185.98135580787326|13.637498150609343|
|  Caserta|30.660439967329978|  5.53718700852066|
+---------+------------------+------------------+



                                                                                

In [15]:
#For Long and Short Term Forecast


from pyspark.sql.functions import *
from time import time
import matplotlib.pyplot as plt
import os

data_NO2_prophet = data_NO2.groupBy("NOME", "original_date_time").agg(avg("c_NO2").alias("avg_c_NO2"))
#data_NO2_prophet.show()

data_NO2_prophet_u = data_NO2_prophet.withColumnRenamed("avg_c_NO2", "y").withColumnRenamed("original_date_time", "ds")

data_NO2_prophet = data_NO2_prophet_u.withColumn("ds", to_timestamp("ds", "yyyy-MM-dd HH:mm:ss")) \
                                       .withColumn("ds", from_utc_timestamp("ds", "Europe/Rome"))

#data_NO2_prophet.printSchema()

def train_and_forecast(group):
    
    print(f"Processing group: {group['NOME'].iloc[0]} with {len(group)} data points")
    
    group["ds"] = pd.to_datetime(group["ds"])

    m = Prophet(seasonality_mode="additive",
                    yearly_seasonality=True,
                    weekly_seasonality=True,
                    daily_seasonality=True,
                    interval_width=0.95)
    
    m.fit(group[group['ds'] <= cutoff_date])
    
    forecast = m.predict(group)
    
    path = '/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/NO2/'

    if not os.path.exists(path):
        os.makedirs(path)
    
    #try:
     #   fig1 = m.plot(forecast)
      #  fig1.suptitle(f"NO2 Concentration Forecast for {group['NOME'].iloc[0]}")
       # fig1.legend(['Actual', 'Forecasted'], loc='upper right')
        #fig1.subplots_adjust(top=0.9)
        
        #fig2 = m.plot_components(forecast)
        #fig2.suptitle(f"NO2 Concentration Component Forecast for {group['NOME'].iloc[0]}")
        #fig2.subplots_adjust(top=0.9)

        #fig1.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet/NO2/{group['NOME'].iloc[0]}_prophet_forecast.png")
        #fig2.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet/NO2/{group['NOME'].iloc[0]}_prophet_components.png")
    #except Exception as e:
     #   print(f"Error generating plots for group {group['NOME'].iloc[0]}: {str(e)}")
    
    
    f_pd = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper','trend', 'yearly', 'weekly', 'daily']].set_index("ds")
 
    group_pd = group[['ds', 'NOME', 'y']].set_index("ds")
    
    result_pd = f_pd.join( group_pd, how = "left")
    result_pd.reset_index(level=0, inplace=True)
    
    result_pd['NOME'] = group['NOME'].iloc[0]
    
    return result_pd[['ds', 'NOME', 'y', 'yhat', 'yhat_upper', 'yhat_lower','trend', 'yearly', 'weekly', 'daily']] 

result_schema =StructType([
    StructField('ds',TimestampType()),
    StructField('NOME', StringType()),
    StructField('y', DoubleType()),
    StructField('yhat',FloatType()),
    StructField('yhat_upper',FloatType()),
    StructField('yhat_lower',FloatType()),
    StructField('trend',FloatType()),
    StructField('yearly',FloatType()),
    StructField('weekly',FloatType()),
    StructField('daily',FloatType())
  ])

# create train and test data based on cutoff date
cutoff_date = "2021-06-30 23:00:00"

data_NO2_prophet_train = data_NO2_prophet.filter(col('ds') <= cutoff_date)
data_NO2_prophet_test = data_NO2_prophet.filter((col('ds') > cutoff_date))

max_date = data_NO2_prophet_test.agg(max('ds')).collect()[0][0]

period = data_NO2_prophet_test.count()

# Start time
start_time = time()
# Train and forecast by ticker 
spark_forecast_NO2 = data_NO2_prophet.groupBy("NOME").applyInPandas(train_and_forecast, schema=result_schema)
# Processing time
print('The time used for the Spark forecast is ', time()-start_time)

#spark_forecast_NO2.show()




The time used for the Spark forecast is  0.03347587585449219


                                                                                

In [16]:
from pyspark.sql.functions import col, pow, sqrt

# Define cutoff dates
cutoff_dates = {
    "5 Days": "2021-07-05 23:00:00",
    "30 Days": "2021-07-30 23:00:00",
    "90 Days": "2021-09-28 23:00:00",
    "180 Days": "2021-12-27 23:00:00",
    "365 Days": "2022-06-29 23:00:00"
}

# Define a list to store the evaluation results
results = []

# Compute evaluation metrics for each time period and NOME
for period, cutoff in cutoff_dates.items():
    for nome in spark_forecast_NO2.select("NOME").distinct().rdd.flatMap(lambda x: x).collect():
        # Filter the forecast data based on the cutoff date and NOME
        forecast_data_NO2 = spark_forecast_NO2.filter((col("ds") <= cutoff) & (col("NOME") == nome))

        # Calculate MAE
        mae = forecast_data_NO2.select(abs(col("y") - col("yhat")).alias("error")) \
                             .agg({"error": "mean"}) \
                             .withColumnRenamed("avg(error)", "MAE") \
                             .collect()[0]["MAE"]

        # Calculate MSE
        mse = forecast_data_NO2.select(pow(col("y") - col("yhat"), 2).alias("error")) \
                             .agg({"error": "mean"}) \
                             .withColumnRenamed("avg(error)", "MSE") \
                             .collect()[0]["MSE"]

        # Calculate RMSE
        rmse = forecast_data_NO2.select(pow(col("y") - col("yhat"), 2).alias("error")) \
                              .agg({"error": "mean"}) \
                              .withColumnRenamed("avg(error)", "MSE") \
                              .withColumn("RMSE", sqrt(col("MSE")).alias("RMSE")) \
                              .collect()[0]["RMSE"]

        # Calculate coverage
        coverage = forecast_data_NO2.filter((col("y") >= col("yhat_lower")) & (col("y") <= col("yhat_upper"))) \
                                  .count() / forecast_data_NO2.count() * 100

        # Add the results to the list
        results.append((period, nome, mse, rmse, mae, coverage))

# Create a Spark DataFrame from the results list
result_df_NO2 = spark.createDataFrame(results, ["Time", "NOME", "MSE", "RMSE", "MAE", "Coverage"])

# Show the results
result_df_NO2.show(100)

Processing group: Avellino with 29862 data points                   (0 + 2) / 2]
Processing group: Benevento with 29862 data points
14:09:07 - cmdstanpy - INFO - Chain [1] start processing
14:09:07 - cmdstanpy - INFO - Chain [1] start processing
14:09:13 - cmdstanpy - INFO - Chain [1] done processing
14:09:15 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
14:09:21 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
14:09:23 - cmdstanpy - INFO - Chain [1] start processing
14:09:27 - cmdstanpy - INFO - Chain [1] done processing
14:09:31 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
14:09:35 - cmdstanpy - INFO - Chain [1] start processing
14:09:43 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
14:10:17 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
1

14:31:14 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
14:31:20 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
14:31:22 - cmdstanpy - INFO - Chain [1] start processing
14:31:25 - cmdstanpy - INFO - Chain [1] done processing
14:31:29 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
14:31:33 - cmdstanpy - INFO - Chain [1] start processing
14:31:41 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Benevento with 29862 data points
14:32:19 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
14:32:19 - cmdstanpy - INFO - Chain [1] start processing
14:32:24 - cmdstanpy - INFO - Chain [1] done processing
14:32:28 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
14:32:32 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno w

14:42:08 - cmdstanpy - INFO - Chain [1] start processing
14:42:13 - cmdstanpy - INFO - Chain [1] done processing
14:42:16 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
14:42:21 - cmdstanpy - INFO - Chain [1] start processing
14:42:28 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
14:43:03 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
14:43:03 - cmdstanpy - INFO - Chain [1] start processing
14:43:09 - cmdstanpy - INFO - Chain [1] done processing
14:43:11 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
14:43:17 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
14:43:19 - cmdstanpy - INFO - Chain [1] start processing
14:43:22 - cmdstanpy - INFO - Chain [1] done processing
14:43:27 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Nap

14:53:19 - cmdstanpy - INFO - Chain [1] start processing
14:53:27 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
14:54:00 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
14:54:00 - cmdstanpy - INFO - Chain [1] start processing
14:54:06 - cmdstanpy - INFO - Chain [1] done processing
14:54:08 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
14:54:14 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
14:54:16 - cmdstanpy - INFO - Chain [1] start processing
14:54:21 - cmdstanpy - INFO - Chain [1] done processing
14:54:24 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
14:54:29 - cmdstanpy - INFO - Chain [1] start processing
14:54:36 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
14

15:15:48 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
15:15:48 - cmdstanpy - INFO - Chain [1] start processing
15:15:54 - cmdstanpy - INFO - Chain [1] done processing
15:15:56 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
15:16:02 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
15:16:04 - cmdstanpy - INFO - Chain [1] start processing
15:16:08 - cmdstanpy - INFO - Chain [1] done processing
15:16:13 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
15:16:17 - cmdstanpy - INFO - Chain [1] start processing
15:16:24 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
15:17:02 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
15:17:02 - cmdstanpy - INFO - Chain [1] start processing
15:17:08 - cmdstanpy - INFO - Chain [1] done process

15:27:03 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
Processing group: Salerno with 29862 data points
15:27:10 - cmdstanpy - INFO - Chain [1] start processing
15:27:11 - cmdstanpy - INFO - Chain [1] start processing
15:27:15 - cmdstanpy - INFO - Chain [1] done processing
15:27:19 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
15:27:23 - cmdstanpy - INFO - Chain [1] start processing
15:27:31 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
15:28:09 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
15:28:09 - cmdstanpy - INFO - Chain [1] start processing
15:28:14 - cmdstanpy - INFO - Chain [1] done processing
15:28:18 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
15:28:22 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno wi

15:37:59 - cmdstanpy - INFO - Chain [1] start processing
15:38:06 - cmdstanpy - INFO - Chain [1] done processing
15:38:09 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
15:38:14 - cmdstanpy - INFO - Chain [1] start processing
15:38:21 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
15:38:59 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
15:38:59 - cmdstanpy - INFO - Chain [1] start processing
15:39:04 - cmdstanpy - INFO - Chain [1] done processing
15:39:07 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
15:39:12 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
15:39:15 - cmdstanpy - INFO - Chain [1] start processing
15:39:18 - cmdstanpy - INFO - Chain [1] done processing
15:39:23 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Nap

15:49:13 - cmdstanpy - INFO - Chain [1] start processing
15:49:21 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
15:49:58 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
15:49:58 - cmdstanpy - INFO - Chain [1] start processing
15:50:03 - cmdstanpy - INFO - Chain [1] done processing
15:50:05 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
15:50:11 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
15:50:12 - cmdstanpy - INFO - Chain [1] start processing
15:50:18 - cmdstanpy - INFO - Chain [1] done processing
15:50:21 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
15:50:26 - cmdstanpy - INFO - Chain [1] start processing
15:50:33 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
15

16:11:55 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
16:11:55 - cmdstanpy - INFO - Chain [1] start processing
16:12:01 - cmdstanpy - INFO - Chain [1] done processing
16:12:04 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
16:12:09 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
16:12:12 - cmdstanpy - INFO - Chain [1] start processing
16:12:16 - cmdstanpy - INFO - Chain [1] done processing
16:12:20 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
16:12:24 - cmdstanpy - INFO - Chain [1] start processing
16:12:31 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
16:13:10 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
16:13:10 - cmdstanpy - INFO - Chain [1] start processing
16:13:16 - cmdstanpy - INFO - Chain [1] done process

16:22:59 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
16:23:04 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
16:23:07 - cmdstanpy - INFO - Chain [1] start processing
16:23:11 - cmdstanpy - INFO - Chain [1] done processing
16:23:15 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
16:23:19 - cmdstanpy - INFO - Chain [1] start processing
16:23:27 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
16:24:05 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
16:24:05 - cmdstanpy - INFO - Chain [1] start processing
16:24:10 - cmdstanpy - INFO - Chain [1] done processing
16:24:11 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
16:24:17 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno wi

16:34:07 - cmdstanpy - INFO - Chain [1] start processing
16:34:12 - cmdstanpy - INFO - Chain [1] done processing
16:34:16 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
16:34:20 - cmdstanpy - INFO - Chain [1] start processing
16:34:28 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points
16:35:05 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
16:35:05 - cmdstanpy - INFO - Chain [1] start processing
16:35:11 - cmdstanpy - INFO - Chain [1] done processing
16:35:14 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
16:35:19 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
16:35:22 - cmdstanpy - INFO - Chain [1] start processing
16:35:25 - cmdstanpy - INFO - Chain [1] done processing
16:35:29 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Nap

16:45:08 - cmdstanpy - INFO - Chain [1] start processing
16:45:16 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Benevento with 29862 data points
16:45:52 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
16:45:52 - cmdstanpy - INFO - Chain [1] start processing
16:45:58 - cmdstanpy - INFO - Chain [1] done processing
16:46:00 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
16:46:06 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
16:46:08 - cmdstanpy - INFO - Chain [1] start processing
16:46:14 - cmdstanpy - INFO - Chain [1] done processing
16:46:17 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
16:46:22 - cmdstanpy - INFO - Chain [1] start processing
16:46:29 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
                                                   

+--------+---------+------------------+------------------+------------------+-----------------+
|    Time|     NOME|               MSE|              RMSE|               MAE|         Coverage|
+--------+---------+------------------+------------------+------------------+-----------------+
|  5 Days|Benevento|15.883797523857158|  3.98544822119886|2.7857546306761476|94.78544776119404|
|  5 Days|  Salerno| 6.468074292705023|2.5432409033957093|1.8025959243476621|93.96455223880596|
|  5 Days| Avellino|10.631426313586132|3.2605868050990656| 2.197703992235896| 94.6035447761194|
|  5 Days|   Napoli|146.63395822540443|12.109250935768257| 9.102701454176271|93.51212686567165|
|  5 Days|  Caserta| 30.39538727545387|5.5132011822038445| 4.069779938903085| 94.2723880597015|
| 30 Days|Benevento|15.676834384240577| 3.959398235116111|2.7743112200776254|94.93194192377496|
| 30 Days|  Salerno| 6.452182863755967|2.5401147343685024| 1.805778253650868|94.00635208711434|
| 30 Days| Avellino| 10.47240669850752|3

In [None]:
result_df_NO2_pd = result_df_NO2.pandas_api()
result_df_NO2_pd

O3 

In [23]:
from pyspark.sql.functions import *
from time import time
import matplotlib.pyplot as plt
import os

data_O3_prophet = data_O3.groupBy("NOME", "original_date_time").agg(avg("c_O3").alias("avg_c_O3"))
data_O3_prophet.show()

data_O3_prophet_u = data_O3_prophet.withColumnRenamed("avg_c_O3", "y").withColumnRenamed("original_date_time", "ds")

data_O3_prophet = data_O3_prophet_u.withColumn("ds", to_timestamp("ds", "yyyy-MM-dd HH:mm:ss")) \
                                       .withColumn("ds", from_utc_timestamp("ds", "Europe/Rome"))

data_O3_prophet.printSchema()

def train_and_forecast(group):
    
    print(f"Processing group: {group['NOME'].iloc[0]} with {len(group)} data points")
    
    group["ds"] = pd.to_datetime(group["ds"])

    m = Prophet(seasonality_mode="additive",
                    yearly_seasonality=True,
                    weekly_seasonality=True,
                    daily_seasonality=True,
                    interval_width=0.95)
    
    m.fit(group[group['ds'] <= cutoff_date])
    
    forecast = m.predict(group)
    
    path = '/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/O3/'

    if not os.path.exists(path):
        os.makedirs(path)
    
    try:
        fig1 = m.plot(forecast)
        fig1.suptitle(f"O3 Concentration Forecast for {group['NOME'].iloc[0]}")
        fig1.legend(['Actual', 'Forecasted'], loc='upper right')
        fig1.subplots_adjust(top=0.9)
        
        fig2 = m.plot_components(forecast)
        fig2.suptitle(f"O3 Concentration Component Forecast for {group['NOME'].iloc[0]}")
        fig2.subplots_adjust(top=0.9)

        fig1.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/O3/{group['NOME'].iloc[0]}_prophet_forecast.png")
        fig2.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/O3/{group['NOME'].iloc[0]}_prophet_components.png")
    except Exception as e:
        print(f"Error generating plots for group {group['NOME'].iloc[0]}: {str(e)}")
    
    
    f_pd = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper','trend', 'yearly', 'weekly', 'daily']].set_index("ds")
 
    group_pd = group[['ds', 'NOME', 'y']].set_index("ds")
    
    result_pd = f_pd.join( group_pd, how = "left")
    result_pd.reset_index(level=0, inplace=True)
    
    result_pd['NOME'] = group['NOME'].iloc[0]
    
    return result_pd[['ds', 'NOME', 'y', 'yhat', 'yhat_upper', 'yhat_lower','trend', 'yearly', 'weekly', 'daily']] 

result_schema =StructType([
    StructField('ds',TimestampType()),
    StructField('NOME', StringType()),
    StructField('y', DoubleType()),
    StructField('yhat',FloatType()),
    StructField('yhat_upper',FloatType()),
    StructField('yhat_lower',FloatType()),
    StructField('trend',FloatType()),
    StructField('yearly',FloatType()),
    StructField('weekly',FloatType()),
    StructField('daily',FloatType())
  ])


# create train and test data based on cutoff date
cutoff_date = "2021-06-30 23:00:00"

data_O3_prophet_train = data_O3_prophet.filter(col('ds') <= cutoff_date)
data_O3_prophet_test = data_O3_prophet.filter((col('ds') > cutoff_date))

max_date = data_O3_prophet_test.agg(max('ds')).collect()[0][0]

period = data_O3_prophet_test.count()

# Start time
start_time = time()
# Train and forecast by ticker 
spark_forecast_O3 = data_O3_prophet.groupBy("NOME").applyInPandas(train_and_forecast, schema=result_schema)
# Processing time
print('The time used for the Spark forecast is ', time()-start_time)

spark_forecast_O3.show()


                                                                                

+---------+-------------------+------------------+
|     NOME| original_date_time|          avg_c_O3|
+---------+-------------------+------------------+
|   Napoli|2022-01-24 05:00:00|61.108973219178104|
|  Salerno|2022-01-24 05:00:00|58.490383118589726|
| Avellino|2021-05-21 15:00:00|120.98329703468211|
|  Caserta|2019-01-24 20:00:00|48.119171763636366|
| Avellino|2019-01-24 01:00:00| 53.53818496531796|
|  Salerno|2021-11-01 00:00:00| 71.46073684615389|
|  Salerno|2019-02-01 19:00:00|57.773951512820545|
|Benevento|2022-01-08 13:00:00| 62.87799679844961|
|   Napoli|2021-06-11 17:00:00|114.55085634246574|
|   Napoli|2021-12-06 16:00:00| 46.85498116438357|
|  Salerno|2021-12-30 20:00:00| 55.20029793269234|
| Avellino|2019-11-03 04:00:00| 71.69187458959537|
|  Caserta|2019-02-01 05:00:00| 49.26924827272726|
| Avellino|2021-11-01 02:00:00| 62.49715756647398|
|Benevento|2020-11-21 02:00:00| 51.80687717829457|
|Benevento|2020-11-21 06:00:00|52.173485038759694|
| Avellino|2021-12-30 12:00:00|

                                                                                

The time used for the Spark forecast is  0.0359349250793457


14:56:33 - cmdstanpy - INFO - Chain [1] start processing            (0 + 1) / 1]
14:56:43 - cmdstanpy - INFO - Chain [1] done processing


+-------------------+---------+------------------+---------+----------+----------+---------+----------+----------+------------+
|                 ds|     NOME|                 y|     yhat|yhat_upper|yhat_lower|    trend|    yearly|    weekly|       daily|
+-------------------+---------+------------------+---------+----------+----------+---------+----------+----------+------------+
|2019-01-01 01:00:00|Benevento| 60.72303379069767|37.707584| 58.719906| 17.391811| 71.69005|-23.926624|-1.5209005|   -8.534935|
|2019-01-01 02:00:00|Benevento| 60.46292085271317|38.205368|   58.1786| 17.201944|  71.6827|-23.932486|-1.5605963|   -7.984248|
|2019-01-01 03:00:00|Benevento|59.605359007751936|38.686245| 59.886116|    18.397|71.675354|-23.938345|-1.5957298|   -7.455035|
|2019-01-01 04:00:00|Benevento|58.356536356589125|38.916092| 58.890896| 18.012447| 71.66801|-23.944199|    -1.626|  -7.1817174|
|2019-01-01 05:00:00|Benevento| 57.46576033333333| 38.90148| 58.458042|  18.00571| 71.66066| -23.95005|-

Processing group: Salerno with 29862 data points
                                                                                

In [25]:
from pyspark.sql.functions import col
from pyspark.sql.functions import pow, sqrt
from prophet.diagnostics import performance_metrics


# Calculate MAE
mae_O3 = spark_forecast_O3.select(col("NOME"), abs(col("y") - col("yhat")).alias("error")) \
                    .groupBy("NOME") \
                    .agg({"error": "mean"}) \
                    .withColumnRenamed("avg(error)", "MAE")


# Calculate MSE
mse_O3 = spark_forecast_O3.select(col("NOME"), pow(col("y") - col("yhat"), 2).alias("error")) \
                    .groupBy("NOME") \
                    .agg({"error": "mean"}) \
                    .withColumnRenamed("avg(error)", "MSE")

# Calculate RMSE
rmse_O3 = spark_forecast_O3.select(col("NOME"), pow(col("y") - col("yhat"), 2).alias("error")) \
                     .groupBy("NOME") \
                     .agg({"error": "mean"}) \
                     .withColumnRenamed("avg(error)", "MSE") \
                     .withColumn("RMSE", sqrt(col("MSE")).alias("RMSE"))

# Show the results
mae_O3.show()
mse_O3.show()
rmse_O3.show()

Processing group: Avellino with 29862 data points                   (0 + 2) / 2]
Processing group: Benevento with 29862 data points
15:14:45 - cmdstanpy - INFO - Chain [1] start processing
15:14:45 - cmdstanpy - INFO - Chain [1] start processing
15:14:56 - cmdstanpy - INFO - Chain [1] done processing
15:14:57 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
15:15:13 - cmdstanpy - INFO - Chain [1] start processing
15:15:13 - cmdstanpy - INFO - Chain [1] start processing
15:15:22 - cmdstanpy - INFO - Chain [1] done processing
15:15:23 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (0 + 2) / 2]
15:15:32 - cmdstanpy - INFO - Chain [1] start processing
15:15:40 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
                                                                                

+---------+------------------+
|     NOME|               MAE|
+---------+------------------+
|Benevento| 9.017186850638417|
|  Salerno| 7.743955757389471|
| Avellino| 8.650666892810463|
|   Napoli|12.753057089825983|
|  Caserta|  9.95513096688605|
+---------+------------------+



Processing group: Benevento with 29862 data points====>         (93 + 20) / 113]
Processing group: Avellino with 29862 data points
15:15:53 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
15:15:53 - cmdstanpy - INFO - Chain [1] start processing
15:16:03 - cmdstanpy - INFO - Chain [1] done processing
15:16:04 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
15:16:13 - cmdstanpy - INFO - Chain [1] start processing
15:16:14 - cmdstanpy - INFO - Chain [1] start processing
15:16:25 - cmdstanpy - INFO - Chain [1] done processing
15:16:26 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
15:16:34 - cmdstanpy - INFO - Chain [1] start processing
15:16:43 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
                                                                                

+---------+------------------+
|     NOME|               MSE|
+---------+------------------+
|Benevento|139.62034911169712|
|  Salerno| 99.46340991568502|
| Avellino|126.06034415777752|
|   Napoli|260.37679754097036|
|  Caserta| 166.0431161258783|
+---------+------------------+



Processing group: Avellino with 29862 data points
15:16:55 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
15:16:55 - cmdstanpy - INFO - Chain [1] start processing
15:17:08 - cmdstanpy - INFO - Chain [1] done processing
15:17:09 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
15:17:18 - cmdstanpy - INFO - Chain [1] start processing
15:17:18 - cmdstanpy - INFO - Chain [1] start processing
15:17:29 - cmdstanpy - INFO - Chain [1] done processing
15:17:31 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
15:17:40 - cmdstanpy - INFO - Chain [1] start processing
15:17:48 - cmdstanpy - INFO - Chain [1] done processing


+---------+------------------+------------------+
|     NOME|               MSE|              RMSE|
+---------+------------------+------------------+
|Benevento|139.62034911169712|11.816105496808039|
|  Salerno| 99.46340991568502|  9.97313440778199|
| Avellino|126.06034415777752|11.227659780995214|
|   Napoli|260.37679754097036| 16.13619526223485|
|  Caserta| 166.0431161258783|12.885771848278173|
+---------+------------------+------------------+



                                                                                

In [7]:
#For Long and Short Term Forecast

from pyspark.sql.functions import *
from time import time
import matplotlib.pyplot as plt
import os

data_O3_prophet = data_O3.groupBy("NOME", "original_date_time").agg(avg("c_O3").alias("avg_c_O3"))
#data_O3_prophet.show()

data_O3_prophet_u = data_O3_prophet.withColumnRenamed("avg_c_O3", "y").withColumnRenamed("original_date_time", "ds")

data_O3_prophet = data_O3_prophet_u.withColumn("ds", to_timestamp("ds", "yyyy-MM-dd HH:mm:ss")) \
                                       .withColumn("ds", from_utc_timestamp("ds", "Europe/Rome"))

#data_O3_prophet.printSchema()

def train_and_forecast(group):
    
    print(f"Processing group: {group['NOME'].iloc[0]} with {len(group)} data points")
    
    group["ds"] = pd.to_datetime(group["ds"])

    m = Prophet(seasonality_mode="additive",
                    yearly_seasonality=True,
                    weekly_seasonality=True,
                    daily_seasonality=True,
                    interval_width=0.95)
    
    m.fit(group[group['ds'] <= cutoff_date])
    
    forecast = m.predict(group)
    
    path = '/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet_campania/O3/'

    if not os.path.exists(path):
        os.makedirs(path)
    
   # try:
    #    fig1 = m.plot(forecast)
     #   fig1.suptitle(f"O3 Concentration Forecast for {group['NOME'].iloc[0]}")
     #   fig1.legend(['Actual', 'Forecasted'], loc='upper right')
      #  fig1.subplots_adjust(top=0.9)
        
       # fig2 = m.plot_components(forecast)
       # fig2.suptitle(f"O3 Concentration Component Forecast for {group['NOME'].iloc[0]}")
        #fig2.subplots_adjust(top=0.9)

       # fig1.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet/O3/{group['NOME'].iloc[0]}_prophet_forecast.png")
        #fig2.savefig(f"/afs/enea.it/por/user/nafis/PFS/por/Thesis/codes/prophet/O3/{group['NOME'].iloc[0]}_prophet_components.png")
    #except Exception as e:
     #   print(f"Error generating plots for group {group['NOME'].iloc[0]}: {str(e)}")
    
    
    f_pd = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper','trend', 'yearly', 'weekly', 'daily']].set_index("ds")
 
    group_pd = group[['ds', 'NOME', 'y']].set_index("ds")
    
    result_pd = f_pd.join( group_pd, how = "left")
    result_pd.reset_index(level=0, inplace=True)
    
    result_pd['NOME'] = group['NOME'].iloc[0]
    
    return result_pd[['ds', 'NOME', 'y', 'yhat', 'yhat_upper', 'yhat_lower','trend', 'yearly', 'weekly', 'daily']] 

result_schema =StructType([
    StructField('ds',TimestampType()),
    StructField('NOME', StringType()),
    StructField('y', DoubleType()),
    StructField('yhat',FloatType()),
    StructField('yhat_upper',FloatType()),
    StructField('yhat_lower',FloatType()),
    StructField('trend',FloatType()),
    StructField('yearly',FloatType()),
    StructField('weekly',FloatType()),
    StructField('daily',FloatType())
  ])


# create train and test data based on cutoff date
cutoff_date = "2021-06-30 23:00:00"

data_O3_prophet_train = data_O3_prophet.filter(col('ds') <= cutoff_date)
data_O3_prophet_test = data_O3_prophet.filter((col('ds') > cutoff_date))

max_date = data_O3_prophet_test.agg(max('ds')).collect()[0][0]

period = data_O3_prophet_test.count()

# Start time
start_time = time()
# Train and forecast by ticker 
spark_forecast_O3 = data_O3_prophet.groupBy("NOME").applyInPandas(train_and_forecast, schema=result_schema)
# Processing time
print('The time used for the Spark forecast is ', time()-start_time)

#spark_forecast_O3.show()


                                                                                

The time used for the Spark forecast is  0.45984888076782227


In [8]:
from pyspark.sql.functions import col, pow, sqrt

# Define cutoff dates
cutoff_dates = {
    "5 Days": "2021-07-05 23:00:00",
    "30 Days": "2021-07-30 23:00:00",
    "90 Days": "2021-09-28 23:00:00",
    "180 Days": "2021-12-27 23:00:00",
    "365 Days": "2022-06-29 23:00:00"
}

# Define a list to store the evaluation results
results = []

# Compute evaluation metrics for each time period and NOME
for period, cutoff in cutoff_dates.items():
    for nome in spark_forecast_O3.select("NOME").distinct().rdd.flatMap(lambda x: x).collect():
        # Filter the forecast data based on the cutoff date and NOME
        forecast_data_O3 = spark_forecast_O3.filter((col("ds") <= cutoff) & (col("NOME") == nome))

        # Calculate MAE
        mae = forecast_data_O3.select(abs(col("y") - col("yhat")).alias("error")) \
                             .agg({"error": "mean"}) \
                             .withColumnRenamed("avg(error)", "MAE") \
                             .collect()[0]["MAE"]

        # Calculate MSE
        mse = forecast_data_O3.select(pow(col("y") - col("yhat"), 2).alias("error")) \
                             .agg({"error": "mean"}) \
                             .withColumnRenamed("avg(error)", "MSE") \
                             .collect()[0]["MSE"]

        # Calculate RMSE
        rmse = forecast_data_O3.select(pow(col("y") - col("yhat"), 2).alias("error")) \
                              .agg({"error": "mean"}) \
                              .withColumnRenamed("avg(error)", "MSE") \
                              .withColumn("RMSE", sqrt(col("MSE")).alias("RMSE")) \
                              .collect()[0]["RMSE"]

        # Calculate coverage
        coverage = forecast_data_O3.filter((col("y") >= col("yhat_lower")) & (col("y") <= col("yhat_upper"))) \
                                  .count() / forecast_data_O3.count() * 100

        # Add the results to the list
        results.append((period, nome, mse, rmse, mae, coverage))

# Create a Spark DataFrame from the results list
result_df_O3 = spark.createDataFrame(results, ["Time", "NOME", "MSE", "RMSE", "MAE", "Coverage"])

# Show the results
result_df_O3.show(100)


Processing group: Avellino with 29862 data points                   (0 + 2) / 2]
Processing group: Benevento with 29862 data points
22:42:07 - cmdstanpy - INFO - Chain [1] start processing
22:42:07 - cmdstanpy - INFO - Chain [1] start processing
22:42:20 - cmdstanpy - INFO - Chain [1] done processing
22:42:23 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
22:42:29 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
22:42:31 - cmdstanpy - INFO - Chain [1] start processing
22:42:42 - cmdstanpy - INFO - Chain [1] done processing
22:42:44 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
22:42:50 - cmdstanpy - INFO - Chain [1] start processing
22:43:00 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Benevento with 29862 data points                  (0 + 2) / 2]
Processing group: Avellino with 29862 data points
2

23:04:18 - cmdstanpy - INFO - Chain [1] done processing
23:04:20 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
23:04:26 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
23:04:28 - cmdstanpy - INFO - Chain [1] start processing
23:04:39 - cmdstanpy - INFO - Chain [1] done processing
23:04:41 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
23:04:49 - cmdstanpy - INFO - Chain [1] start processing
23:04:57 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points                   (0 + 2) / 2]
Processing group: Benevento with 29862 data points
23:07:18 - cmdstanpy - INFO - Chain [1] start processing
23:07:18 - cmdstanpy - INFO - Chain [1] start processing
23:07:29 - cmdstanpy - INFO - Chain [1] done processing
23:07:30 - cmdstanpy - INFO - Chain [1] done processing
Processing g

Processing group: Caserta with 29862 data points
23:25:04 - cmdstanpy - INFO - Chain [1] start processing
23:25:14 - cmdstanpy - INFO - Chain [1] done processing
23:25:16 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
23:25:25 - cmdstanpy - INFO - Chain [1] start processing
23:25:32 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Benevento with 29862 data points                  (0 + 2) / 2]
Processing group: Avellino with 29862 data points
23:28:25 - cmdstanpy - INFO - Chain [1] start processing
23:28:25 - cmdstanpy - INFO - Chain [1] start processing
23:28:38 - cmdstanpy - INFO - Chain [1] done processing
23:28:41 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
23:28:46 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
23:28:50 - cmdstanpy - INFO - Chain [1] start processing
23:28:59 - cmdstanp

23:49:21 - cmdstanpy - INFO - Chain [1] start processing            (1 + 1) / 2]
23:49:39 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Benevento with 29862 data points=>                (5 + 2) / 7]
Processing group: Avellino with 29862 data points
23:50:18 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
23:50:18 - cmdstanpy - INFO - Chain [1] start processing
23:50:31 - cmdstanpy - INFO - Chain [1] done processing
23:50:32 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
23:50:39 - cmdstanpy - INFO - Chain [1] start processing
23:50:41 - cmdstanpy - INFO - Chain [1] start processing
23:50:54 - cmdstanpy - INFO - Chain [1] done processing
23:50:55 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
23:51:02 - cmdstanpy - INFO - Chain [1] start processing            (1 + 1) / 2]
23:51:11 - cmdstanpy - INFO -

00:30:32 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
00:30:40 - cmdstanpy - INFO - Chain [1] start processing
00:30:42 - cmdstanpy - INFO - Chain [1] start processing
00:30:56 - cmdstanpy - INFO - Chain [1] done processing
00:30:58 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
00:31:05 - cmdstanpy - INFO - Chain [1] start processing            (1 + 1) / 2]
00:31:16 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Benevento with 29862 data points
00:31:49 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
00:31:49 - cmdstanpy - INFO - Chain [1] start processing
00:32:02 - cmdstanpy - INFO - Chain [1] done processing
00:32:04 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
00:32:10 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta wi

Processing group: Napoli with 29862 data points
00:55:19 - cmdstanpy - INFO - Chain [1] start processing            (1 + 1) / 2]
00:55:28 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Benevento with 29862 data points                  (0 + 2) / 2]
Processing group: Avellino with 29862 data points
00:57:16 - cmdstanpy - INFO - Chain [1] start processing
00:57:16 - cmdstanpy - INFO - Chain [1] start processing
00:57:29 - cmdstanpy - INFO - Chain [1] done processing
00:57:32 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
00:57:38 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
00:57:40 - cmdstanpy - INFO - Chain [1] start processing
00:57:50 - cmdstanpy - INFO - Chain [1] done processing
00:57:51 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
00:57:59 - cmdstanpy - INFO - Chain [1] start process

01:30:57 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
01:30:57 - cmdstanpy - INFO - Chain [1] start processing
01:31:09 - cmdstanpy - INFO - Chain [1] done processing
01:31:09 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
01:31:17 - cmdstanpy - INFO - Chain [1] start processing
01:31:17 - cmdstanpy - INFO - Chain [1] start processing
01:31:31 - cmdstanpy - INFO - Chain [1] done processing
01:31:31 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
01:31:41 - cmdstanpy - INFO - Chain [1] start processing            (1 + 1) / 2]
01:32:07 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points
01:32:45 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
01:32:45 - cmdstanpy - INFO - Chain [1] start processing
01:32:54 - cmdstanpy - INFO - Chain [1] done processi

Processing group: Salerno with 29862 data points
01:51:48 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
01:51:50 - cmdstanpy - INFO - Chain [1] start processing
01:52:02 - cmdstanpy - INFO - Chain [1] done processing
01:52:03 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
01:52:12 - cmdstanpy - INFO - Chain [1] start processing
01:52:20 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Benevento with 29862 data points
01:52:36 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
01:52:36 - cmdstanpy - INFO - Chain [1] start processing
01:52:48 - cmdstanpy - INFO - Chain [1] done processing
01:52:50 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
01:52:59 - cmdstanpy - INFO - Chain [1] start processing
01:53:01 - cmdstanpy - INF

02:10:02 - cmdstanpy - INFO - Chain [1] start processing
02:10:12 - cmdstanpy - INFO - Chain [1] done processing
02:10:14 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points                     (0 + 2) / 2]
02:10:21 - cmdstanpy - INFO - Chain [1] start processing            (1 + 1) / 2]
02:10:30 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points
02:10:50 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
02:10:50 - cmdstanpy - INFO - Chain [1] start processing
02:11:02 - cmdstanpy - INFO - Chain [1] done processing
02:11:04 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
Processing group: Caserta with 29862 data points
02:11:11 - cmdstanpy - INFO - Chain [1] start processing
02:11:12 - cmdstanpy - INFO - Chain [1] start processing
02:11:25 - cmdstanpy - INFO - Chain [1] done processing
02:11:25 - cmdstanpy - INFO - Chain [1] done 

Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
02:25:48 - cmdstanpy - INFO - Chain [1] start processing
02:25:57 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Avellino with 29862 data points                   (0 + 2) / 2]
Processing group: Benevento with 29862 data points
02:28:23 - cmdstanpy - INFO - Chain [1] start processing
02:28:23 - cmdstanpy - INFO - Chain [1] start processing
02:28:35 - cmdstanpy - INFO - Chain [1] done processing
02:28:38 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Salerno with 29862 data points
02:28:43 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
02:28:46 - cmdstanpy - INFO - Chain [1] start processing
02:28:54 - cmdstanpy - INFO - Chain [1] done processing
02:28:59 - cmdstanpy - INFO - Chain [1] done processing             (0 + 2) / 2]
Processing group: Napoli with 29862 data points                     (1 + 1) / 2]
02:29:07 - cmdstan

Processing group: Avellino with 29862 data points
02:44:05 - cmdstanpy - INFO - Chain [1] start processing            (0 + 2) / 2]
02:44:05 - cmdstanpy - INFO - Chain [1] start processing
02:44:15 - cmdstanpy - INFO - Chain [1] done processing
02:44:17 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Caserta with 29862 data points
02:44:23 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Salerno with 29862 data points
02:44:26 - cmdstanpy - INFO - Chain [1] start processing
02:44:35 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
02:44:45 - cmdstanpy - INFO - Chain [1] start processing
02:44:54 - cmdstanpy - INFO - Chain [1] done processing
02:45:01 - cmdstanpy - INFO - Chain [1] done processing             (1 + 1) / 2]
Processing group: Avellino with 29862 data points===>               (5 + 2) / 7]
Processing group: Benevento with 29862 data points
02:45:38 - cmdstanpy - INFO - Chain [1] start processing  

Processing group: Salerno with 29862 data points
03:23:10 - cmdstanpy - INFO - Chain [1] start processing
Processing group: Caserta with 29862 data points
03:23:15 - cmdstanpy - INFO - Chain [1] start processing
03:23:39 - cmdstanpy - INFO - Chain [1] done processing
03:23:39 - cmdstanpy - INFO - Chain [1] done processing
Processing group: Napoli with 29862 data points
03:23:47 - cmdstanpy - INFO - Chain [1] start processing            (1 + 1) / 2]
03:23:58 - cmdstanpy - INFO - Chain [1] done processing
                                                                                

+--------+---------+------------------+------------------+------------------+-----------------+
|    Time|     NOME|               MSE|              RMSE|               MAE|         Coverage|
+--------+---------+------------------+------------------+------------------+-----------------+
|  5 Days|Benevento|111.49921207060365|10.559318731367268| 8.038409436404741| 95.0839552238806|
|  5 Days|  Salerno| 76.41245686159458| 8.741421901589842| 6.888117479069134|94.83208955223881|
|  5 Days| Avellino|102.54056883916843|10.126231719606679| 7.846251306090871|94.71548507462687|
|  5 Days|   Napoli|213.59893091249313|14.615024150253504|11.477565958623332|94.65951492537313|
|  5 Days|  Caserta|  115.858126940655|10.763741307772824| 8.402822909684414|95.01865671641792|
| 30 Days|Benevento|118.05055569997076|10.865107256717293| 8.233870490527094| 94.4010889292196|
| 30 Days|  Salerno| 80.15540987314985| 8.952955370890098| 7.017278227831938|94.25589836660617|
| 30 Days| Avellino|110.96821968637022|1

In [9]:
spark.stop()