In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pyproj

# Create a SparkSession
spark = SparkSession.builder.appName("PM25").config("spark.driver.memory", "16g").config("spark.executor.memory", "8g").config("spark.task.maxFailures", "10").config("spark.executor.instances", "16").config("spark.driver.localDir", "/afs/enea.it/por/user/nafis/PFS/tmp").config("spark.executor.localDir", "/afs/enea.it/por/user/nafis/PFS/tmp").config("spark.local.dir", "/afs/enea.it/por/user/nafis/PFS/tmp").getOrCreate()


# Load the data from the CSV file

schema = StructType(
   [StructField('x', DoubleType(), True),
    StructField('y', DoubleType(), True),
    StructField('z', DoubleType(), True),
    StructField('time', TimestampType(), True),
    StructField('c_PM25', DoubleType(), True),
    StructField('c_PM10', DoubleType(), True),
    StructField('c_O3', DoubleType(), True),
    StructField('c_NO2', DoubleType(), True),
   ]
  )

df = spark.read.format("csv").load("../dataset/csv/", header=True, schema=schema)


# Select the columns in the desired order
df = df.select( 'x', 'y', 'z', 'time', 'c_PM25', 'c_PM10', 'c_O3', 'c_NO2')

#df.show()

df = df.withColumn("x", df["x"].cast(DoubleType())) \
    .withColumn("y", df["y"].cast(DoubleType())) \
    .withColumn("z", df["z"].cast(DoubleType())) \
    .withColumn("time", df["time"].cast(TimestampType())) \
    .withColumn("c_PM25", df["c_PM25"].cast(DoubleType())) \
    .withColumn("c_PM10", df["c_PM10"].cast(DoubleType())) \
    .withColumn("c_O3", df["c_O3"].cast(DoubleType())) \
    .withColumn("c_NO2", df["c_NO2"].cast(DoubleType())) \

df.printSchema()

df = df.withColumnRenamed("c_PM25", "c_PM2_5")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/07 18:58:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/07 18:58:51 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/05/07 18:58:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
root
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- c_PM25: double (nullable = true)
 |-- c_PM10: double (nullable = true)
 |-- c_O3: double (nullable = true)
 |-- c_NO2: double (nullable = true)



In [2]:
#Lombardia

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pyproj

# Create a SparkSession
spark = SparkSession.builder.appName("PM25").config('spark.sql.shuffle.partitions',500).config('spark.driver.maxResultSize', '10G') .config("spark.driver.memory", "32g").config("spark.executor.memory", "16g").config("spark.task.maxFailures", "10").config("spark.executor.instances", "16").config("spark.local.dir", "/afs/enea.it/por/user/nafis/PFS/tmp").getOrCreate()


# Load the data from the CSV file

schema = StructType(
   [StructField('x', DoubleType(), True),
    StructField('y', DoubleType(), True),
    StructField('z', DoubleType(), True),
    StructField('time', TimestampType(), True),
    StructField('c_PM25', DoubleType(), True),
    StructField('c_PM10', DoubleType(), True),
    StructField('c_O3', DoubleType(), True),
    StructField('c_NO2', DoubleType(), True),
    StructField('geometry', StringType(), True),
    StructField('index_right', IntegerType(), True),
    StructField('COD_RIP', IntegerType(), True),
    StructField('COD_REG', IntegerType(), True),
    StructField('DEN_REG', StringType(), True),
    StructField('Shape_Leng', DoubleType(), True),
    StructField('Shape_Area', DoubleType(), True),
   ]
  )

df = spark.read.format("csv").load("/afs/enea.it/por/user/nafis/PFS/tmp/nafi/data_chunk/lombardia/", header=True, schema=schema)

# Select the columns in the desired order
df = df.select( 'x', 'y', 'z', 'time', 'c_PM25', 'c_PM10', 'c_O3', 'c_NO2', 'geometry', 'DEN_REG')


df = df.withColumn("x", df["x"].cast(DoubleType())) \
    .withColumn("y", df["y"].cast(DoubleType())) \
    .withColumn("z", df["z"].cast(DoubleType())) \
    .withColumn("time", df["time"].cast(TimestampType())) \
    .withColumn("c_PM25", df["c_PM25"].cast(DoubleType())) \
    .withColumn("c_PM10", df["c_PM10"].cast(DoubleType())) \
    .withColumn("c_O3", df["c_O3"].cast(DoubleType())) \
    .withColumn("c_NO2", df["c_NO2"].cast(DoubleType())) \
    .withColumn("geometry", df["geometry"].cast(StringType()))\
    .withColumn("DEN_REG", df["DEN_REG"].cast(StringType()))

df = df.withColumnRenamed("c_PM25", "c_PM2_5")

df.show()

df.printSchema()

+--------+---------+----+-------------------+------------------+------------------+------------------+------------------+--------------------+---------+
|       x|        y|   z|               time|           c_PM2_5|            c_PM10|              c_O3|             c_NO2|            geometry|  DEN_REG|
+--------+---------+----+-------------------+------------------+------------------+------------------+------------------+--------------------+---------+
|462000.0|5016000.0|20.0|2019-11-24 00:00:00|         1.6957362|         2.7321966|          50.69704| 4.208417400000001|POINT (462000 501...|Lombardia|
|462000.0|5016000.0|20.0|2019-11-24 01:00:00|1.5195319999999999|         2.5212698|         52.217495|3.6308307999999996|POINT (462000 501...|Lombardia|
|462000.0|5016000.0|20.0|2019-11-24 02:00:00|1.4698639999999998|          2.540143|53.528316000000004|         3.0628326|POINT (462000 501...|Lombardia|
|462000.0|5016000.0|20.0|2019-11-24 03:00:00|         1.4075043|         2.5400522

In [1]:
#Lazio

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pyproj

# Create a SparkSession
spark = SparkSession.builder.appName("PM25").config('spark.sql.shuffle.partitions',500).config('spark.driver.maxResultSize', '10G') .config("spark.driver.memory", "32g").config("spark.executor.memory", "16g").config("spark.task.maxFailures", "10").config("spark.executor.instances", "16").config("spark.local.dir", "/afs/enea.it/por/user/nafis/PFS/tmp").getOrCreate()


# Load the data from the CSV file

schema = StructType(
   [StructField('x', DoubleType(), True),
    StructField('y', DoubleType(), True),
    StructField('z', DoubleType(), True),
    StructField('time', TimestampType(), True),
    StructField('c_PM25', DoubleType(), True),
    StructField('c_PM10', DoubleType(), True),
    StructField('c_O3', DoubleType(), True),
    StructField('c_NO2', DoubleType(), True),
    StructField('geometry', StringType(), True),
    StructField('index_right', IntegerType(), True),
    StructField('COD_RIP', IntegerType(), True),
    StructField('COD_REG', IntegerType(), True),
    StructField('DEN_REG', StringType(), True),
    StructField('Shape_Leng', DoubleType(), True),
    StructField('Shape_Area', DoubleType(), True),
   ]
  )

df = spark.read.format("csv").load("/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/data_chunk/lazio/", header=True, schema=schema)

# Select the columns in the desired order
df = df.select( 'x', 'y', 'z', 'time', 'c_PM25', 'c_PM10', 'c_O3', 'c_NO2', 'geometry', 'DEN_REG')


df = df.withColumn("x", df["x"].cast(DoubleType())) \
    .withColumn("y", df["y"].cast(DoubleType())) \
    .withColumn("z", df["z"].cast(DoubleType())) \
    .withColumn("time", df["time"].cast(TimestampType())) \
    .withColumn("c_PM25", df["c_PM25"].cast(DoubleType())) \
    .withColumn("c_PM10", df["c_PM10"].cast(DoubleType())) \
    .withColumn("c_O3", df["c_O3"].cast(DoubleType())) \
    .withColumn("c_NO2", df["c_NO2"].cast(DoubleType())) \
    .withColumn("geometry", df["geometry"].cast(StringType()))\
    .withColumn("DEN_REG", df["DEN_REG"].cast(StringType()))

df = df.withColumnRenamed("c_PM25", "c_PM2_5")

df.show()

df.printSchema()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/06/19 10:07:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/19 10:07:21 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/06/19 10:07:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/06/19 10:07:22 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


[Stage 0:>                                                          (0 + 1) / 1]

+--------+---------+----+-------------------+------------------+------------------+------------------+------------------+--------------------+-------+
|       x|        y|   z|               time|           c_PM2_5|            c_PM10|              c_O3|             c_NO2|            geometry|DEN_REG|
+--------+---------+----+-------------------+------------------+------------------+------------------+------------------+--------------------+-------+
|702000.0|4696000.0|20.0|2021-12-11 00:00:00|         1.2706414|1.5203451000000001|60.837047999999996|         2.0367115|POINT (702000 469...|  Lazio|
|702000.0|4696000.0|20.0|2021-12-11 01:00:00|          1.230251|          1.375211|         57.657654|         2.0870926|POINT (702000 469...|  Lazio|
|702000.0|4696000.0|20.0|2021-12-11 02:00:00|1.9736630000000002|          2.107126|          54.05386|          2.587756|POINT (702000 469...|  Lazio|
|702000.0|4696000.0|20.0|2021-12-11 03:00:00|1.7445994999999999|1.8961723000000001|          5

                                                                                

In [1]:
#Campania

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pyproj

# Create a SparkSession
spark = SparkSession.builder.appName("PM25").config('spark.sql.shuffle.partitions',500).config('spark.driver.maxResultSize', '10G') .config("spark.driver.memory", "32g").config("spark.executor.memory", "16g").config("spark.task.maxFailures", "10").config("spark.executor.instances", "16").config("spark.local.dir", "/afs/enea.it/por/user/nafis/PFS/tmp").getOrCreate()


# Load the data from the CSV file

schema = StructType(
   [StructField('x', DoubleType(), True),
    StructField('y', DoubleType(), True),
    StructField('z', DoubleType(), True),
    StructField('time', TimestampType(), True),
    StructField('c_PM25', DoubleType(), True),
    StructField('c_PM10', DoubleType(), True),
    StructField('c_O3', DoubleType(), True),
    StructField('c_NO2', DoubleType(), True),
    StructField('geometry', StringType(), True),
    StructField('index_right', IntegerType(), True),
    StructField('COD_RIP', IntegerType(), True),
    StructField('COD_REG', IntegerType(), True),
    StructField('DEN_REG', StringType(), True),
    StructField('Shape_Leng', DoubleType(), True),
    StructField('Shape_Area', DoubleType(), True),
   ]
  )

df = spark.read.format("csv").load("/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/data_chunk/campania/", header=True, schema=schema)

# Select the columns in the desired order
df = df.select( 'x', 'y', 'z', 'time', 'c_PM25', 'c_PM10', 'c_O3', 'c_NO2', 'geometry', 'DEN_REG')


df = df.withColumn("x", df["x"].cast(DoubleType())) \
    .withColumn("y", df["y"].cast(DoubleType())) \
    .withColumn("z", df["z"].cast(DoubleType())) \
    .withColumn("time", df["time"].cast(TimestampType())) \
    .withColumn("c_PM25", df["c_PM25"].cast(DoubleType())) \
    .withColumn("c_PM10", df["c_PM10"].cast(DoubleType())) \
    .withColumn("c_O3", df["c_O3"].cast(DoubleType())) \
    .withColumn("c_NO2", df["c_NO2"].cast(DoubleType())) \
    .withColumn("geometry", df["geometry"].cast(StringType()))\
    .withColumn("DEN_REG", df["DEN_REG"].cast(StringType()))

df = df.withColumnRenamed("c_PM25", "c_PM2_5")

df.show()

df.printSchema()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/06/20 14:11:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/20 14:11:41 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/06/20 14:11:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/06/20 14:11:42 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


[Stage 0:>                                                          (0 + 1) / 1]

+--------+---------+----+-------------------+------------------+------------------+------------------+------------------+--------------------+--------+
|       x|        y|   z|               time|           c_PM2_5|            c_PM10|              c_O3|             c_NO2|            geometry| DEN_REG|
+--------+---------+----+-------------------+------------------+------------------+------------------+------------------+--------------------+--------+
|902000.0|4576000.0|20.0|2021-05-21 00:00:00|2.5389232999999995|         2.9607124|         102.89293|         2.4104354|POINT (902000 457...|Campania|
|902000.0|4576000.0|20.0|2021-05-21 01:00:00|         2.5921166|         2.9856164| 98.94353000000001|2.6902790000000003|POINT (902000 457...|Campania|
|902000.0|4576000.0|20.0|2021-05-21 02:00:00|2.5095107999999997|2.8129014999999997|102.21184000000001|2.9086206000000003|POINT (902000 457...|Campania|
|902000.0|4576000.0|20.0|2021-05-21 03:00:00|         2.4757087|         2.7655613|     

                                                                                

In [2]:
data = df.withColumnRenamed("time", "original_date_time")

data_PM25 = data.select(col("original_date_time"),col("c_PM2_5"), col("x"), col("y"))
data_PM25.show()

+-------------------+------------------+--------+---------+
| original_date_time|           c_PM2_5|       x|        y|
+-------------------+------------------+--------+---------+
|2021-05-21 00:00:00|2.5389232999999995|902000.0|4576000.0|
|2021-05-21 01:00:00|         2.5921166|902000.0|4576000.0|
|2021-05-21 02:00:00|2.5095107999999997|902000.0|4576000.0|
|2021-05-21 03:00:00|         2.4757087|902000.0|4576000.0|
|2021-05-21 04:00:00|2.4431244999999997|902000.0|4576000.0|
|2021-05-21 05:00:00|          2.602668|902000.0|4576000.0|
|2021-05-21 06:00:00|         2.9900506|902000.0|4576000.0|
|2021-05-21 07:00:00|         3.1773403|902000.0|4576000.0|
|2021-05-21 08:00:00|2.9301939999999997|902000.0|4576000.0|
|2021-05-21 09:00:00|2.4819592999999998|902000.0|4576000.0|
|2021-05-21 10:00:00|1.9674328999999997|902000.0|4576000.0|
|2021-05-21 11:00:00|         1.7692596|902000.0|4576000.0|
|2021-05-21 12:00:00|1.7476258000000002|902000.0|4576000.0|
|2021-05-21 13:00:00|2.1857892999999997|

In [3]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
import time
import seaborn as sns
import matplotlib.pyplot as plt
import os.path

In [4]:
# Filter the dataframe
train = data_PM25.filter(col("original_date_time") <= "2021-06-30 23:00:00")
test = data_PM25.filter(col("original_date_time") > "2021-06-30 23:00:00")

# Convert timestamp column to UNIX timestamp
train = train.withColumn("original_date_time", unix_timestamp(col("original_date_time")).cast("double"))
test = test.withColumn("original_date_time", unix_timestamp(col("original_date_time")).cast("double"))


# Prepare the input data for modeling
assembler = VectorAssembler(inputCols=["original_date_time", "c_PM2_5", "x", "y"], outputCol="features")
train = assembler.transform(train)
test = assembler.transform(test)

In [5]:
train.count()

                                                                                

18913548

In [6]:
test.count()

                                                                                

7583652

# Decision Tree

In [5]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, unix_timestamp
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RegressionMetrics
import time
import os.path
import seaborn as sns
import matplotlib.pyplot as plt

# Filter the dataframe
train_dt = data_PM25.filter(col("original_date_time") <= "2021-06-30 23:00:00")
test_dt = data_PM25.filter(col("original_date_time") > "2021-06-30 23:00:00")

# Convert timestamp column to UNIX timestamp
train_dt = train_dt.withColumn("original_date_time", unix_timestamp(col("original_date_time")).cast("double"))
test_dt = test_dt.withColumn("original_date_time", unix_timestamp(col("original_date_time")).cast("double"))

# Prepare the input data for modeling
assembler_dt = VectorAssembler(inputCols=["original_date_time", "c_PM2_5", "x", "y"], outputCol="features")
train_dt = assembler_dt.transform(train_dt)
test_dt = assembler_dt.transform(test_dt)

# Create a Decision Tree Regression model
dt = DecisionTreeRegressor(featuresCol='features', labelCol='c_PM2_5', maxDepth=5)

# specify the output path for the model
model_output_path = "output/decision_tree_regression_model_PM2_5/"

# create the output directory if it does not exist
if not os.path.exists(model_output_path):
    os.makedirs(model_output_path)

# specify the output path for the forecast
forecast_output_path = "output/decision_tree_regression_forecast_PM2_5/"

# create the output directory if it does not exist
if not os.path.exists(forecast_output_path):
    os.makedirs(forecast_output_path)


# Start the timer
training_start_time_dt = time.time()

# Fit the model to the training data
model_dt = dt.fit(train_dt)

# Stop the timer
training_end_time_dt = time.time()


# write the model to your personal directory
model_dt.write().overwrite().save(model_output_path)


# Start the timer
forecast_start_time_dt = time.time()

# Make predictions on the test data
predictions_dt = model_dt.transform(test_dt)

# Stop the timer
forecast_end_time_dt = time.time()


# write the forecast to your personal directory
predictions_dt.write.mode("overwrite").parquet(forecast_output_path)


# Calculate the time taken for training
training_time_dt = training_end_time_dt - training_start_time_dt
print("Time taken for training: ", training_time_dt)

# Calculate the time taken for forecasting
forecast_time_dt = forecast_end_time_dt - forecast_start_time_dt
print("Time taken for forecasting: ", forecast_time_dt)


Time taken for training:  16063.485694169998
Time taken for forecasting:  0.1763441562652588


In [8]:
from pyspark.ml.regression import DecisionTreeRegressionModel

# Load the saved decision tree regression model
model_dt = DecisionTreeRegressionModel.load("output/decision_tree_regression_model_PM2_5/")

# make predictions on the training data
predictions_train_dt = model_dt.transform(train)

# evaluate the performance of the model on the training data
evaluator = RegressionEvaluator(labelCol="c_PM2_5", predictionCol="prediction")
rmse_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "rmse"})
r2_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "r2"})
mae_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "mae"})
mse_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "mse"})

In [9]:
# Print the results with metric names including "Train Data"
print(f"RMSE (Train Data): {rmse_dt_train}")
print(f"R2 (Train Data): {r2_dt_train}")
print(f"MSE (Train Data): {mse_dt_train}")
print(f"MAE (Train Data): {mae_dt_train}")

RMSE (Train Data): 1.579756501786754
R2 (Train Data): 0.9384725575786191
MSE (Train Data): 2.4956306049375216
MAE (Train Data): 0.4241766044094081


In [6]:
from pyspark.ml.evaluation import RegressionEvaluator

# Load the saved predictions from the Parquet file
predictions_dt = spark.read.parquet(forecast_output_path)

# Create a Regression Evaluator
evaluator_dt = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_dt = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_dt = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "r2"})

# Compute MSE (mean squared error)
mse_dt = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_dt = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_dt}")
print(f"R2 (Prediction Data): {r2_dt}")
print(f"MSE (Prediction Data): {mse_dt}")
print(f"MAE (Prediction Data): {mae_dt}")


RMSE (Prediction Data): 1.1953295256646943
R2 (Prediction Data): 0.9457080666860506
MSE (Prediction Data): 1.4288126749257832
MAE (Prediction Data): 0.3204468954629573


Lombardia

In [6]:
# Create a Decision Tree Regression model
dt = DecisionTreeRegressor(featuresCol='features', labelCol='c_PM2_5', maxDepth=5)

# specify the output path for the model
model_dt_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi/output/decision_tree_regression_model_PM2_5_lombardia/"

# create the output directory if it does not exist
if not os.path.exists(model_dt_output_path):
    os.makedirs(model_dt_output_path)

# specify the output path for the forecast
forecast_dt_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi/output/decision_tree_regression_forecast_PM2_5_lombardia/"

# create the output directory if it does not exist
if not os.path.exists(forecast_dt_output_path):
    os.makedirs(forecast_dt_output_path)


# Start the timer
training_start_time_dt = time.time()

# Fit the model to the training data
model_dt = dt.fit(train)

# Stop the timer
training_end_time_dt = time.time()


# write the model to your personal directory
model_dt.write().overwrite().save(model_dt_output_path)


# make predictions on the training data
predictions_train_dt = model_dt.transform(train)

# evaluate the performance of the model on the training data
evaluator = RegressionEvaluator(labelCol="c_PM2_5", predictionCol="prediction")
rmse_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "rmse"})
r2_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "r2"})
mae_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "mae"})
mse_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "mse"})

# print the results
# Print the results with metric names including "Train Data"
print(f"RMSE (Train Data): {rmse_dt_train}")
print(f"R2 (Train Data): {r2_dt_train}")
print(f"MSE (Train Data): {mse_dt_train}")
print(f"MAE (Train Data): {mae_dt_train}")


# Start the timer
forecast_start_time_dt = time.time()

# Make predictions on the test data
predictions_dt = model_dt.transform(test)

# Stop the timer
forecast_end_time_dt = time.time()


# write the forecast to your personal directory
predictions_dt.write.mode("overwrite").parquet(forecast_dt_output_path)


# Calculate the time taken for training
training_time_dt = training_end_time_dt - training_start_time_dt
print("Time taken for training: ", training_time_dt)

# Calculate the time taken for forecasting
forecast_time_dt = forecast_end_time_dt - forecast_start_time_dt
print("Time taken for forecasting: ", forecast_time_dt)


# Load the saved predictions from the Parquet file
predictions_dt = spark.read.parquet(forecast_dt_output_path)

# Create a Regression Evaluator
evaluator_dt = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_dt_test = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_dt_test = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "r2"})

# Compute MSE (mean squared error)
mse_dt_test = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_dt_test = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_dt_test}")
print(f"R2 (Prediction Data): {r2_dt_test}")
print(f"MSE (Prediction Data): {mse_dt_test}")
print(f"MAE (Prediction Data): {mae_dt_test}")

                                                                                

RMSE (Train Data): 4.104373392309294
R2 (Train Data): 0.9210679518881306
MSE (Train Data): 16.84588094349651
MAE (Train Data): 0.950471267920366


                                                                                

Time taken for training:  42.721378803253174
Time taken for forecasting:  0.0630192756652832




RMSE (Prediction Data): 1.7569088789617495
R2 (Prediction Data): 0.9754749669583337
MSE (Prediction Data): 3.086728808974631
MAE (Prediction Data): 0.5950311849676214


                                                                                

Lazio

In [7]:
# Create a Decision Tree Regression model
dt = DecisionTreeRegressor(featuresCol='features', labelCol='c_PM2_5', maxDepth=5)

# specify the output path for the model
model_dt_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/decision_tree_regression_model_PM2_5_lazio/"

# create the output directory if it does not exist
if not os.path.exists(model_dt_output_path):
    os.makedirs(model_dt_output_path)

# specify the output path for the forecast
forecast_dt_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/decision_tree_regression_forecast_PM2_5_lazio/"

# create the output directory if it does not exist
if not os.path.exists(forecast_dt_output_path):
    os.makedirs(forecast_dt_output_path)


# Start the timer
training_start_time_dt = time.time()

# Fit the model to the training data
model_dt = dt.fit(train)

# Stop the timer
training_end_time_dt = time.time()


# write the model to your personal directory
model_dt.write().overwrite().save(model_dt_output_path)


# make predictions on the training data
predictions_train_dt = model_dt.transform(train)

# evaluate the performance of the model on the training data
evaluator = RegressionEvaluator(labelCol="c_PM2_5", predictionCol="prediction")
rmse_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "rmse"})
r2_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "r2"})
mae_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "mae"})
mse_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "mse"})

# print the results
# Print the results with metric names including "Train Data"
print(f"RMSE (Train Data): {rmse_dt_train}")
print(f"R2 (Train Data): {r2_dt_train}")
print(f"MSE (Train Data): {mse_dt_train}")
print(f"MAE (Train Data): {mae_dt_train}")


# Start the timer
forecast_start_time_dt = time.time()

# Make predictions on the test data
predictions_dt = model_dt.transform(test)

# Stop the timer
forecast_end_time_dt = time.time()


# write the forecast to your personal directory
predictions_dt.write.mode("overwrite").parquet(forecast_dt_output_path)


# Calculate the time taken for training
training_time_dt = training_end_time_dt - training_start_time_dt
print("Time taken for training: ", training_time_dt)

# Calculate the time taken for forecasting
forecast_time_dt = forecast_end_time_dt - forecast_start_time_dt
print("Time taken for forecasting: ", forecast_time_dt)


# Load the saved predictions from the Parquet file
predictions_dt = spark.read.parquet(forecast_dt_output_path)

# Create a Regression Evaluator
evaluator_dt = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_dt_test = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_dt_test = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "r2"})

# Compute MSE (mean squared error)
mse_dt_test = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_dt_test = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_dt_test}")
print(f"R2 (Prediction Data): {r2_dt_test}")
print(f"MSE (Prediction Data): {mse_dt_test}")
print(f"MAE (Prediction Data): {mae_dt_test}")

                                                                                

RMSE (Train Data): 3.484062925084526
R2 (Train Data): 0.8185710297254536
MSE (Train Data): 12.138694465948543
MAE (Train Data): 0.6169562713998137


                                                                                

Time taken for training:  405.41633892059326
Time taken for forecasting:  0.09121894836425781




RMSE (Prediction Data): 1.6239749597175892
R2 (Prediction Data): 0.9126985149180113
MSE (Prediction Data): 2.637294669789744
MAE (Prediction Data): 0.35226079561529766




Campania

In [7]:
# Create a Decision Tree Regression model
dt = DecisionTreeRegressor(featuresCol='features', labelCol='c_PM2_5', maxDepth=5)

# specify the output path for the model
model_dt_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/decision_tree_regression_model_PM2_5_campania/"

# create the output directory if it does not exist
if not os.path.exists(model_dt_output_path):
    os.makedirs(model_dt_output_path)

# specify the output path for the forecast
forecast_dt_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/decision_tree_regression_forecast_PM2_5_campania/"

# create the output directory if it does not exist
if not os.path.exists(forecast_dt_output_path):
    os.makedirs(forecast_dt_output_path)


# Start the timer
training_start_time_dt = time.time()

# Fit the model to the training data
model_dt = dt.fit(train)

# Stop the timer
training_end_time_dt = time.time()


# write the model to your personal directory
model_dt.write().overwrite().save(model_dt_output_path)


# make predictions on the training data
predictions_train_dt = model_dt.transform(train)

# evaluate the performance of the model on the training data
evaluator = RegressionEvaluator(labelCol="c_PM2_5", predictionCol="prediction")
rmse_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "rmse"})
r2_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "r2"})
mae_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "mae"})
mse_dt_train = evaluator.evaluate(predictions_train_dt, {evaluator.metricName: "mse"})

# print the results
# Print the results with metric names including "Train Data"
print(f"RMSE (Train Data): {rmse_dt_train}")
print(f"R2 (Train Data): {r2_dt_train}")
print(f"MSE (Train Data): {mse_dt_train}")
print(f"MAE (Train Data): {mae_dt_train}")


# Start the timer
forecast_start_time_dt = time.time()

# Make predictions on the test data
predictions_dt = model_dt.transform(test)

# Stop the timer
forecast_end_time_dt = time.time()


# write the forecast to your personal directory
predictions_dt.write.mode("overwrite").parquet(forecast_dt_output_path)


# Calculate the time taken for training
training_time_dt = training_end_time_dt - training_start_time_dt
print("Time taken for training: ", training_time_dt)

# Calculate the time taken for forecasting
forecast_time_dt = forecast_end_time_dt - forecast_start_time_dt
print("Time taken for forecasting: ", forecast_time_dt)


# Load the saved predictions from the Parquet file
predictions_dt = spark.read.parquet(forecast_dt_output_path)

# Create a Regression Evaluator
evaluator_dt = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_dt_test = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_dt_test = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "r2"})

# Compute MSE (mean squared error)
mse_dt_test = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_dt_test = evaluator_dt.evaluate(predictions_dt, {evaluator_dt.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_dt_test}")
print(f"R2 (Prediction Data): {r2_dt_test}")
print(f"MSE (Prediction Data): {mse_dt_test}")
print(f"MAE (Prediction Data): {mae_dt_test}")

                                                                                

RMSE (Train Data): 2.225030727885437
R2 (Train Data): 0.912517125804106
MSE (Train Data): 4.950761740034398
MAE (Train Data): 0.48189363965038


                                                                                

Time taken for training:  87.42386412620544
Time taken for forecasting:  0.08373260498046875




RMSE (Prediction Data): 1.9608848171192985
R2 (Prediction Data): 0.9057409184182872
MSE (Prediction Data): 3.8450692660089847
MAE (Prediction Data): 0.37791542064818007


                                                                                

# Random Forest

In [8]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.sql.functions import unix_timestamp
import time
import os.path
import seaborn as sns
import matplotlib.pyplot as plt

# Filter the dataframe
train_rf = data_PM25.filter(col("original_date_time") <= "2021-06-30 23:00:00")
test_rf = data_PM25.filter(col("original_date_time") > "2021-06-30 23:00:00")

# Convert timestamp column to UNIX timestamp
train_rf = train_rf.withColumn("original_date_time", unix_timestamp(col("original_date_time")).cast("double"))
test_rf = test_rf.withColumn("original_date_time", unix_timestamp(col("original_date_time")).cast("double"))

# Prepare the input data for modeling
assembler_rf = VectorAssembler(inputCols=["original_date_time", "c_PM2_5", "x", "y"], outputCol="features")
train_rf = assembler_rf.transform(train_rf)
test_rf = assembler_rf.transform(test_rf)

# Create a Random Forest Regression model
rf = RandomForestRegressor(featuresCol='features', labelCol='c_PM2_5', maxDepth=5, numTrees=10)

# specify the output path for the model
model_rf_output_path = "output/random_forest_regression_model_PM2_5/"

# create the output directory if it does not exist
if not os.path.exists(model_rf_output_path):
    os.makedirs(model_rf_output_path)

# specify the output path for the forecast
forecast_rf_output_path = "output/random_forest_regression_forecast_PM2_5/"

# create the output directory if it does not exist
if not os.path.exists(forecast_rf_output_path):
    os.makedirs(forecast_rf_output_path)


# Start the timer
training_start_time_rf = time.time()

# Fit the model to the training data
model_rf = rf.fit(train_rf)

# Stop the timer
training_end_time_rf = time.time()


# write the model to your personal directory
model_rf.write().overwrite().save(model_rf_output_path)


# Start the timer
forecast_start_time_rf = time.time()

# Make predictions on the test data
predictions_rf = model_rf.transform(test_rf)

# Stop the timer
forecast_end_time_rf = time.time()


# write the forecast to your personal directory
predictions_rf.write.mode("overwrite").parquet(forecast_rf_output_path)


# Calculate the time taken for training
training_time_rf = training_end_time_rf - training_start_time_rf
print("Time taken for training: ", training_time_rf)

# Calculate the time taken for forecasting
forecast_time_rf = forecast_end_time_rf - forecast_start_time_rf
print("Time taken for forecasting: ", forecast_time_rf)


Time taken for training:  16598.03115916252
Time taken for forecasting:  0.05803322792053223


In [9]:
from pyspark.ml.evaluation import RegressionEvaluator

# Load the saved predictions from the Parquet file
predictions_rf = spark.read.parquet(forecast_rf_output_path)

# Create a Regression Evaluator
evaluator_rf = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_rf = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_rf = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "r2"})

# Compute MSE (mean squared error)
mse_rf = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_rf = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_rf}")
print(f"R2 (Prediction Data): {r2_rf}")
print(f"MSE (Prediction Data): {mse_rf}")
print(f"MAE (Prediction Data): {mae_rf}")


RMSE (Prediction Data): 1.8791842973232646
R2 (Prediction Data): 0.8658166091675981
MSE (Prediction Data): 3.5313336233063315
MAE (Prediction Data): 0.8737615381134364


In [10]:
# Create a Random Forest Regression model
rf = RandomForestRegressor(featuresCol='features', labelCol='c_PM2_5', maxDepth=5)

# specify the output path for the model
model_rf_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi/output/random_forest_regression_model_PM2_5/"

# create the output directory if it does not exist
if not os.path.exists(model_rf_output_path):
    os.makedirs(model_rf_output_path)

# specify the output path for the forecast
forecast_rf_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi/output/random_forest_regression_forecast_PM2_5/"

# create the output directory if it does not exist
if not os.path.exists(forecast_rf_output_path):
    os.makedirs(forecast_rf_output_path)

# Start the timer
training_start_time_rf = time.time()

# Fit the model to the training data
model_rf = rf.fit(train)

# Stop the timer
training_end_time_rf = time.time()

# write the model to your personal directory
model_rf.write().overwrite().save(model_rf_output_path)

# make predictions on the training data
predictions_train_rf = model_rf.transform(train)

# evaluate the performance of the model on the training data
evaluator = RegressionEvaluator(labelCol="c_PM2_5", predictionCol="prediction")
rmse_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "rmse"})
r2_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "r2"})
mae_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "mae"})
mse_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "mse"})

# print the results
# Print the results with metric names including "Train Data"
print(f"RMSE (Train Data): {rmse_rf_train}")
print(f"R2 (Train Data): {r2_rf_train}")
print(f"MSE (Train Data): {mse_rf_train}")
print(f"MAE (Train Data): {mae_rf_train}")

# Start the timer
forecast_start_time_rf = time.time()

# Make predictions on the test data
predictions_rf = model_rf.transform(test)

# Stop the timer
forecast_end_time_rf = time.time()

# write the forecast to your personal directory
predictions_rf.write.mode("overwrite").parquet(forecast_rf_output_path)

# Calculate the time taken for training
training_time_rf = training_end_time_rf - training_start_time_rf
print("Time taken for training: ", training_time_rf)

# Calculate the time taken for forecasting
forecast_time_rf = forecast_end_time_rf - forecast_start_time_rf
print("Time taken for forecasting: ", forecast_time_rf)

# Load the saved predictions from the Parquet file
predictions_rf = spark.read.parquet(forecast_rf_output_path)

# Create a Regression Evaluator
evaluator_rf = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_rf_test = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_rf_test = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "r2"})

# Compute MSE (mean squared error)
mse_rf = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_rf = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_rf}")
print(f"R2 (Prediction Data): {r2_rf}")
print(f"MSE (Prediction Data): {mse_rf}")
print(f"MAE (Prediction Data): {mae_rf}")

RMSE (Train Data): 1.7451877482029674
R2 (Train Data): 0.9249115965022351
MSE (Train Data): 3.0456802764777433
MAE (Train Data): 0.8007551210692181
Time taken for training:  53020.54016709328
Time taken for forecasting:  0.08871793746948242


NameError: name 'rmse_rf' is not defined

In [12]:
#Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_rf_test}")
print(f"R2 (Prediction Data): {r2_rf_test}")
print(f"MSE (Prediction Data): {mse_rf}")
print(f"MAE (Prediction Data): {mae_rf}")

RMSE (Prediction Data): 1.375679891663427
R2 (Prediction Data): 0.928089088890567
MSE (Prediction Data): 1.8924951643270973
MAE (Prediction Data): 0.6485747354766993


Lombardia

In [8]:
# Create a Random Forest Regression model
rf = RandomForestRegressor(featuresCol='features', labelCol='c_PM2_5', maxDepth=5)

# specify the output path for the model
model_rf_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi/output/random_forest_regression_model_PM2_5_lombardia/"

# create the output directory if it does not exist
if not os.path.exists(model_rf_output_path):
    os.makedirs(model_rf_output_path)

# specify the output path for the forecast
forecast_rf_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi/output/random_forest_regression_forecast_PM2_5_lombardia/"

# create the output directory if it does not exist
if not os.path.exists(forecast_rf_output_path):
    os.makedirs(forecast_rf_output_path)

# Start the timer
training_start_time_rf = time.time()

# Fit the model to the training data
model_rf = rf.fit(train)

# Stop the timer
training_end_time_rf = time.time()

# write the model to your personal directory
model_rf.write().overwrite().save(model_rf_output_path)

# make predictions on the training data
predictions_train_rf = model_rf.transform(train)

# evaluate the performance of the model on the training data
evaluator = RegressionEvaluator(labelCol="c_PM2_5", predictionCol="prediction")
rmse_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "rmse"})
r2_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "r2"})
mae_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "mae"})
mse_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "mse"})

# print the results
# Print the results with metric names including "Train Data"
print(f"RMSE (Train Data): {rmse_rf_train}")
print(f"R2 (Train Data): {r2_rf_train}")
print(f"MSE (Train Data): {mse_rf_train}")
print(f"MAE (Train Data): {mae_rf_train}")

# Start the timer
forecast_start_time_rf = time.time()

# Make predictions on the test data
predictions_rf = model_rf.transform(test)

# Stop the timer
forecast_end_time_rf = time.time()

# write the forecast to your personal directory
predictions_rf.write.mode("overwrite").parquet(forecast_rf_output_path)

# Calculate the time taken for training
training_time_rf = training_end_time_rf - training_start_time_rf
print("Time taken for training: ", training_time_rf)

# Calculate the time taken for forecasting
forecast_time_rf = forecast_end_time_rf - forecast_start_time_rf
print("Time taken for forecasting: ", forecast_time_rf)

# Load the saved predictions from the Parquet file
predictions_rf = spark.read.parquet(forecast_rf_output_path)

# Create a Regression Evaluator
evaluator_rf = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_rf_test = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_rf_test = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "r2"})

# Compute MSE (mean squared error)
mse_rf_test = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_rf_test = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_rf_test}")
print(f"R2 (Prediction Data): {r2_rf_test}")
print(f"MSE (Prediction Data): {mse_rf_test}")
print(f"MAE (Prediction Data): {mae_rf_test}")

                                                                                

RMSE (Train Data): 4.154095862913804
R2 (Train Data): 0.9191439216156252
MSE (Train Data): 17.256512438277582
MAE (Train Data): 1.656140783836397


                                                                                

Time taken for training:  594.769184589386
Time taken for forecasting:  0.08081960678100586




RMSE (Prediction Data): 2.369187273163974
R2 (Prediction Data): 0.9554025622568462
MSE (Prediction Data): 5.613048335322149
MAE (Prediction Data): 1.184421339037392


                                                                                

Lazio

In [8]:
# Create a Random Forest Regression model
rf = RandomForestRegressor(featuresCol='features', labelCol='c_PM2_5', maxDepth=5)

# specify the output path for the model
model_rf_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/random_forest_regression_model_PM2_5_lazio/"

# create the output directory if it does not exist
if not os.path.exists(model_rf_output_path):
    os.makedirs(model_rf_output_path)

# specify the output path for the forecast
forecast_rf_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/random_forest_regression_forecast_PM2_5_lazio/"

# create the output directory if it does not exist
if not os.path.exists(forecast_rf_output_path):
    os.makedirs(forecast_rf_output_path)

# Start the timer
training_start_time_rf = time.time()

# Fit the model to the training data
model_rf = rf.fit(train)

# Stop the timer
training_end_time_rf = time.time()

# write the model to your personal directory
model_rf.write().overwrite().save(model_rf_output_path)

# make predictions on the training data
predictions_train_rf = model_rf.transform(train)

# evaluate the performance of the model on the training data
evaluator = RegressionEvaluator(labelCol="c_PM2_5", predictionCol="prediction")
rmse_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "rmse"})
r2_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "r2"})
mae_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "mae"})
mse_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "mse"})

# print the results
# Print the results with metric names including "Train Data"
print(f"RMSE (Train Data): {rmse_rf_train}")
print(f"R2 (Train Data): {r2_rf_train}")
print(f"MSE (Train Data): {mse_rf_train}")
print(f"MAE (Train Data): {mae_rf_train}")

# Start the timer
forecast_start_time_rf = time.time()

# Make predictions on the test data
predictions_rf = model_rf.transform(test)

# Stop the timer
forecast_end_time_rf = time.time()

# write the forecast to your personal directory
predictions_rf.write.mode("overwrite").parquet(forecast_rf_output_path)

# Calculate the time taken for training
training_time_rf = training_end_time_rf - training_start_time_rf
print("Time taken for training: ", training_time_rf)

# Calculate the time taken for forecasting
forecast_time_rf = forecast_end_time_rf - forecast_start_time_rf
print("Time taken for forecasting: ", forecast_time_rf)

# Load the saved predictions from the Parquet file
predictions_rf = spark.read.parquet(forecast_rf_output_path)

# Create a Regression Evaluator
evaluator_rf = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_rf_test = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_rf_test = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "r2"})

# Compute MSE (mean squared error)
mse_rf_test = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_rf_test = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_rf_test}")
print(f"R2 (Prediction Data): {r2_rf_test}")
print(f"MSE (Prediction Data): {mse_rf_test}")
print(f"MAE (Prediction Data): {mae_rf_test}")

                                                                                

RMSE (Train Data): 3.0870412257449322
R2 (Train Data): 0.857564083625617
MSE (Train Data): 9.529823529448773
MAE (Train Data): 1.235671157971072


                                                                                

Time taken for training:  246.0521638393402
Time taken for forecasting:  0.0782470703125




RMSE (Prediction Data): 2.3084129098695434
R2 (Prediction Data): 0.8236035001429058
MSE (Prediction Data): 5.328770162452373
MAE (Prediction Data): 0.8525831165106678


                                                                                

Campania

In [8]:
# Create a Random Forest Regression model
rf = RandomForestRegressor(featuresCol='features', labelCol='c_PM2_5', maxDepth=5)

# specify the output path for the model
model_rf_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/random_forest_regression_model_PM2_5_campania/"

# create the output directory if it does not exist
if not os.path.exists(model_rf_output_path):
    os.makedirs(model_rf_output_path)

# specify the output path for the forecast
forecast_rf_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/random_forest_regression_forecast_PM2_5_campania/"

# create the output directory if it does not exist
if not os.path.exists(forecast_rf_output_path):
    os.makedirs(forecast_rf_output_path)

# Start the timer
training_start_time_rf = time.time()

# Fit the model to the training data
model_rf = rf.fit(train)

# Stop the timer
training_end_time_rf = time.time()

# write the model to your personal directory
model_rf.write().overwrite().save(model_rf_output_path)

# make predictions on the training data
predictions_train_rf = model_rf.transform(train)

# evaluate the performance of the model on the training data
evaluator = RegressionEvaluator(labelCol="c_PM2_5", predictionCol="prediction")
rmse_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "rmse"})
r2_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "r2"})
mae_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "mae"})
mse_rf_train = evaluator.evaluate(predictions_train_rf, {evaluator.metricName: "mse"})

# print the results
# Print the results with metric names including "Train Data"
print(f"RMSE (Train Data): {rmse_rf_train}")
print(f"R2 (Train Data): {r2_rf_train}")
print(f"MSE (Train Data): {mse_rf_train}")
print(f"MAE (Train Data): {mae_rf_train}")

# Start the timer
forecast_start_time_rf = time.time()

# Make predictions on the test data
predictions_rf = model_rf.transform(test)

# Stop the timer
forecast_end_time_rf = time.time()

# write the forecast to your personal directory
predictions_rf.write.mode("overwrite").parquet(forecast_rf_output_path)

# Calculate the time taken for training
training_time_rf = training_end_time_rf - training_start_time_rf
print("Time taken for training: ", training_time_rf)

# Calculate the time taken for forecasting
forecast_time_rf = forecast_end_time_rf - forecast_start_time_rf
print("Time taken for forecasting: ", forecast_time_rf)

# Load the saved predictions from the Parquet file
predictions_rf = spark.read.parquet(forecast_rf_output_path)

# Create a Regression Evaluator
evaluator_rf = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_rf_test = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_rf_test = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "r2"})

# Compute MSE (mean squared error)
mse_rf_test = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_rf_test = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_rf_test}")
print(f"R2 (Prediction Data): {r2_rf_test}")
print(f"MSE (Prediction Data): {mse_rf_test}")
print(f"MAE (Prediction Data): {mae_rf_test}")

                                                                                

RMSE (Train Data): 3.0609624128925077
R2 (Train Data): 0.8344355805984497
MSE (Train Data): 9.369490893140723
MAE (Train Data): 0.9995521299091124


                                                                                

Time taken for training:  435.608029127121
Time taken for forecasting:  0.15989041328430176


                                                                                

RMSE (Prediction Data): 2.761645736527433
R2 (Prediction Data): 0.8130373008114897
MSE (Prediction Data): 7.626687174080148
MAE (Prediction Data): 0.8474343328257676




# Gradient Boosting Regression

In [13]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.sql.functions import unix_timestamp
import time
import os.path
import seaborn as sns

# Filter the dataframe
train_gbt = data_PM25.filter(col("original_date_time") <= "2021-06-30 23:00:00")
test_gbt = data_PM25.filter(col("original_date_time") > "2021-06-30 23:00:00")

# Convert timestamp column to UNIX timestamp
train_gbt = train_gbt.withColumn("original_date_time", unix_timestamp(col("original_date_time")).cast("double"))
test_gbt = test_gbt.withColumn("original_date_time", unix_timestamp(col("original_date_time")).cast("double"))

# Prepare the input data for modeling
assembler_gbt = VectorAssembler(inputCols=["original_date_time", "x", "y", "c_PM2_5"], outputCol="features")
train_gbt = assembler_gbt.transform(train_gbt)
test_gbt = assembler_gbt.transform(test_gbt)

# Create a Gradient Boosted Regression model
gbt = GBTRegressor(featuresCol='features', labelCol='c_PM2_5', maxDepth=5)

# specify the output path for the model
model_gbt_output_path = "output/gradient_boosted_regression_model_PM2_5/"

# create the output directory if it does not exist
if not os.path.exists(model_gbt_output_path):
    os.makedirs(model_gbt_output_path)

# specify the output path for the forecast
forecast_gbt_output_path = "output/gradient_boosted_regression_forecast_PM2_5/"

# create the output directory if it does not exist
if not os.path.exists(forecast_gbt_output_path):
    os.makedirs(forecast_gbt_output_path)

# Start the timer
training_start_time_gbt = time.time()

# Fit the model to the training data
model_gbt = gbt.fit(train_gbt)

# Stop the timer
training_end_time_gbt = time.time()

# write the model to your personal directory
model_gbt.write().overwrite().save(model_gbt_output_path)

# make predictions on the training data
predictions_train_gbt = model_gbt.transform(train_gbt)

# evaluate the performance of the model on the training data
evaluator = RegressionEvaluator(labelCol="c_PM2_5", predictionCol="prediction")
rmse_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "rmse"})
r2_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "r2"})
mae_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "mae"})
mse_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "mse"})


# Print the results with metric names including "Train Data"
print(f"RMSE (Train Data): {rmse_gbt_train}")
print(f"R2 (Train Data): {r2_gbt_train}")
print(f"MSE (Train Data): {mse_gbt_train}")
print(f"MAE (Train Data): {mae_gbt_train}")

# Start the timer
forecast_start_time_gbt = time.time()

# Make predictions on the test data
predictions_gbt = model_gbt.transform(test_gbt)

# Stop the timer
forecast_end_time_gbt = time.time()

# Calculate the time taken for training
training_time_gbt = training_end_time_gbt - training_start_time_gbt
print("Time taken for training: ", training_time_gbt)

# Calculate the time taken for forecasting
forecast_time_gbt = forecast_end_time_gbt - forecast_start_time_gbt
print("Time taken for forecasting: ", forecast_time_gbt)

# write the predictions to a Parquet file
predictions_gbt.write.mode("overwrite").parquet(forecast_gbt_output_path)

# Load the saved predictions from the Parquet file
predictions_gbt = spark.read.parquet(forecast_gbt_output_path)


# Create a Regression Evaluator
evaluator_gbt = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "r2"})

# Compute MSE (mean squared error)
mse_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_gbt}")
print(f"R2 (Prediction Data): {r2_gbt}")
print(f"MSE (Prediction Data): {mse_gbt}")
print(f"MAE (Prediction Data): {mae_gbt}")


RMSE (Train Data): 1.8795022641155679
R2 (Train Data): 0.9129088016860606
MSE (Train Data): 3.532528760815546
MAE (Train Data): 0.39689611936408903
Time taken for training:  170858.2653915882
Time taken for forecasting:  0.0811622142791748
RMSE (Prediction Data): 1.3689715569614558
R2 (Prediction Data): 0.9287887084387904
MSE (Prediction Data): 1.8740831237694722
MAE (Prediction Data): 0.26882700271977766


Lombardia

In [10]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.sql.functions import unix_timestamp
import time
import os.path
import seaborn as sns

# Filter the dataframe
train_gbt = data_PM25.filter(col("original_date_time") <= "2021-06-30 23:00:00")
test_gbt = data_PM25.filter(col("original_date_time") > "2021-06-30 23:00:00")

# Convert timestamp column to UNIX timestamp
train_gbt = train_gbt.withColumn("original_date_time", unix_timestamp(col("original_date_time")).cast("double"))
test_gbt = test_gbt.withColumn("original_date_time", unix_timestamp(col("original_date_time")).cast("double"))

# Prepare the input data for modeling
assembler_gbt = VectorAssembler(inputCols=["original_date_time", "x", "y", "c_PM2_5"], outputCol="features")
train_gbt = assembler_gbt.transform(train_gbt)
test_gbt = assembler_gbt.transform(test_gbt)

# Create a Gradient Boosted Regression model
gbt = GBTRegressor(featuresCol='features', labelCol='c_PM2_5', maxDepth=5)

# specify the output path for the model
model_gbt_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi/output/gradient_boosted_regression_model_PM2_5_lombardia/"

# create the output directory if it does not exist
if not os.path.exists(model_gbt_output_path):
    os.makedirs(model_gbt_output_path)

# specify the output path for the forecast
forecast_gbt_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi/output/gradient_boosted_regression_forecast_PM2_5_lombardia/"

# create the output directory if it does not exist
if not os.path.exists(forecast_gbt_output_path):
    os.makedirs(forecast_gbt_output_path)

# Start the timer
training_start_time_gbt = time.time()

# Fit the model to the training data
model_gbt = gbt.fit(train_gbt)

# Stop the timer
training_end_time_gbt = time.time()

# write the model to your personal directory
model_gbt.write().overwrite().save(model_gbt_output_path)

# make predictions on the training data
predictions_train_gbt = model_gbt.transform(train_gbt)

# evaluate the performance of the model on the training data
evaluator = RegressionEvaluator(labelCol="c_PM2_5", predictionCol="prediction")
rmse_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "rmse"})
r2_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "r2"})
mae_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "mae"})
mse_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "mse"})


# Print the results with metric names including "Train Data"
print(f"RMSE (Train Data): {rmse_gbt_train}")
print(f"R2 (Train Data): {r2_gbt_train}")
print(f"MSE (Train Data): {mse_gbt_train}")
print(f"MAE (Train Data): {mae_gbt_train}")

# Start the timer
forecast_start_time_gbt = time.time()

# Make predictions on the test data
predictions_gbt = model_gbt.transform(test_gbt)

# Stop the timer
forecast_end_time_gbt = time.time()

# Calculate the time taken for training
training_time_gbt = training_end_time_gbt - training_start_time_gbt
print("Time taken for training: ", training_time_gbt)

# Calculate the time taken for forecasting
forecast_time_gbt = forecast_end_time_gbt - forecast_start_time_gbt
print("Time taken for forecasting: ", forecast_time_gbt)

# write the predictions to a Parquet file
predictions_gbt.write.mode("overwrite").parquet(forecast_gbt_output_path)

# Load the saved predictions from the Parquet file
predictions_gbt = spark.read.parquet(forecast_gbt_output_path)


# Create a Regression Evaluator
evaluator_gbt = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "r2"})

# Compute MSE (mean squared error)
mse_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_gbt}")
print(f"R2 (Prediction Data): {r2_gbt}")
print(f"MSE (Prediction Data): {mse_gbt}")
print(f"MAE (Prediction Data): {mae_gbt}")


                                                                                

RMSE (Train Data): 2.619836349014904
R2 (Train Data): 0.9678405974552927
MSE (Train Data): 6.863542495619741
MAE (Train Data): 0.7015082257323648
Time taken for training:  268.6059672832489
Time taken for forecasting:  0.0696861743927002




RMSE (Prediction Data): 1.3280798894211043
R2 (Prediction Data): 0.9859860834620167
MSE (Prediction Data): 1.7637961926847736
MAE (Prediction Data): 0.4521599083789563


                                                                                

Lazio

In [9]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.sql.functions import unix_timestamp
import time
import os.path
import seaborn as sns

# Filter the dataframe
train_gbt = data_PM25.filter(col("original_date_time") <= "2021-06-30 23:00:00")
test_gbt = data_PM25.filter(col("original_date_time") > "2021-06-30 23:00:00")

# Convert timestamp column to UNIX timestamp
train_gbt = train_gbt.withColumn("original_date_time", unix_timestamp(col("original_date_time")).cast("double"))
test_gbt = test_gbt.withColumn("original_date_time", unix_timestamp(col("original_date_time")).cast("double"))

# Prepare the input data for modeling
assembler_gbt = VectorAssembler(inputCols=["original_date_time", "x", "y", "c_PM2_5"], outputCol="features")
train_gbt = assembler_gbt.transform(train_gbt)
test_gbt = assembler_gbt.transform(test_gbt)

# Create a Gradient Boosted Regression model
gbt = GBTRegressor(featuresCol='features', labelCol='c_PM2_5', maxDepth=5)

# specify the output path for the model
model_gbt_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/gradient_boosted_regression_model_PM2_5_lazio/"

# create the output directory if it does not exist
if not os.path.exists(model_gbt_output_path):
    os.makedirs(model_gbt_output_path)

# specify the output path for the forecast
forecast_gbt_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/gradient_boosted_regression_forecast_PM2_5_lazio/"

# create the output directory if it does not exist
if not os.path.exists(forecast_gbt_output_path):
    os.makedirs(forecast_gbt_output_path)

# Start the timer
training_start_time_gbt = time.time()

# Fit the model to the training data
model_gbt = gbt.fit(train_gbt)

# Stop the timer
training_end_time_gbt = time.time()

# write the model to your personal directory
model_gbt.write().overwrite().save(model_gbt_output_path)

# make predictions on the training data
predictions_train_gbt = model_gbt.transform(train_gbt)

# evaluate the performance of the model on the training data
evaluator = RegressionEvaluator(labelCol="c_PM2_5", predictionCol="prediction")
rmse_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "rmse"})
r2_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "r2"})
mae_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "mae"})
mse_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "mse"})


# Print the results with metric names including "Train Data"
print(f"RMSE (Train Data): {rmse_gbt_train}")
print(f"R2 (Train Data): {r2_gbt_train}")
print(f"MSE (Train Data): {mse_gbt_train}")
print(f"MAE (Train Data): {mae_gbt_train}")

# Start the timer
forecast_start_time_gbt = time.time()

# Make predictions on the test data
predictions_gbt = model_gbt.transform(test_gbt)

# Stop the timer
forecast_end_time_gbt = time.time()

# Calculate the time taken for training
training_time_gbt = training_end_time_gbt - training_start_time_gbt
print("Time taken for training: ", training_time_gbt)

# Calculate the time taken for forecasting
forecast_time_gbt = forecast_end_time_gbt - forecast_start_time_gbt
print("Time taken for forecasting: ", forecast_time_gbt)

# write the predictions to a Parquet file
predictions_gbt.write.mode("overwrite").parquet(forecast_gbt_output_path)

# Load the saved predictions from the Parquet file
predictions_gbt = spark.read.parquet(forecast_gbt_output_path)


# Create a Regression Evaluator
evaluator_gbt = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "r2"})

# Compute MSE (mean squared error)
mse_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_gbt}")
print(f"R2 (Prediction Data): {r2_gbt}")
print(f"MSE (Prediction Data): {mse_gbt}")
print(f"MAE (Prediction Data): {mae_gbt}")


                                                                                

RMSE (Train Data): 2.7342242273076223
R2 (Train Data): 0.8882614812844926
MSE (Train Data): 7.475982125195965
MAE (Train Data): 0.45500717181553746
Time taken for training:  520.8321993350983
Time taken for forecasting:  0.08784055709838867




RMSE (Prediction Data): 1.225698736012148
R2 (Prediction Data): 0.9502686268352143
MSE (Prediction Data): 1.5023373914617775
MAE (Prediction Data): 0.24872240748993493


                                                                                

Campania

In [7]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.sql.functions import unix_timestamp
import time
import os.path
import seaborn as sns

# Filter the dataframe
train_gbt = data_PM25.filter(col("original_date_time") <= "2021-06-30 23:00:00")
test_gbt = data_PM25.filter(col("original_date_time") > "2021-06-30 23:00:00")

# Convert timestamp column to UNIX timestamp
train_gbt = train_gbt.withColumn("original_date_time", unix_timestamp(col("original_date_time")).cast("double"))
test_gbt = test_gbt.withColumn("original_date_time", unix_timestamp(col("original_date_time")).cast("double"))

# Prepare the input data for modeling
assembler_gbt = VectorAssembler(inputCols=["original_date_time", "x", "y", "c_PM2_5"], outputCol="features")
train_gbt = assembler_gbt.transform(train_gbt)
test_gbt = assembler_gbt.transform(test_gbt)

# Create a Gradient Boosted Regression model
gbt = GBTRegressor(featuresCol='features', labelCol='c_PM2_5', maxDepth=5)

# specify the output path for the model
model_gbt_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/gradient_boosted_regression_model_PM2_5_campania/"

# create the output directory if it does not exist
if not os.path.exists(model_gbt_output_path):
    os.makedirs(model_gbt_output_path)

# specify the output path for the forecast
forecast_gbt_output_path = "/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/gradient_boosted_regression_forecast_PM2_5_campania/"

# create the output directory if it does not exist
if not os.path.exists(forecast_gbt_output_path):
    os.makedirs(forecast_gbt_output_path)

# Start the timer
training_start_time_gbt = time.time()

# Fit the model to the training data
model_gbt = gbt.fit(train_gbt)

# Stop the timer
training_end_time_gbt = time.time()

# write the model to your personal directory
model_gbt.write().overwrite().save(model_gbt_output_path)

# make predictions on the training data
predictions_train_gbt = model_gbt.transform(train_gbt)

# evaluate the performance of the model on the training data
evaluator = RegressionEvaluator(labelCol="c_PM2_5", predictionCol="prediction")
rmse_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "rmse"})
r2_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "r2"})
mae_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "mae"})
mse_gbt_train = evaluator.evaluate(predictions_train_gbt, {evaluator.metricName: "mse"})


# Print the results with metric names including "Train Data"
print(f"RMSE (Train Data): {rmse_gbt_train}")
print(f"R2 (Train Data): {r2_gbt_train}")
print(f"MSE (Train Data): {mse_gbt_train}")
print(f"MAE (Train Data): {mae_gbt_train}")

# Start the timer
forecast_start_time_gbt = time.time()

# Make predictions on the test data
predictions_gbt = model_gbt.transform(test_gbt)

# Stop the timer
forecast_end_time_gbt = time.time()

# Calculate the time taken for training
training_time_gbt = training_end_time_gbt - training_start_time_gbt
print("Time taken for training: ", training_time_gbt)

# Calculate the time taken for forecasting
forecast_time_gbt = forecast_end_time_gbt - forecast_start_time_gbt
print("Time taken for forecasting: ", forecast_time_gbt)

# write the predictions to a Parquet file
predictions_gbt.write.mode("overwrite").parquet(forecast_gbt_output_path)

# Load the saved predictions from the Parquet file
predictions_gbt = spark.read.parquet(forecast_gbt_output_path)


# Create a Regression Evaluator
evaluator_gbt = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "r2"})

# Compute MSE (mean squared error)
mse_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_gbt = evaluator_gbt.evaluate(predictions_gbt, {evaluator_gbt.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_gbt}")
print(f"R2 (Prediction Data): {r2_gbt}")
print(f"MSE (Prediction Data): {mse_gbt}")
print(f"MAE (Prediction Data): {mae_gbt}")


                                                                                

RMSE (Train Data): 2.314559849692319
R2 (Train Data): 0.905335346957194
MSE (Train Data): 5.357187297807729
MAE (Train Data): 0.4263440603169816
Time taken for training:  145.261323928833
Time taken for forecasting:  0.07826852798461914


                                                                                

RMSE (Prediction Data): 2.2419016327761163
R2 (Prediction Data): 0.8767882452508445
MSE (Prediction Data): 5.026122931044217
MAE (Prediction Data): 0.31506644739244255


# Ensemble Machine learning

In [3]:
predictions_dt_ens = (
    spark.read.parquet("output/decision_tree_regression_forecast_PM2_5/")
    .withColumnRenamed("prediction", "prediction_dt")
)
predictions_rf_ens = (
    spark.read.parquet("/afs/enea.it/por/user/nafis/PFS/tmp/nafi/output/random_forest_regression_forecast_PM2_5/")
    .withColumnRenamed("prediction", "prediction_rf")
)
predictions_gbt_ens = (
    spark.read.parquet("output/gradient_boosted_regression_forecast_PM2_5/")
    .withColumnRenamed("prediction", "prediction_gbt")
)
predictions_gbt_ens.show(5)

from pyspark.ml.evaluation import RegressionEvaluator

# Take the simple average of the predictions
combined_predictions = (
    predictions_dt_ens
    .join(predictions_rf_ens, ["original_date_time", "x", "y", "c_PM2_5"])
    .join(predictions_gbt_ens, ["original_date_time", "x", "y", "c_PM2_5"])
    .withColumn("prediction", (col("prediction_dt") + col("prediction_rf") + col("prediction_gbt")) / 3.0)
    .select("original_date_time", "c_PM2_5", "x", "y", "prediction")
)
combined_predictions.show(5)

# Create a Regression Evaluator
evaluator_ens = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_ens = evaluator_ens.evaluate(combined_predictions, {evaluator_ens.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_ens = evaluator_ens.evaluate(combined_predictions, {evaluator_ens.metricName: "r2"})

# Compute MSE (mean squared error)
mse_ens = evaluator_ens.evaluate(combined_predictions, {evaluator_ens.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_ens = evaluator_ens.evaluate(combined_predictions, {evaluator_ens.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_ens}")
print(f"R2 (Prediction Data): {r2_ens}")
print(f"MSE (Prediction Data): {mse_ens}")
print(f"MAE (Prediction Data): {mae_ens}")

                                                                                

+------------------+---------+-----+------+--------------------+------------------+
|original_date_time|  c_PM2_5|    x|     y|            features|    prediction_gbt|
+------------------+---------+-----+------+--------------------+------------------+
|       1.6446204E9| 4.025283|266.0|4016.0|[1.6446204E9,266....|3.9895026955953923|
|        1.644624E9|4.2530026|266.0|4016.0|[1.644624E9,266.0...| 4.427801237154164|
|       1.6446276E9|4.7990746|266.0|4016.0|[1.6446276E9,266....|4.5823992290612185|
|       1.6446312E9| 5.260711|266.0|4016.0|[1.6446312E9,266....| 5.195336611671761|
|       1.6446348E9| 5.463522|266.0|4016.0|[1.6446348E9,266....| 5.195336611671761|
+------------------+---------+-----+------+--------------------+------------------+
only showing top 5 rows



                                                                                

+------------------+---------+-----+------+------------------+
|original_date_time|  c_PM2_5|    x|     y|        prediction|
+------------------+---------+-----+------+------------------+
|       1.6250904E9|1.8357738|266.0|4392.0| 1.938366765559559|
|       1.6250904E9|0.7923123|270.0|5024.0|  1.40429464357588|
|       1.6250904E9|1.8936615|274.0|4300.0|2.1362267899644922|
|       1.6250904E9|0.7861382|274.0|4980.0|1.4066969016487454|
|       1.6250904E9|0.7695013|278.0|4960.0|1.4119811359678487|
+------------------+---------+-----+------+------------------+
only showing top 5 rows



[Stage 21:>(0 + 96) / 151][Stage 22:> (0 + 0) / 151][Stage 23:> (0 + 0) / 155]1]

23/05/08 09:48:29 ERROR BlockManagerMasterEndpoint: Fail to know the executor driver is alive or not.
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:301)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage.BlockManagerMasterEndpoint$$anonfun$handleBlockRemovalFailure$1.applyOrElse(BlockManagerMasterEndpoint.scala:239)
	at org.apache.spark.storage.BlockManagerMasterEndpoint

23/05/08 09:48:50 WARN NettyRpcEnv: Ignored message: true


[Stage 29:>(0 + 96) / 151][Stage 30:> (0 + 0) / 151][Stage 31:> (0 + 0) / 155]1]

23/05/08 11:46:04 ERROR BlockManagerMasterEndpoint: Fail to know the executor driver is alive or not.
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:301)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage.BlockManagerMasterEndpoint$$anonfun$handleBlockRemovalFailure$1.applyOrElse(BlockManagerMasterEndpoint.scala:239)
	at org.apache.spark.storage.BlockManagerMasterEndpoint

23/05/08 11:46:08 WARN NettyRpcEnv: Ignored message: true




RMSE (Prediction Data): 1.1725690490868297
R2 (Prediction Data): 0.9477559442679713
MSE (Prediction Data): 1.3749181748763921
MAE (Prediction Data): 0.3221226197209398


                                                                                

Lombardia

In [2]:
predictions_dt_ens_lombardia = (
    spark.read.parquet("/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/decision_tree_regression_forecast_PM2_5_lombardia/")
    .withColumnRenamed("prediction", "prediction_dt")
)
predictions_rf_ens_lombardia = (
    spark.read.parquet("/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/random_forest_regression_forecast_PM2_5_lombardia/")
    .withColumnRenamed("prediction", "prediction_rf")
)
predictions_gbt_ens_lombardia = (
    spark.read.parquet("/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/gradient_boosted_regression_forecast_PM2_5_lombardia/")
    .withColumnRenamed("prediction", "prediction_gbt")
)
predictions_gbt_ens_lombardia.show(5)

from pyspark.ml.evaluation import RegressionEvaluator

# Take the simple average of the predictions
combined_predictions_lombardia = (
    predictions_dt_ens_lombardia
    .join(predictions_rf_ens_lombardia, ["original_date_time", "x", "y", "c_PM2_5"])
    .join(predictions_gbt_ens_lombardia, ["original_date_time", "x", "y", "c_PM2_5"])
    .withColumn("prediction", (col("prediction_dt") + col("prediction_rf") + col("prediction_gbt")) / 3.0)
    .select("original_date_time", "c_PM2_5", "x", "y", "prediction")
)
combined_predictions_lombardia.show(5)


# Create a Regression Evaluator
evaluator_ens_lambordia = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_ens_lambordia = evaluator_ens_lambordia.evaluate(combined_predictions_lombardia, {evaluator_ens_lambordia.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_ens_lambordia = evaluator_ens_lambordia.evaluate(combined_predictions_lombardia, {evaluator_ens_lambordia.metricName: "r2"})

# Compute MSE (mean squared error)
mse_ens_lambordia = evaluator_ens_lambordia.evaluate(combined_predictions_lombardia, {evaluator_ens_lambordia.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_ens_lambordia = evaluator_ens_lambordia.evaluate(combined_predictions_lombardia, {evaluator_ens_lambordia.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_ens_lambordia}")
print(f"R2 (Prediction Data): {r2_ens_lambordia}")
print(f"MSE (Prediction Data): {mse_ens_lambordia}")
print(f"MAE (Prediction Data): {mae_ens_lambordia}")

                                                                                

+------------------+------------------+--------+---------+--------------------+------------------+
|original_date_time|           c_PM2_5|       x|        y|            features|    prediction_gbt|
+------------------+------------------+--------+---------+--------------------+------------------+
|       1.6390908E9|          28.36781|462000.0|5016000.0|[1.6390908E9,4620...|29.509494765047442|
|       1.6390944E9|24.782626999999998|462000.0|5016000.0|[1.6390944E9,4620...|25.770688211050427|
|        1.639098E9|         22.501345|462000.0|5016000.0|[1.639098E9,46200...|22.860911274046998|
|       1.6391016E9|         20.493872|462000.0|5016000.0|[1.6391016E9,4620...|20.618532512389326|
|       1.6391052E9|         18.949131|462000.0|5016000.0|[1.6391052E9,4620...|18.636246233624217|
+------------------+------------------+--------+---------+--------------------+------------------+
only showing top 5 rows



                                                                                

+------------------+------------------+--------+---------+------------------+
|original_date_time|           c_PM2_5|       x|        y|        prediction|
+------------------+------------------+--------+---------+------------------+
|       1.6250904E9|        0.80977404|482000.0|5088000.0|1.4367218441419283|
|       1.6250904E9|         1.0493861|486000.0|5060000.0|1.9152067457223598|
|       1.6250904E9|          2.703447|498000.0|5012000.0| 2.697859930249587|
|       1.6250904E9|2.5132527000000002|510000.0|5028000.0|2.7900458910407493|
|       1.6250904E9|          1.501193|518000.0|4972000.0|1.8873936130041187|
+------------------+------------------+--------+---------+------------------+
only showing top 5 rows



[Stage 43:>                                                       (0 + 10) / 10]

RMSE (Prediction Data): 1.3756927823676735
R2 (Prediction Data): 0.9849632477806485
MSE (Prediction Data): 1.8925306314585117
MAE (Prediction Data): 0.5007091074118767


                                                                                

Lazio

In [11]:
predictions_dt_ens_lazio = (
    spark.read.parquet("/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/decision_tree_regression_forecast_PM2_5_lazio/")
    .withColumnRenamed("prediction", "prediction_dt")
)
predictions_rf_ens_lazio = (
    spark.read.parquet("/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/random_forest_regression_forecast_PM2_5_lazio/")
    .withColumnRenamed("prediction", "prediction_rf")
)
predictions_gbt_ens_lazio = (
    spark.read.parquet("/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/gradient_boosted_regression_forecast_PM2_5_lazio/")
    .withColumnRenamed("prediction", "prediction_gbt")
)
predictions_gbt_ens_lazio.show(5)

from pyspark.ml.evaluation import RegressionEvaluator

# Take the simple average of the predictions
combined_predictions_lazio = (
    predictions_dt_ens_lazio
    .join(predictions_rf_ens_lazio, ["original_date_time", "x", "y", "c_PM2_5"])
    .join(predictions_gbt_ens_lazio, ["original_date_time", "x", "y", "c_PM2_5"])
    .withColumn("prediction", (col("prediction_dt") + col("prediction_rf") + col("prediction_gbt")) / 3.0)
    .select("original_date_time", "c_PM2_5", "x", "y", "prediction")
)
combined_predictions_lazio.show(5)


# Create a Regression Evaluator
evaluator_ens_lazio = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_ens_lazio = evaluator_ens_lazio.evaluate(combined_predictions_lazio, {evaluator_ens_lazio.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_ens_lazio = evaluator_ens_lazio.evaluate(combined_predictions_lazio, {evaluator_ens_lazio.metricName: "r2"})

# Compute MSE (mean squared error)
mse_ens_lazio = evaluator_ens_lazio.evaluate(combined_predictions_lazio, {evaluator_ens_lazio.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_ens_lazio = evaluator_ens_lazio.evaluate(combined_predictions_lazio, {evaluator_ens_lazio.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_ens_lazio}")
print(f"R2 (Prediction Data): {r2_ens_lazio}")
print(f"MSE (Prediction Data): {mse_ens_lazio}")
print(f"MAE (Prediction Data): {mae_ens_lazio}")

                                                                                

+------------------+-----------------+--------+---------+--------------------+-----------------+
|original_date_time|          c_PM2_5|       x|        y|            features|   prediction_gbt|
+------------------+-----------------+--------+---------+--------------------+-----------------+
|       1.6295832E9|        3.9943345|702000.0|4696000.0|[1.6295832E9,7020...|4.005598390432201|
|       1.6295868E9|4.508026999999999|702000.0|4696000.0|[1.6295868E9,7020...|4.503454314178332|
|       1.6295904E9|        4.8864765|702000.0|4696000.0|[1.6295904E9,7020...|4.806861088845714|
|        1.629594E9|        5.1068788|702000.0|4696000.0|[1.629594E9,70200...|4.806861088845714|
|       1.6295976E9|         5.176116|702000.0|4696000.0|[1.6295976E9,7020...|5.361793711605291|
+------------------+-----------------+--------+---------+--------------------+-----------------+
only showing top 5 rows



                                                                                

+------------------+---------+--------+---------+------------------+
|original_date_time|  c_PM2_5|       x|        y|        prediction|
+------------------+---------+--------+---------+------------------+
|       1.6250904E9|3.4351873|734000.0|4684000.0|3.3198713808305236|
|       1.6250904E9|  2.70562|746000.0|4692000.0|2.7862845952834294|
|       1.6250904E9|2.2483215|758000.0|4652000.0| 2.456650295078754|
|       1.6250904E9| 2.263646|774000.0|4668000.0|2.4132768723066143|
|       1.6250904E9|2.5161965|778000.0|4692000.0|2.6230998518079645|
+------------------+---------+--------+---------+------------------+
only showing top 5 rows





RMSE (Prediction Data): 1.3497555798033924
R2 (Prediction Data): 0.9396922092084667
MSE (Prediction Data): 1.821840125210392
MAE (Prediction Data): 0.3099513955177884


                                                                                

Campania

In [8]:
predictions_dt_ens_campania = (
    spark.read.parquet("/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/decision_tree_regression_forecast_PM2_5_campania/")
    .withColumnRenamed("prediction", "prediction_dt")
)
predictions_rf_ens_campania = (
    spark.read.parquet("/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/random_forest_regression_forecast_PM2_5_campania/")
    .withColumnRenamed("prediction", "prediction_rf")
)
predictions_gbt_ens_campania = (
    spark.read.parquet("/afs/enea.it/por/user/nafis/PFS/tmp/nafi2/nafi/output/gradient_boosted_regression_forecast_PM2_5_campania/")
    .withColumnRenamed("prediction", "prediction_gbt")
)
predictions_gbt_ens_campania.show(5)

from pyspark.ml.evaluation import RegressionEvaluator

# Take the simple average of the predictions
combined_predictions_campania = (
    predictions_dt_ens_campania
    .join(predictions_rf_ens_campania, ["original_date_time", "x", "y", "c_PM2_5"])
    .join(predictions_gbt_ens_campania, ["original_date_time", "x", "y", "c_PM2_5"])
    .withColumn("prediction", (col("prediction_dt") + col("prediction_rf") + col("prediction_gbt")) / 3.0)
    .select("original_date_time", "c_PM2_5", "x", "y", "prediction")
)
combined_predictions_campania.show(5)


# Create a Regression Evaluator
evaluator_ens_campania = RegressionEvaluator(predictionCol="prediction", labelCol="c_PM2_5")

# Compute RMSE (root mean squared error)
rmse_ens_campania = evaluator_ens_campania.evaluate(combined_predictions_campania, {evaluator_ens_campania.metricName: "rmse"})

# Compute R2 (coefficient of determination)
r2_ens_campania = evaluator_ens_campania.evaluate(combined_predictions_campania, {evaluator_ens_campania.metricName: "r2"})

# Compute MSE (mean squared error)
mse_ens_campania = evaluator_ens_campania.evaluate(combined_predictions_campania, {evaluator_ens_campania.metricName: "mse"})

# Compute MAE (mean absolute error)
mae_ens_campania = evaluator_ens_campania.evaluate(combined_predictions_campania, {evaluator_ens_campania.metricName: "mae"})

# Print the results with metric names including "Prediction Data"
print(f"RMSE (Prediction Data): {rmse_ens_campania}")
print(f"R2 (Prediction Data): {r2_ens_campania}")
print(f"MSE (Prediction Data): {mse_ens_campania}")
print(f"MAE (Prediction Data): {mae_ens_campania}")

+------------------+------------------+--------+---------+--------------------+------------------+
|original_date_time|           c_PM2_5|       x|        y|            features|    prediction_gbt|
+------------------+------------------+--------+---------+--------------------+------------------+
|       1.6339032E9|          1.766012|902000.0|4576000.0|[1.6339032E9,9020...|1.7723182320484392|
|       1.6339068E9|2.0191703000000003|902000.0|4576000.0|[1.6339068E9,9020...|2.0334144358319204|
|       1.6339104E9|         2.3572073|902000.0|4576000.0|[1.6339104E9,9020...|2.2606373697290754|
|        1.633914E9|2.6865732999999996|902000.0|4576000.0|[1.633914E9,90200...| 2.651789357333835|
|       1.6339176E9|2.8155707999999997|902000.0|4576000.0|[1.6339176E9,9020...|2.8298444480703786|
+------------------+------------------+--------+---------+--------------------+------------------+
only showing top 5 rows



                                                                                

+------------------+------------------+--------+---------+------------------+
|original_date_time|           c_PM2_5|       x|        y|        prediction|
+------------------+------------------+--------+---------+------------------+
|       1.6250904E9|          2.039516|910000.0|4568000.0| 2.472822088430815|
|       1.6250904E9|1.9091046999999999|910000.0|4576000.0|2.4665097583009317|
|       1.6250904E9|1.8431568999999999|914000.0|4592000.0|2.3084423310166193|
|       1.6250904E9|         2.4410367|934000.0|4572000.0| 2.540594725755868|
|       1.6250904E9|         1.5820067|934000.0|4604000.0|2.2900944247825765|
+------------------+------------------+--------+---------+------------------+
only showing top 5 rows





RMSE (Prediction Data): 2.1836734371962385
R2 (Prediction Data): 0.8831054082102053
MSE (Prediction Data): 4.7684296803164345
MAE (Prediction Data): 0.3798199086679054




In [10]:
spark.stop()