# Spark-ML Regression

In [1]:
import os

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler, StringIndexer
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, GBTRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Obtain the SparkSession used by every cell below: reuse the active one if it
# exists, otherwise start a new session with default settings.
spark = SparkSession.builder.getOrCreate()

In [3]:
# Location of power.csv. Set the POWER_CSV_PATH environment variable to run the
# notebook on another machine; when it is unset the original local path is used,
# so existing behaviour is unchanged.
data_path = os.environ.get('POWER_CSV_PATH', '/home/lorenzo/spark-repo/0_data/power.csv')

# The first row of the file holds the column names; let Spark infer the column
# types instead of reading every column as a string.
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Preview the first 10 rows.
df.show(10)

+-----+-----+-------+-----+------+
|   AT|    V|     AP|   RH|    PE|
+-----+-----+-------+-----+------+
|14.96|41.76|1024.07|73.17|463.26|
|25.18|62.96|1020.04|59.08|444.37|
| 5.11| 39.4|1012.16|92.14|488.56|
|20.86|57.32|1010.24|76.64|446.48|
|10.82| 37.5|1009.23|96.62| 473.9|
|26.27|59.44|1012.23|58.77|443.67|
|15.89|43.96|1014.02|75.24|467.35|
| 9.48|44.71|1019.12|66.43|478.42|
|14.64| 45.0|1021.78|41.25|475.98|
|11.74|43.56|1015.14|70.72| 477.5|
+-----+-----+-------+-----+------+
only showing top 10 rows



In [4]:
# Number of rows read from the CSV.
df.count()

9568

### Vectorized dataframe

In [5]:
# Pack the four predictor columns into a single vector column named "features".
# PE is left out because it is used as the regression label further down.
predictor_cols = ["AT", "V", "AP", "RH"]
va = VectorAssembler(outputCol="features", inputCols=predictor_cols)
vect_df = va.transform(df)

In [6]:
# Preview the first 10 rows, now including the assembled "features" column.
vect_df.show(10)

+-----+-----+-------+-----+------+--------------------+
|   AT|    V|     AP|   RH|    PE|            features|
+-----+-----+-------+-----+------+--------------------+
|14.96|41.76|1024.07|73.17|463.26|[14.96,41.76,1024...|
|25.18|62.96|1020.04|59.08|444.37|[25.18,62.96,1020...|
| 5.11| 39.4|1012.16|92.14|488.56|[5.11,39.4,1012.1...|
|20.86|57.32|1010.24|76.64|446.48|[20.86,57.32,1010...|
|10.82| 37.5|1009.23|96.62| 473.9|[10.82,37.5,1009....|
|26.27|59.44|1012.23|58.77|443.67|[26.27,59.44,1012...|
|15.89|43.96|1014.02|75.24|467.35|[15.89,43.96,1014...|
| 9.48|44.71|1019.12|66.43|478.42|[9.48,44.71,1019....|
|14.64| 45.0|1021.78|41.25|475.98|[14.64,45.0,1021....|
|11.74|43.56|1015.14|70.72| 477.5|[11.74,43.56,1015...|
+-----+-----+-------+-----+------+--------------------+
only showing top 10 rows



### Train-test set split

In [7]:
# Randomly split the rows 75% / 25% into training and test sets, using a fixed seed of 1.
splits = vect_df.randomSplit(weights=[0.75, 0.25], seed=1)
train_df, test_df = splits

# Report how many rows landed in each partition.
print(f'Train df length: {train_df.count()}')
print(f'Test df length: {test_df.count()}')

Train df length: 7155
Test df length: 2413


### Linear Regression

In [8]:
# Fit a linear regression (default hyperparameters) that predicts PE from the
# assembled feature vector, using the training rows only.
lr = LinearRegression(labelCol='PE', featuresCol='features')
lr_model = lr.fit(train_df)

# Score the test rows and preview the first 10 predictions next to the true PE values.
lr_preds = lr_model.transform(test_df)
lr_preds.show(n=10)

+----+-----+-------+-----+------+--------------------+------------------+
|  AT|    V|     AP|   RH|    PE|            features|        prediction|
+----+-----+-------+-----+------+--------------------+------------------+
|2.71|39.42|1026.66|81.11| 489.3|[2.71,39.42,1026....|490.82415199958245|
|3.26|41.31| 996.32|100.0|489.38|[3.26,41.31,996.3...| 484.6680881425009|
| 3.6|35.19|1018.73| 99.1|488.98|[3.6,35.19,1018.7...|  486.810803461217|
|3.68|39.64|1011.31|84.05|490.02|[3.68,39.64,1011....| 487.5612268795559|
|3.73|39.42| 1024.4|82.42|488.58|[3.73,39.42,1024....| 488.4824229410993|
|3.91|35.47|1016.92|86.03|488.67|[3.91,35.47,1016....| 488.0916328544706|
|3.99| 39.9|1009.74|96.81|490.91|[3.99,39.9,1009.7...|  484.793805921224|
|4.08|35.19|1018.87|97.07|489.44|[4.08,35.19,1018....|486.19077778282553|
|4.15| 39.9|1007.62|95.69| 489.8|[4.15,39.9,1007.6...|484.53923656301646|
|4.16|35.47|1017.72|88.49| 486.7|[4.16,35.47,1017....| 487.2545181467404|
+----+-----+-------+-----+------+--------------------+------------------+

In [9]:
# Fitted weights of the linear model, one per assembled input column
# (AT, V, AP, RH), in that order.
lr_model.coefficients

DenseVector([-1.9733, -0.2377, 0.0543, -0.1574])

In [10]:
# Fitted intercept (bias) term of the linear model.
lr_model.intercept

462.54757449414075

In [11]:
# RMSE of the linear model as reported by its training summary.
# NOTE(review): `summary` holds metrics computed on the data passed to fit()
# (train_df), so this figure is a training-set RMSE. The random forest and
# gradient boosting RMSEs later in the notebook are computed on test_df, so the
# numbers are not directly comparable. For a like-for-like value use
# `lr_model.evaluate(test_df).rootMeanSquaredError`.
lr_model.summary.rootMeanSquaredError

4.5899552591471

### Random Forest

In [12]:
# Random forest of 100 trees predicting PE from the feature vector. The seed is
# fixed at 352.
rf_params = dict(featuresCol='features', labelCol='PE', numTrees=100, seed=352)
rf = RandomForestRegressor(**rf_params)
rf_model = rf.fit(train_df)

# Score the test rows and preview the first 10 predictions.
rf_preds = rf_model.transform(test_df)
rf_preds.show(n=10)

+----+-----+-------+-----+------+--------------------+------------------+
|  AT|    V|     AP|   RH|    PE|            features|        prediction|
+----+-----+-------+-----+------+--------------------+------------------+
|2.71|39.42|1026.66|81.11| 489.3|[2.71,39.42,1026....|483.71364121704397|
|3.26|41.31| 996.32|100.0|489.38|[3.26,41.31,996.3...| 482.2969900958718|
| 3.6|35.19|1018.73| 99.1|488.98|[3.6,35.19,1018.7...|482.85797131897345|
|3.68|39.64|1011.31|84.05|490.02|[3.68,39.64,1011....|484.08564411681783|
|3.73|39.42| 1024.4|82.42|488.58|[3.73,39.42,1024....| 483.7607579896588|
|3.91|35.47|1016.92|86.03|488.67|[3.91,35.47,1016....| 484.1826771168146|
|3.99| 39.9|1009.74|96.81|490.91|[3.99,39.9,1009.7...|483.52673851896515|
|4.08|35.19|1018.87|97.07|489.44|[4.08,35.19,1018....|482.85797131897345|
|4.15| 39.9|1007.62|95.69| 489.8|[4.15,39.9,1007.6...|483.16941713315913|
|4.16|35.47|1017.72|88.49| 486.7|[4.16,35.47,1017....| 484.1410256664014|
+----+-----+-------+-----+------+--------------------+------------------+

In [13]:
# RMSE of the random forest on the test-set predictions
# (true values in "PE", model output in "prediction").
evaluator = RegressionEvaluator(metricName='rmse', predictionCol='prediction', labelCol='PE')
rf_rmse = evaluator.evaluate(rf_preds)
print(f'Random forest RMSE: {rf_rmse}')

Random forest RMSE: 4.218899624237537


### Gradient Boosting

In [14]:
# Gradient-boosted trees regressor (default hyperparameters) predicting PE from
# the feature vector, fitted on the training rows.
gbt = GBTRegressor(labelCol='PE', featuresCol='features')
gbt_model = gbt.fit(train_df)

# Score the test rows and preview the first 10 predictions.
gbt_preds = gbt_model.transform(test_df)
gbt_preds.show(n=10)

+----+-----+-------+-----+------+--------------------+------------------+
|  AT|    V|     AP|   RH|    PE|            features|        prediction|
+----+-----+-------+-----+------+--------------------+------------------+
|2.71|39.42|1026.66|81.11| 489.3|[2.71,39.42,1026....|485.05875573066527|
|3.26|41.31| 996.32|100.0|489.38|[3.26,41.31,996.3...|485.66090010615886|
| 3.6|35.19|1018.73| 99.1|488.98|[3.6,35.19,1018.7...|487.15563773870065|
|3.68|39.64|1011.31|84.05|490.02|[3.68,39.64,1011....|484.72530227228066|
|3.73|39.42| 1024.4|82.42|488.58|[3.73,39.42,1024....| 484.5339998437822|
|3.91|35.47|1016.92|86.03|488.67|[3.91,35.47,1016....|487.18747465964606|
|3.99| 39.9|1009.74|96.81|490.91|[3.99,39.9,1009.7...| 486.6459902874491|
|4.08|35.19|1018.87|97.07|489.44|[4.08,35.19,1018....|487.15563773870065|
|4.15| 39.9|1007.62|95.69| 489.8|[4.15,39.9,1007.6...| 486.5389263381428|
|4.16|35.47|1017.72|88.49| 486.7|[4.16,35.47,1017....| 487.1332222257253|
+----+-----+-------+-----+------+--------------------+------------------+

In [15]:
# RMSE of the gradient boosting model on the test-set predictions
# (true values in "PE", model output in "prediction").
evaluator = RegressionEvaluator(metricName='rmse', predictionCol='prediction', labelCol='PE')
gbt_rmse = evaluator.evaluate(gbt_preds)
print(f'Gradient boosting RMSE: {gbt_rmse}')

Gradient boosting RMSE: 3.9609843690603883
