In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

# Start (or reuse) the Spark session that every later cell runs against.
spark = (
    SparkSession.builder
    .appName("Regression in PySpark")
    .getOrCreate()
)

# First pass: read the CSV with the reader's defaults (no header handling, no
# schema inference), so the columns come back as _c0.._c4 strings.
pp_df = spark.read.csv(
    "/Users/kanp/Other Coding/Spark/LinkedIn_SparkforML/CCPP/power_plant.csv"
)
pp_df

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string]

In [4]:
# Re-read the same file, this time taking column names from the first row and
# letting Spark infer the column types (all five columns come back as double).
pp_df = spark.read.csv(
    "/Users/kanp/Other Coding/Spark/LinkedIn_SparkforML/CCPP/power_plant.csv",
    header=True,
    inferSchema=True,
)
pp_df

DataFrame[AT: double, V: double, AP: double, RH: double, PE: double]

In [5]:
from pyspark.ml.feature import VectorAssembler
# Pack the four predictor columns into a single vector column named "features".
# PE is left as its own column; later cells use it as the label.
vectorAssembler = VectorAssembler(
    inputCols=["AT", "V", "AP", "RH"],
    outputCol="features",
)
vpp_df = vectorAssembler.transform(pp_df)
vpp_df.take(1)  # peek at one assembled row

[Row(AT=14.96, V=41.76, AP=1024.07, RH=73.17, PE=463.26, features=DenseVector([14.96, 41.76, 1024.07, 73.17]))]

In [7]:
# Linear regression
# The model is fitted on the full assembled dataset (vpp_df); no train/test
# split is applied in this cell.
lr = LinearRegression(
    featuresCol="features",
    labelCol="PE",
)
lr_model = lr.fit(vpp_df)
lr_model.coefficients  # one coefficient per assembled input column

DenseVector([-1.9775, -0.2339, 0.0621, -0.1581])

In [8]:
# Intercept term of the fitted linear regression model (lr_model).
lr_model.intercept

454.6092744523414

In [10]:
# RMSE taken from the model's training summary. lr_model was fitted on all of
# vpp_df, so this is an in-sample error rather than a held-out test error.
lr_model.summary.rootMeanSquaredError

4.557126016749488

In [11]:
# Persist the fitted model to "lr1.model". A plain save() raises when the target
# path already exists, which breaks re-running the notebook from a fresh kernel;
# going through the writer with overwrite() keeps this cell re-runnable.
lr_model.write().overwrite().save("lr1.model")

In [15]:
#Decision tree regression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Randomly split the assembled data into roughly 70% training / 30% test rows.
# A fixed seed makes the split -- and therefore every metric computed from
# train_df / test_df -- reproducible across kernel restarts.
splits = vpp_df.randomSplit([0.7, 0.3], seed=42)
train_df = splits[0]
test_df = splits[1]
# Only the last expression of a cell is rendered, so the train/test counts are
# computed here but not displayed; the shown value is the total row count.
train_df.count()
test_df.count()
vpp_df.count()

9568

In [18]:
# Fit a decision-tree regressor on the training split, then score the test split.
dt = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="PE",
)
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)  # adds a "prediction" column
dt_predictions.show()

+----+-----+-------+-----+------+--------------------+-----------------+
|  AT|    V|     AP|   RH|    PE|            features|       prediction|
+----+-----+-------+-----+------+--------------------+-----------------+
|2.34|39.42|1028.47|69.68|490.34|[2.34,39.42,1028....|487.2847826086957|
|2.64|39.64|1011.02|85.24|481.29|[2.64,39.64,1011....|487.2847826086957|
| 3.4|39.64| 1011.1|83.43|459.86|[3.4,39.64,1011.1...|487.2847826086957|
|3.63|38.44|1016.16|87.38|487.87|[3.63,38.44,1016....|487.2847826086957|
| 4.0| 39.9|1008.46|97.21|490.68|[4.0,39.9,1008.46...|487.2847826086957|
|4.04|35.47|1017.51|87.35|486.86|[4.04,35.47,1017....|487.2847826086957|
|4.16|35.47|1017.72|88.49| 486.7|[4.16,35.47,1017....|487.2847826086957|
|4.32|35.47| 1017.8|88.51|488.03|[4.32,35.47,1017....|487.2847826086957|
|4.47|35.19|1018.78|92.68|487.43|[4.47,35.19,1018....|487.2847826086957|
|4.56|40.27|1011.13|80.24|493.87|[4.56,40.27,1011....|487.2847826086957|
|4.61|40.27|1012.32|77.28|492.85|[4.61,40.27,1012..

In [20]:
# Root-mean-squared error of the decision-tree predictions against the PE label
# on the test split.
dt_evaluator = RegressionEvaluator(
    labelCol="PE",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = dt_evaluator.evaluate(dt_predictions)
rmse

4.462616881290915

In [21]:
#GBT
from pyspark.ml.regression import GBTRegressor
# Fit a gradient-boosted-trees regressor on the same training split and score
# the same test split used for the decision tree.
gbt = GBTRegressor(
    featuresCol="features",
    labelCol="PE",
)
gbt_model = gbt.fit(train_df)
gbt_prediction = gbt_model.transform(test_df)  # adds a "prediction" column
gbt_prediction.show()

+----+-----+-------+-----+------+--------------------+------------------+
|  AT|    V|     AP|   RH|    PE|            features|        prediction|
+----+-----+-------+-----+------+--------------------+------------------+
|2.34|39.42|1028.47|69.68|490.34|[2.34,39.42,1028....|486.72025037114895|
|2.64|39.64|1011.02|85.24|481.29|[2.64,39.64,1011....| 488.6760539996318|
| 3.4|39.64| 1011.1|83.43|459.86|[3.4,39.64,1011.1...|488.58321214975587|
|3.63|38.44|1016.16|87.38|487.87|[3.63,38.44,1016....|  488.401765462889|
| 4.0| 39.9|1008.46|97.21|490.68|[4.0,39.9,1008.46...| 488.6068085945434|
|4.04|35.47|1017.51|87.35|486.86|[4.04,35.47,1017....|487.59638617706605|
|4.16|35.47|1017.72|88.49| 486.7|[4.16,35.47,1017....|487.59638617706605|
|4.32|35.47| 1017.8|88.51|488.03|[4.32,35.47,1017....|487.59638617706605|
|4.47|35.19|1018.78|92.68|487.43|[4.47,35.19,1018....|487.59638617706605|
|4.56|40.27|1011.13|80.24|493.87|[4.56,40.27,1011....|488.34054418154017|
|4.61|40.27|1012.32|77.28|492.85|[4.61

In [22]:
# Root-mean-squared error of the GBT predictions against the PE label on the
# test split.
gbt_evaluator = RegressionEvaluator(
    labelCol="PE",
    predictionCol="prediction",
    metricName="rmse",
)
gbt_rmse = gbt_evaluator.evaluate(gbt_prediction)
gbt_rmse

4.02026427710849