In [1]:
import findspark
findspark.init('/usr/local/spark')
import pyspark

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Python Linear Regression example").getOrCreate()

In [3]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import *

In [4]:
data = spark.read.load("linregdata1.csv", format="csv", sep=",", inferSchema="true", header="true")
data.printSchema()

root
 |-- temperature: double (nullable = true)
 |-- exhaust_vacuum: double (nullable = true)
 |-- ambient_pressure: double (nullable = true)
 |-- relative_humidity: double (nullable = true)
 |-- energy_output: double (nullable = true)



In [5]:
data.describe()

DataFrame[summary: string, temperature: string, exhaust_vacuum: string, ambient_pressure: string, relative_humidity: string, energy_output: string]

In [6]:
features = ["temperature", "exhaust_vacuum", "ambient_pressure", "relative_humidity"]

In [7]:
lr_data = data.select(col("energy_output").alias("label"), *features)
lr_data.printSchema()

root
 |-- label: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- exhaust_vacuum: double (nullable = true)
 |-- ambient_pressure: double (nullable = true)
 |-- relative_humidity: double (nullable = true)



In [8]:
lr_data.show()

+------+-----------+--------------+----------------+-----------------+
| label|temperature|exhaust_vacuum|ambient_pressure|relative_humidity|
+------+-----------+--------------+----------------+-----------------+
|480.48|       8.34|         40.77|         1010.84|            90.01|
|445.75|      23.64|         58.49|          1011.4|             74.2|
|438.76|      29.74|          56.9|         1007.15|            41.91|
|453.09|      19.07|         49.69|         1007.22|            76.79|
|464.43|       11.8|         40.66|         1017.13|             97.2|
|470.96|      13.97|         39.16|         1016.05|             84.6|
|442.35|       22.1|         71.29|          1008.2|            75.38|
| 464.0|      14.47|         41.76|         1021.98|            78.41|
|428.77|      31.25|         69.51|         1010.25|            36.83|
|484.31|       6.77|         38.18|          1017.8|            81.13|
|435.29|      28.28|         68.67|         1006.36|             69.9|
|451.4

VectorAssembler is a transformer that combines a given list of columns into a single vector column.

In [9]:
vectorAssembler = VectorAssembler(inputCols=features, outputCol="unscaled_features")

In [10]:
va_data = vectorAssembler.transform(lr_data)

In [11]:
va_data.show(truncate=False)

+------+-----------+--------------+----------------+-----------------+---------------------------+
|label |temperature|exhaust_vacuum|ambient_pressure|relative_humidity|unscaled_features          |
+------+-----------+--------------+----------------+-----------------+---------------------------+
|480.48|8.34       |40.77         |1010.84         |90.01            |[8.34,40.77,1010.84,90.01] |
|445.75|23.64      |58.49         |1011.4          |74.2             |[23.64,58.49,1011.4,74.2]  |
|438.76|29.74      |56.9          |1007.15         |41.91            |[29.74,56.9,1007.15,41.91] |
|453.09|19.07      |49.69         |1007.22         |76.79            |[19.07,49.69,1007.22,76.79]|
|464.43|11.8       |40.66         |1017.13         |97.2             |[11.8,40.66,1017.13,97.2]  |
|470.96|13.97      |39.16         |1016.05         |84.6             |[13.97,39.16,1016.05,84.6] |
|442.35|22.1       |71.29         |1008.2          |75.38            |[22.1,71.29,1008.2,75.38]  |
|464.0 |14

In [12]:
va_data.show?

StandardScaler transforms a dataset of Vector rows, normalizing each feature to have unit standard deviation or zero mean.
Uses 'withStd' by default i.e. scales the data to unit standard deviation.

In [13]:
standardScaler = StandardScaler(inputCol="unscaled_features", outputCol="features")

In [14]:
ss_model = standardScaler.fit(va_data)

In [15]:
ss_data = ss_model.transform(va_data)

In [16]:
ss_data.show(truncate=False)

+------+-----------+--------------+----------------+-----------------+---------------------------+-----------------------------------------------------------------------------+
|label |temperature|exhaust_vacuum|ambient_pressure|relative_humidity|unscaled_features          |features                                                                     |
+------+-----------+--------------+----------------+-----------------+---------------------------+-----------------------------------------------------------------------------+
|480.48|8.34       |40.77         |1010.84         |90.01            |[8.34,40.77,1010.84,90.01] |[1.1190915744403476,3.208242310929751,170.20993692880273,6.164955008688884]  |
|445.75|23.64      |58.49         |1011.4          |74.2             |[23.64,58.49,1011.4,74.2]  |[3.1721012973345104,4.602651281978933,170.30423233131958,5.082098229582438]  |
|438.76|29.74      |56.9          |1007.15         |41.91            |[29.74,56.9,1007.15,41.91] |[3.99062151365179

In [17]:
(training, test) = ss_data.randomSplit([.7, .3])

In [18]:
training.describe().show()

+-------+-----------------+------------------+------------------+------------------+------------------+
|summary|            label|       temperature|    exhaust_vacuum|  ambient_pressure| relative_humidity|
+-------+-----------------+------------------+------------------+------------------+------------------+
|  count|             6663|              6663|              6663|              6663|              6663|
|   mean|454.3386252438834|19.667819300615335|54.365883235779556|1013.1579348641765| 73.35753714542993|
| stddev|17.08457742969459| 7.468036446082565| 12.79789559883989| 5.905418548699453|14.590934967675901|
|    min|           420.26|              1.81|             25.36|            993.11|             25.56|
|    max|           495.76|             37.11|             80.25|            1033.3|            100.16|
+-------+-----------------+------------------+------------------+------------------+------------------+



In [19]:
test.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+
|summary|             label|       temperature|    exhaust_vacuum|  ambient_pressure| relative_humidity|
+-------+------------------+------------------+------------------+------------------+------------------+
|  count|              2905|              2905|              2905|              2905|              2905|
|   mean|454.42552495697134|19.613184165232372| 54.16800344234087|1013.4910636833039| 73.19760068846809|
| stddev|17.029381741611648| 7.417795677674531|12.500104900661233|  6.00920719221178|14.623559345784107|
|    min|            421.57|              2.34|             25.36|            992.89|              26.3|
|    max|            495.35|             35.77|             81.56|           1033.14|            100.15|
+-------+------------------+------------------+------------------+------------------+------------------+



In [20]:
lr = LinearRegression(maxIter=10, regParam=.01)

In [21]:
lr_model = lr.fit(training)

Now that the linear regression model is built we can apply it on the test data using transform method.
Before that we can look at the characteristics of our model i.e. coefficients and other parameters.

In [22]:
lr_model.coefficients

DenseVector([-14.5633, -3.1045, 0.3747, -2.3065])

In [23]:
lr_model.intercept

453.71245582189823

In [24]:
trainingSummary = lr_model.summary

In [25]:
trainingSummary.rootMeanSquaredError

4.559079784661985

In [26]:
trainingSummary.meanAbsoluteError

3.6425712243730457

In [27]:
trainingSummary.meanSquaredError

20.785208482913568

In [28]:
trainingSummary.r2

0.9287785047672007

In [29]:
prediction_df = lr_model.transform(test)

In [30]:
prediction_df.show(truncate=False)

+------+-----------+--------------+----------------+-----------------+---------------------------+---------------------------------------------------------------------------+------------------+
|label |temperature|exhaust_vacuum|ambient_pressure|relative_humidity|unscaled_features          |features                                                                   |prediction        |
+------+-----------+--------------+----------------+-----------------+---------------------------+---------------------------------------------------------------------------+------------------+
|421.57|23.0       |66.05         |1020.61         |80.29            |[23.0,66.05,1020.61,80.29] |[3.0862237664422056,5.197557140959285,171.85505493342703,5.499213838991563]|444.34766660249943|
|425.11|32.56      |68.14         |1004.02         |35.04            |[32.56,68.14,1004.02,35.04]|[4.36901938414601,5.362021855941948,169.06155363386543,2.3999558216249137] |431.2571222848788 |
|425.14|29.67      |71.98     

In [31]:
prediction_df.select("label","prediction").show(truncate=False)

+------+------------------+
|label |prediction        |
+------+------------------+
|421.57|444.34766660249943|
|425.11|431.2571222848788 |
|425.14|430.8709414434672 |
|425.21|431.8257594465493 |
|425.21|429.43086414885784|
|425.29|433.78040475285763|
|425.61|427.6016457063664 |
|425.64|433.0896499748003 |
|425.66|430.83891143029285|
|425.74|430.461777859339  |
|425.75|432.0571565295414 |
|425.75|430.23223281414613|
|425.89|426.59497821455545|
|426.13|433.62288014207854|
|426.14|431.1211689892353 |
|426.15|430.3720065116232 |
|426.25|433.4632943699166 |
|426.25|431.44253906972045|
|426.31|430.0317309985653 |
|426.48|432.0194798661805 |
+------+------------------+
only showing top 20 rows



In [32]:
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")

In [33]:
rmse = eval.evaluate(prediction_df)
print("RMSE: %.3f" % rmse)

RMSE: 4.556


In [34]:
mse = eval.evaluate(prediction_df, {eval.metricName: "mse"})
print("MSE: %.3f" % mse)

MSE: 20.758


In [35]:
mae = eval.evaluate(prediction_df, {eval.metricName: "mae"})
print("MAE: %.3f" % mae)

MAE: 3.602


In [36]:
r2 = eval.evaluate(prediction_df, {eval.metricName: "r2"})
print("r2: %.3f" %r2)

r2: 0.928
