In [0]:
import pandas as pd

In [0]:
# Load the Boston housing data into a single pandas frame `df`
# (13 lower-cased feature columns plus a 'target' column).
import numpy as np
from sklearn.utils import Bunch

try:
    # Available up to scikit-learn 1.1 (emits a FutureWarning on 1.0/1.1).
    from sklearn.datasets import load_boston

    boston = load_boston()
except ImportError:
    # load_boston was removed in scikit-learn 1.2. Rebuild the same arrays from
    # the original source file, following the recipe given in scikit-learn's
    # deprecation notice. Each record spans two physical lines in that file:
    # 11 values on the first line, then 2 more features and the target.
    BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
    raw = pd.read_csv(BOSTON_URL, sep=r"\s+", skiprows=22, header=None)
    boston = Bunch(
        data=np.hstack([raw.values[::2, :], raw.values[1::2, :2]]),
        target=raw.values[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
    )

# Features, with lower-cased column names.
X = pd.DataFrame(boston.data, columns=[name.lower() for name in boston.feature_names])
# The regression label as a one-column frame.
y = pd.DataFrame(boston.target, columns=['target'])
# Features and label side by side; both share the default RangeIndex.
df = pd.concat([X, y], axis=1)
df.head(5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [0]:
# Show the column labels of the combined feature/target frame.
df.columns

In [0]:
# Print pandas' summary of df (per-column dtype and non-null count).
df.info()

In [0]:
# Convert the pandas frame into a Spark DataFrame.
# NOTE(review): `spark` and `display` are not defined or imported anywhere in
# this notebook — presumably injected by the notebook runtime; confirm.
spark_df = spark.createDataFrame(df)
# Preview the first five rows.
display(spark_df.head(5))

crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,target
0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


# Pipeline

In [0]:
# Split the rows with 0.8 / 0.2 weights for train / test; the fixed seed keeps
# the split repeatable.
# NOTE(review): trainDF/testDF are rebound later in the "No Pipeline" section,
# so the cells below depend on which split cell ran last.
trainDF, testDF = spark_df.randomSplit([0.8, 0.2], seed=42)
# cache() marks the training split for reuse; count() prints its row count.
print(trainDF.cache().count())
print(testDF.count())

In [0]:
# Preview the first five rows of the training split.
display(trainDF.head(5))

crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,target
0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
0.01311,90.0,1.22,0.0,0.403,7.249,21.9,8.6966,5.0,226.0,17.9,395.93,4.81,35.4
0.01432,100.0,1.32,0.0,0.411,6.816,40.5,8.3248,5.0,256.0,15.1,392.9,3.95,31.6
0.02055,85.0,0.74,0.0,0.41,6.383,35.7,9.1876,2.0,313.0,17.3,396.9,5.77,24.7
0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7


In [0]:
from pyspark.ml.feature import VectorAssembler

# The thirteen predictor columns; 'target' is left out because it is the label.
assemblerInputs = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat',
]

# Stage that packs the predictor columns into one vector column, "features".
vecAssembler = VectorAssembler(
    inputCols=assemblerInputs,
    outputCol="features",
)

In [0]:
from pyspark.ml.regression import LinearRegression

# Regularised linear regression reading the "features" vector and predicting
# "target". elasticNetParam mixes the penalties (0 = pure L2, 1 = pure L1).
lr = LinearRegression(
    featuresCol="features",
    labelCol="target",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

In [0]:
from pyspark.ml import Pipeline

# Chain the two stages: assemble the feature vector, then fit the regression.
pipeline = Pipeline(stages=[vecAssembler, lr])
# Fit on the training split. trainDF must not already contain a "features"
# column here, because the assembler stage creates it.
pipelineModel = pipeline.fit(trainDF)

# Run the fitted stages on the held-out split; this adds the "features" and
# "prediction" columns selected in the next cell.
predDF = pipelineModel.transform(testDF)

In [0]:
# Print the first five predictions next to their inputs and true labels.
# DataFrame.show() prints the rows itself and returns None, so it is called
# directly; wrapping it in display() would only pass None to display().
predDF.select("features", "target", "prediction").show(5)

# No Pipeline

In [0]:
from pyspark.ml.feature import VectorAssembler

# Predictor columns to pack into the feature vector (everything but 'target').
assemblerInputs = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat',
]

vectorAssembler = VectorAssembler(
    inputCols=assemblerInputs,
    outputCol='features',
)

# Without a Pipeline the assembler is applied by hand, to the whole dataset,
# before splitting; the result gains a 'features' column.
spark_df_v = vectorAssembler.transform(spark_df)
display(spark_df_v.head(5))

crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,target,features
0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0,"Map(vectorType -> dense, length -> 13, values -> List(0.00632, 18.0, 2.31, 0.0, 0.538, 6.575, 65.2, 4.09, 1.0, 296.0, 15.3, 396.9, 4.98))"
0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6,"Map(vectorType -> dense, length -> 13, values -> List(0.02731, 0.0, 7.07, 0.0, 0.469, 6.421, 78.9, 4.9671, 2.0, 242.0, 17.8, 396.9, 9.14))"
0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7,"Map(vectorType -> dense, length -> 13, values -> List(0.02729, 0.0, 7.07, 0.0, 0.469, 7.185, 61.1, 4.9671, 2.0, 242.0, 17.8, 392.83, 4.03))"
0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4,"Map(vectorType -> dense, length -> 13, values -> List(0.03237, 0.0, 2.18, 0.0, 0.458, 6.998, 45.8, 6.0622, 3.0, 222.0, 18.7, 394.63, 2.94))"
0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2,"Map(vectorType -> dense, length -> 13, values -> List(0.06905, 0.0, 2.18, 0.0, 0.458, 7.147, 54.2, 6.0622, 3.0, 222.0, 18.7, 396.9, 5.33))"


In [0]:
# Split the already-assembled frame with 0.8 / 0.2 weights and a fixed seed.
# NOTE(review): this rebinds trainDF/testDF from the Pipeline section. The new
# frames already carry a 'features' column, so re-running the earlier
# pipeline.fit(trainDF) cell after this one will likely fail on the existing
# output column — confirm, and consider distinct names.
trainDF, testDF = spark_df_v.randomSplit([0.8, 0.2], seed=42)
# cache() marks the training split for reuse; count() prints its row count.
print(trainDF.cache().count())
print(testDF.count())

In [0]:
from pyspark.ml.regression import LinearRegression

# Same estimator settings as the Pipeline section, fitted directly on the
# pre-assembled training split.
lr = LinearRegression(
    featuresCol='features',
    labelCol='target',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr_model = lr.fit(trainDF)

# print() joins its arguments with one space and applies str() to each, which
# gives the same text as concatenating "label: " + str(value).
print("Coefficients:", lr_model.coefficients)
print("Intercept:", lr_model.intercept)

In [0]:
# Fit metrics computed on the training data.
trainingSummary = lr_model.summary
print("RMSE: {:f}".format(trainingSummary.rootMeanSquaredError))
print("r2: {:f}".format(trainingSummary.r2))

In [0]:
# Score the held-out split; transform() adds a "prediction" column.
lr_predictions = lr_model.transform(testDF)
# DataFrame.show() prints the rows itself and returns None, so it is called
# directly; wrapping it in display() would only pass None to display().
lr_predictions.select("target","features","prediction").show(5)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator that compares "prediction" against "target" using R squared.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="target",
    metricName="r2",
)
print("R Squared (R2) on test data = {:g}".format(lr_evaluator.evaluate(lr_predictions)))

In [0]:
# Evaluate the fitted model on the held-out split and report its RMSE.
test_result = lr_model.evaluate(testDF)
print("Root Mean Squared Error (RMSE) on test data = {:g}".format(test_result.rootMeanSquaredError))