In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
spark_conf = SparkConf()
spark_conf.setMaster('local[*]')
spark_conf.setAppName('Spark Price predictions')

<pyspark.conf.SparkConf at 0x1b0be326740>

In [3]:
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

In [15]:
data = spark.read.options(**{"inferSchema": True, "header": True}).csv("../train.csv")

selecting high correlation features based on previous statistic results

In [5]:
feature_target_list = ['YearRemodAdd',
                      'YearBuilt',
                      'TotRmsAbvGrd',
                      'FullBath',
                      '1stFlrSF',
                      'TotalBsmtSF',
                      'GarageArea',
                      'GarageCars',
                      'GrLivArea',
                      'OverallQual',
                      'SalePrice']

ML lib

In [53]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

Transforming df for ml 

In [24]:
vectorAssembler = VectorAssembler(inputCols=feature_target_list[:-2], outputCol='features')
vdf_train_high_corr = vectorAssembler.transform(data.select(feature_target_list))
vdf_train_high_corr.select(['features', 'SalePrice']).show(truncate=False)

+-------------------------------------------------------+---------+
|features                                               |SalePrice|
+-------------------------------------------------------+---------+
|[2003.0,2003.0,8.0,2.0,856.0,856.0,548.0,2.0,1710.0]   |208500   |
|[1976.0,1976.0,6.0,2.0,1262.0,1262.0,460.0,2.0,1262.0] |181500   |
|[2002.0,2001.0,6.0,2.0,920.0,920.0,608.0,2.0,1786.0]   |223500   |
|[1970.0,1915.0,7.0,1.0,961.0,756.0,642.0,3.0,1717.0]   |140000   |
|[2000.0,2000.0,9.0,2.0,1145.0,1145.0,836.0,3.0,2198.0] |250000   |
|[1995.0,1993.0,5.0,1.0,796.0,796.0,480.0,2.0,1362.0]   |143000   |
|[2005.0,2004.0,7.0,2.0,1694.0,1686.0,636.0,2.0,1694.0] |307000   |
|[1973.0,1973.0,7.0,2.0,1107.0,1107.0,484.0,2.0,2090.0] |200000   |
|[1950.0,1931.0,8.0,2.0,1022.0,952.0,468.0,2.0,1774.0]  |129900   |
|[1950.0,1939.0,5.0,1.0,1077.0,991.0,205.0,1.0,1077.0]  |118000   |
|[1965.0,1965.0,5.0,1.0,1040.0,1040.0,384.0,1.0,1040.0] |129500   |
|[2006.0,2005.0,11.0,3.0,1182.0,1175.0,736.0,3.0

Spliting between train and test

In [26]:
splits = vdf_train_high_corr.randomSplit(weights=[0.7, 0.3])
df_train = splits[0]
df_test = splits[1]

Basic Linear Regression

In [49]:
lr = LinearRegression(featuresCol= 'features', labelCol='SalePrice', maxIter=10, regParam=0, elasticNetParam=0)
lr_model = lr.fit(df_train)
display("Coefficients: " + str(lr_model.coefficients))
display("Intercept: " + str(lr_model.intercept))

'Coefficients: [522.4910441586145,480.14980939712626,-1148.2784287271422,-4790.477340576148,1.8933471049330377,35.28537342299903,4.814896790302039,20519.79694469149,73.29905802760118]'

'Intercept: -1977004.7392636617'

In [50]:
trainingSummary = lr_model.summary
display("RMSE: %f" % trainingSummary.rootMeanSquaredError)
display("r2: %f" % trainingSummary.r2)

RMSE: 42151.858129
r2: 0.722250


In [51]:
lr_predictions = lr_model.transform(df_test.select("features", "SalePrice"))
lr_predictions.select("prediction","SalePrice","features").show(truncate=False)

+------------------+---------+------------------------------------------------------+
|prediction        |SalePrice|features                                              |
+------------------+---------+------------------------------------------------------+
|90959.85818656604 |79000    |[1950.0,1900.0,6.0,1.0,889.0,540.0,352.0,1.0,1440.0]  |
|81960.53696068563 |102776   |[1950.0,1900.0,7.0,1.0,859.0,859.0,384.0,1.0,1178.0]  |
|138742.81239069998|122000   |[1950.0,1910.0,9.0,1.0,964.0,925.0,308.0,1.0,1889.0]  |
|125657.04468115792|130000   |[1950.0,1911.0,7.0,1.0,1024.0,940.0,0.0,0.0,1964.0]   |
|137705.50438091857|104000   |[1950.0,1912.0,8.0,1.0,929.0,755.0,0.0,0.0,2229.0]    |
|67263.23952225689 |58500    |[1950.0,1914.0,5.0,1.0,864.0,864.0,200.0,1.0,864.0]   |
|149897.3973721112 |153900   |[1950.0,1915.0,7.0,1.0,876.0,876.0,720.0,3.0,1416.0]  |
|113368.35550148785|139000   |[1950.0,1915.0,7.0,2.0,966.0,686.0,416.0,1.0,1652.0]  |
|81270.01397332363 |108500   |[1950.0,1920.0,5.0,1.0,9

1st test evaluation method with Regression Evaluator

In [40]:
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="SalePrice",metricName="r2")
display("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

R Squared (R2) on test data = 38914.8


2nd test evaluation with the evaluate method of the model

In [39]:
test_result = lr_model.evaluate(df_test)
display("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)
display("r2 on test data = %g" % test_result.r2)

Root Mean Squared Error (RMSE) on test data = 38914.8
r2 on test data = 0.751719


Lasso (L1) & Ridge (L2) Regressions <=> elasticNetParam isin {0, 1}

# Model tuning

In [52]:
lasso_regressor = LinearRegression(featuresCol= 'features', labelCol='SalePrice', maxIter=10, regParam=0.3, elasticNetParam=0)
ridge_regressor = LinearRegression(featuresCol= 'features', labelCol='SalePrice', maxIter=10, regParam=0.3, elasticNetParam=1)

tuning the L1 & L2 penalty parameter

In [84]:
# Define the hyperparameter grid
param_lasso_grid = ParamGridBuilder() \
    .addGrid(lasso_regressor.regParam, [i/100 for i in range(1, 101)]) \
    .build()
param_ridge_grid = ParamGridBuilder() \
    .addGrid(ridge_regressor.regParam, [i/100 for i in range(1, 101)]) \
    .build()
# Create the cross-validator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol= "SalePrice", metricName="rmse")
cross_lasso_validator = CrossValidator(estimator=lasso_regressor,
                                 estimatorParamMaps=param_lasso_grid,
                                 evaluator=evaluator,
                                 numFolds=4)
cross_ridge_validator = CrossValidator(estimator=ridge_regressor,
                                 estimatorParamMaps=param_ridge_grid,
                                 evaluator=evaluator,
                                 numFolds=4)

# Train the model with the best hyperparameters
cv_lasso_model = cross_lasso_validator.fit(vdf_train_high_corr)
cv_ridgemodel = cross_ridge_validator.fit(vdf_train_high_corr)
lasso_model = cv_lasso_model.bestModel
ridge_model = cv_ridgemodel.bestModel

In [85]:
test_result = lasso_model.evaluate(df_test)
display("Root Mean Squared Error (RMSE) for L1 regression on test data = %g" % test_result.rootMeanSquaredError)
display("r2 for L1 regression on test data = %g" % test_result.r2)
display("L1 Coefficients: " + str(lasso_model.coefficients))
display("L1 Intercept: " + str(lasso_model.intercept))
display(f"{'-'*200}")
test_result = ridge_model.evaluate(df_test)
display("Root Mean Squared Error (RMSE) for L2 regression on test data = %g" % test_result.rootMeanSquaredError)
display("r2 for L2 regression on test data = %g" % test_result.r2)
display("L2 Coefficients: " + str(ridge_model.coefficients))
display("L2 Intercept: " + str(ridge_model.intercept))
display(f"{'-'*200}")

'Root Mean Squared Error (RMSE) for L1 regression on test data = 38714.2'

'r2 for L1 regression on test data = 0.754272'

'L1 Coefficients: [545.6614517799065,462.1082118401447,-1058.2657506500384,-5185.525238359562,5.371284737560781,35.46216161264968,12.708039875139802,17080.990775699876,72.3308966828245]'

'L1 Intercept: -1987622.5477146052'

'--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'

'Root Mean Squared Error (RMSE) for L2 regression on test data = 38621.9'

'r2 for L2 regression on test data = 0.755442'

'L2 Coefficients: [559.9068401752854,499.74624041673945,-0.0,-7814.730476072821,9.117005457922453,32.91990550595245,22.644033135218134,13085.203754731148,70.4066191361842]'

'L2 Intercept: -2089263.9771832146'

'--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'

Tuning the [Elastic net](http://users.stat.umn.edu/~zouxx019/Papers/elasticnet.pdf) parameter along with L1 & L2 penalty parameter

In [86]:
elastic_net_regressor = LinearRegression(featuresCol= 'features', labelCol='SalePrice', maxIter=10, regParam=0.3, elasticNetParam=0.3)

In [92]:
# Define the hyperparameter grid
param_elastic_net_grid = ParamGridBuilder() \
    .addGrid(elastic_net_regressor.regParam, [i/10 for i in range(1, 11)]) \
    .addGrid(elastic_net_regressor.elasticNetParam, [i/100 for i in range(1, 101)]) \
    .build()
# Create the cross-validator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol= "SalePrice", metricName="rmse")
cross_elastic_net_validator = CrossValidator(estimator=elastic_net_regressor,
                                 estimatorParamMaps=param_elastic_net_grid,
                                 evaluator=evaluator,
                                 numFolds=4)

# Train the model with the best hyperparameters
cv_elastic_netmodel = cross_elastic_net_validator.fit(vdf_train_high_corr)
elastic_net_model = cv_elastic_netmodel.bestModel

In [93]:
test_result = elastic_net_model.evaluate(df_test)
display("Root Mean Squared Error (RMSE) for elastic net regression on test data = %g" % test_result.rootMeanSquaredError)
display("r2 for elastic net regression on test data = %g" % test_result.r2)
display("elastic net Coefficients: " + str(elastic_net_model.coefficients))
display("elastic net Intercept: " + str(elastic_net_model.intercept))
display(f"{'-'*200}")

'Root Mean Squared Error (RMSE) for elastic net regression on test data = 38622'

'r2 for elastic net regression on test data = 0.755441'

'elastic net Coefficients: [559.9034282189132,499.7332361247397,-0.0,-7813.064554494469,9.118087976336266,32.918226592239634,22.6459917277541,13085.181709636448,70.4023426973258]'

'elastic net Intercept: -2089228.0671101795'

'--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'