# Installing pyspark

In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Ignoring warnings

In [2]:
# Install a blanket "ignore" warning filter so later cells print no warnings.
from warnings import filterwarnings

filterwarnings(action='ignore')

# Using spark

In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# getOrCreate() returns the already-running SparkContext when there is one, so this cell
# can be re-executed in a live kernel; a second bare SparkContext() call raises instead.
sc = SparkContext.getOrCreate()
# SQLContext wraps the context; the data-loading cell reads the CSV through `sqlContext`.
sqlContext = SQLContext(sc)

# Import the dataset

In [4]:
# Read boston.csv with Spark's built-in CSV reader: the first line supplies the column
# names and the column types are inferred from the data.
df = sqlContext.read.csv('boston.csv', header=True, inferSchema=True)
# Show the first row as a quick check that the columns were parsed.
df.take(1)

[Row(CRIM=0.00632, ZN=18.0, INDUS=2.309999943, CHAS=0, NOX=0.537999988, RM=6.574999809, AGE=65.19999695, DIS=4.090000153, RAD=1, TAX=296, PT=15.30000019, B=396.8999939, LSTAT=4.980000019, MV=24.0)]

import pandas as pd

# Keep only the columns Spark typed as int or double for the scatter matrix.
numeric_features = [name for name, dtype in df.dtypes if dtype in ('int', 'double')]
# Plot a sample (fraction 0.8, without replacement) converted to pandas. The fixed seed
# makes the sampled rows, and therefore the figure, repeatable from run to run.
sampled_data = df.select(numeric_features).sample(False, 0.8, seed=42).toPandas()
axs = pd.plotting.scatter_matrix(sampled_data, figsize=(10, 10))
n = len(sampled_data.columns)
for i in range(n):
    # First column of the grid: horizontal, right-aligned y labels and no y ticks.
    v = axs[i, 0]
    v.yaxis.label.set_rotation(0)
    v.yaxis.label.set_ha('right')
    v.set_yticks(())
    # Last row of the grid: vertical x labels and no x ticks.
    h = axs[n-1, i]
    h.xaxis.label.set_rotation(90)
    h.set_xticks(())

# Correlations between Variables

In [5]:
import six
# Print the correlation of every non-string column with the target column MV.
# The column type comes from the schema (df.dtypes) instead of fetching the first row of
# each column, so no extra Spark job is run per column and the check does not depend on
# what value the first row happens to hold.
for column_name, column_type in df.dtypes:
    if column_type != 'string':
        print( "Correlation to MV for ", column_name, df.stat.corr('MV', column_name))

Correlation to MV for  CRIM -0.3883046116575088
Correlation to MV for  ZN 0.36044534463752903
Correlation to MV for  INDUS -0.48372517128143383
Correlation to MV for  CHAS 0.17526017775291847
Correlation to MV for  NOX -0.4273207763683772
Correlation to MV for  RM 0.695359937127267
Correlation to MV for  AGE -0.37695456714288667
Correlation to MV for  DIS 0.24992873873512172
Correlation to MV for  RAD -0.3816262315669168
Correlation to MV for  TAX -0.46853593528654536
Correlation to MV for  PT -0.5077867038116085
Correlation to MV for  B 0.3334608226834164
Correlation to MV for  LSTAT -0.7376627294671615
Correlation to MV for  MV 1.0


# Using all the columns

In [6]:
from pyspark.ml.feature import VectorAssembler

# Every column except the target MV is packed into a single vector column, 'features'.
feature_columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PT', 'B', 'LSTAT',
]
vectorAssembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Keep only the assembled vector and the label, then preview three rows.
vdf = vectorAssembler.transform(df).select(['features', 'MV'])
vdf.show(3)

+--------------------+-----------+
|            features|         MV|
+--------------------+-----------+
|[0.00632,18.0,2.3...|       24.0|
|[0.027310001,0.0,...|21.60000038|
|[0.02729,0.0,7.07...|34.70000076|
+--------------------+-----------+
only showing top 3 rows



# Train Test Split

In [7]:
# Split with weights 0.7 / 0.3 into train and test sets. randomSplit is random, so a fixed
# seed is passed to make the split repeatable instead of changing on every run.
splits = vdf.randomSplit([0.7, 0.3], seed=42)
train_df, test_df = splits

# Building a model

In [8]:
from pyspark.ml.regression import LinearRegression

# Linear regression on the assembled 'features' vector with MV as the label.
lr = LinearRegression(
    featuresCol='features',
    labelCol='MV',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
model = lr.fit(train_df)

# Report the fitted parameters.
for label, value in (("Coefficients: ", model.coefficients), ("Intercept: ", model.intercept)):
    print(label + str(value))

Coefficients: [-0.027231694707313438,0.022552069196056127,-0.018362432537588565,2.5509946817069387,-9.159424450652498,3.978030606816245,0.0,-0.7909335126264829,0.0,0.0,-0.7511871998419054,0.0072280184021368135,-0.5011152757414752]
Intercept: 23.23801602270609


# Evaluation using RMSE and R2

In [9]:
# Fit-quality metrics read from the summary attached to the fitted model.
Summary = model.summary
for template, metric in (("RMSE: %f", Summary.rootMeanSquaredError), ("r2: %f", Summary.r2)):
    print(template % metric)

RMSE: 5.207487
r2: 0.695364


### Predicting the Test Data

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator

# Run the fitted model over the test split and preview predictions beside the MV label.
lr_predictions = model.transform(test_df)
preview_columns = ["prediction", "MV", "features"]
lr_predictions.select(*preview_columns).show(5)

# R2 of the 'prediction' column against the 'MV' label on the test split.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="MV",
    metricName="r2",
)
test_r2 = lr_evaluator.evaluate(lr_predictions)
print("R Squared (R2) on test data = %g" % test_r2)

+------------------+-----------+--------------------+
|        prediction|         MV|            features|
+------------------+-----------+--------------------+
|31.647680578749416|32.70000076|[0.01301,35.0,1.5...|
|30.517181083151108|35.40000153|[0.01311,90.0,1.2...|
|18.335729635478845|18.89999962|[0.0136,75.0,4.0,...|
|38.571658042674926|       50.0|[0.01381,80.0,0.4...|
|27.869058280510902|       24.5|[0.01501,80.0,2.0...|
+------------------+-----------+--------------------+
only showing top 5 rows

R Squared (R2) on test data = 0.779086


### RMSE of test data

In [11]:
# Evaluate the fitted model on the test split and report its root mean squared error.
test_result = model.evaluate(test_df)
test_rmse = test_result.rootMeanSquaredError
print("Root Mean Squared Error (RMSE) on test data = %g" % test_rmse)

Root Mean Squared Error (RMSE) on test data = 4.03269


In [12]:
# Training diagnostics from the model summary: the iteration count, the objective value
# recorded per iteration, and the residuals table.
iteration_count = Summary.totalIterations
objective_trace = str(Summary.objectiveHistory)
print("numIterations: %d" % iteration_count)
print("objectiveHistory: %s" % objective_trace)
Summary.residuals.show()

numIterations: 10
objectiveHistory: [0.5, 0.43314656525713924, 0.24506450515830602, 0.22378534339796402, 0.19549117807054775, 0.1926053572487412, 0.1918812012271726, 0.19101245362855948, 0.19023955186321978, 0.18928626253377923, 0.18886187718088576]
+-------------------+
|          residuals|
+-------------------+
| -6.474308070325986|
| 0.8080355238402674|
| -6.000093687456907|
|-0.1515202599481178|
|-1.9250301729126065|
|  7.954412225206312|
|  8.479454699906903|
|  3.698601634149629|
| 1.1307630212278355|
| -3.421974195902031|
|  7.402916270006017|
|-1.2951905814446896|
| 10.231791792134622|
|  5.469913010725001|
| -9.557782443897082|
|-4.2281975102704585|
| 3.7860606298542336|
|-3.7434430265108567|
| 0.9344557204955883|
|-2.2317589913987668|
+-------------------+
only showing top 20 rows



In [13]:
# Apply the fitted model to the test split and display each prediction next to the MV
# label and the feature vector it was computed from.
predictions = model.transform(test_df)
prediction_view = predictions.select("prediction", "MV", "features")
prediction_view.show()

+------------------+-----------+--------------------+
|        prediction|         MV|            features|
+------------------+-----------+--------------------+
|31.647680578749416|32.70000076|[0.01301,35.0,1.5...|
|30.517181083151108|35.40000153|[0.01311,90.0,1.2...|
|18.335729635478845|18.89999962|[0.0136,75.0,4.0,...|
|38.571658042674926|       50.0|[0.01381,80.0,0.4...|
|27.869058280510902|       24.5|[0.01501,80.0,2.0...|
| 26.49224555189765|24.70000076|[0.02055,85.0,0.7...|
|31.599287218422695|31.10000038|[0.02187,60.0,2.9...|
|25.904437407371883|26.60000038|[0.02899,40.0,1.2...|
|29.479476922108134|31.20000076|[0.03049,55.0,3.7...|
|29.377462352281388|33.40000153|[0.032370001,0.0,...|
|28.893097609600638|24.10000038|[0.034449998,82.5...|
|31.209400716215786|       28.5|[0.035020001,80.0...|
| 21.38681494580029|21.10000038|[0.039609998,0.0,...|
| 35.87903864934338|33.29999924|[0.040109999,80.0...|
| 25.23203171939989|20.60000038|[0.042939998,28.0...|
| 24.76852976993327|19.39999