# **Linear Regression Model Implementation using PySpark**

In [None]:
#Installation
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [None]:
# Initialise findspark before any `pyspark` import in the cells below.
# NOTE(review): called with no arguments, so it presumably relies on the SPARK_HOME
# environment variable set in the installation cell — confirm against findspark's docs.
import findspark
findspark.init()

In [None]:
# Start (or reuse) the Spark session; later cells access it through `spark`.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("LinearRegressionwithSpark")
    .getOrCreate()
)

# **Import Packages**

In [None]:
from pyspark import SparkFiles
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

# **Load Dataset**

In [None]:
# Remote location of the Boston Housing CSV.
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"

# Register the remote file with the SparkContext, then look up the path it was stored under.
spark.sparkContext.addFile(url)
csv_path = SparkFiles.get("BostonHousing.csv")

# First row is the header; column types are inferred from the values.
data = spark.read.options(header=True, inferSchema=True).csv(csv_path)
data.show(5)

+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|   crim|  zn|indus|chas|  nox|   rm| age|   dis|rad|tax|ptratio|     b|lstat|medv|
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575|65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07|   0|0.469|6.421|78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07|   0|0.469|7.185|61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|0.03237| 0.0| 2.18|   0|0.458|6.998|45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18|   0|0.458|7.147|54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
only showing top 5 rows



# **Prepare Data**

In [None]:
# Predictor columns, in the order they are packed into the feature vector,
# and the label column the model is trained to predict.
FEATURE_COLS = ["crim", "zn", "indus", "chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "b", "lstat"]
LABEL_COL = "medv"

assembler = VectorAssembler(inputCols=FEATURE_COLS, outputCol="features")

# Write the assembled frame to a new name instead of overwriting `data`:
# re-running this cell would otherwise fail because the "features" output
# column already exists, and the raw frame would no longer be available.
assembled_data = assembler.transform(data)
final_data = assembled_data.select("features", LABEL_COL)

# 80/20 train/test split with a fixed seed.
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

# **Train the Model**

In [None]:
# Configure the estimator: read the assembled vector from "features", learn "medv",
# and write predictions to "predicted_medv".
lr = LinearRegression(
    featuresCol="features",
    labelCol="medv",
    predictionCol="predicted_medv",
)

# Fit on the training split only; the test split is held back for evaluation.
lr_model = lr.fit(train_data)

# **Evaluate Performance**

In [None]:
# Score the held-out split with the fitted model.
predictions = lr_model.transform(test_data)

# Both evaluators compare the same label/prediction columns; only the metric differs.
column_args = {"labelCol": "medv", "predictionCol": "predicted_medv"}
evaluator = RegressionEvaluator(metricName="rmse", **column_args)
evaluator_mse = RegressionEvaluator(metricName="mse", **column_args)

rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data: {:.3f}".format(rmse))

mse = evaluator_mse.evaluate(predictions)
print("MSE on test data: {:.3f}".format(mse))

Root Mean Squared Error (RMSE) on test data: 4.672
MSE on test data: 21.826


# **Inspect Model Coefficients and Intercept**

In [None]:
# Pull the fitted parameters off the trained model in one step.
coefficients, intercept = lr_model.coefficients, lr_model.intercept

print("Coefficients: ", coefficients)
print("Intercept: {:.3f}".format(intercept))

Coefficients:  [-0.1136220372940893,0.04890918693405423,0.02379542898673218,2.8017719987351315,-18.41542454118947,3.5158797633119243,0.005211682161471423,-1.416383072353979,0.33176693159370374,-0.013607893704163855,-0.9534143338408103,0.008602677392853194,-0.5195035312476691]
Intercept: 38.617


# **Analyze Feature Importance**

In [None]:
# Take the feature names from the assembler itself: its input columns are the
# columns packed into the feature vector, in order, so they line up one-to-one
# with the model coefficients. Slicing `data.columns` only matched by relying on
# column order and on zip() silently truncating the longer sequence.
feature_names = assembler.getInputCols()
assert len(feature_names) == len(coefficients), "feature names and coefficients are misaligned"

# Rank by absolute coefficient size.
# Caveat: the features are not standardised, so these magnitudes depend on each
# column's scale and are only a rough indication of importance.
feature_importance = sorted(
    zip(feature_names, (abs(weight) for weight in coefficients)),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Feature Importance:")
for feature, importance in feature_importance:
    print("  {}: {:.3f}".format(feature, importance))

Feature Importance:
  nox: 18.415
  rm: 3.516
  chas: 2.802
  dis: 1.416
  ptratio: 0.953
  lstat: 0.520
  rad: 0.332
  crim: 0.114
  zn: 0.049
  indus: 0.024
  tax: 0.014
  b: 0.009
  age: 0.005


# **Save and Load the Model**

In [None]:
from pyspark.ml.regression import LinearRegressionModel

# Directory the fitted model is persisted to (relative to the working directory).
MODEL_PATH = "lr_model"

# A plain save() raises if the target path already exists, which breaks a second
# run of this notebook; overwrite() replaces any previously saved copy instead.
lr_model.write().overwrite().save(MODEL_PATH)

# Load the model
loaded_model = LinearRegressionModel.load(MODEL_PATH)