<a href="https://colab.research.google.com/github/KD-2528/Hand_Written_Digit_Prediction_Classification/blob/main/Untitled8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **House Price Prediction Regression Problem**

In [3]:
# install
!pip install Pyspark



In [4]:
# start spark session
from pyspark.sql import SparkSession

The next cell creates a SparkSession using the builder pattern. `SparkSession.builder` returns a `SparkSession.Builder` object, and `getOrCreate()` returns the existing SparkSession or creates a new one if none exists. This is the standard way to start a Spark application.

In [5]:
# Obtain the session handle: reuse a running SparkSession or start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [6]:
# Bare expression: displays the session object as this cell's output.
spark

In [7]:
# read external data with url
import pandas as pd

# The CSV is fetched with pandas and then converted to a Spark DataFrame.
# The two frames get separate names so that `house` always refers to the
# Spark DataFrame and is never temporarily a pandas object.
BOSTON_CSV_URL = 'https://github.com/YBIFoundation/Dataset/raw/main/Boston.csv'
house_pdf = pd.read_csv(BOSTON_CSV_URL)
house = spark.createDataFrame(house_pdf)

In [8]:
# Preview the first rows of the Spark DataFrame as a text table.
house.show()

+------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|              CRIM|  ZN|INDUS|CHAS|   NX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|
+------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|           0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296.0|   15.3| 396.9| 4.98|24.0|
|           0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242.0|   17.8| 396.9| 9.14|21.6|
|           0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242.0|   17.8|392.83| 4.03|34.7|
|0.0323699999999999| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222.0|   18.7|394.63| 2.94|33.4|
|           0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222.0|   18.7| 396.9| 5.33|36.2|
|           0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222.0|   18.7|394.12| 5.21|28.7|
|           0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311.0|   15.2| 395.6|12.43|22.9|
|           0.14455|

In [9]:
# Print each column's name, type and nullability (types come from the pandas -> Spark conversion).
house.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: long (nullable = true)
 |-- NX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: long (nullable = true)
 |-- TAX: double (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)



In [10]:
# List the column names; 'MEDV' is used as the regression label further down, the rest as features.
house.columns

['CRIM',
 'ZN',
 'INDUS',
 'CHAS',
 'NX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT',
 'MEDV']

The next cell imports the `VectorAssembler` class from the `pyspark.ml.feature` module. `VectorAssembler` is a transformer that combines a given list of columns into a single vector column. It is commonly used to prepare feature columns for machine learning models in PySpark.


In [11]:
# create vector assembler
from pyspark.ml.feature import VectorAssembler

In [12]:
# Every column except the label ('MEDV') is a predictor. Deriving the list from the
# DataFrame keeps it in sync with the data instead of repeating all 13 names by hand;
# the resulting list (and its order) is the same as house.columns without 'MEDV'.
feature_cols = [name for name in house.columns if name != 'MEDV']

# Combine the predictor columns into a single vector column named 'Features'.
featureassembler = VectorAssembler(inputCols=feature_cols, outputCol='Features')

In [13]:
# Apply the assembler: the result keeps the original columns and adds the 'Features' vector column.
output = featureassembler.transform(house)

In [14]:
# Preview the assembled frame; long 'Features' vectors are truncated in the table.
output.show()

+------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+--------------------+
|              CRIM|  ZN|INDUS|CHAS|   NX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|            Features|
+------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+--------------------+
|           0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296.0|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|           0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242.0|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|           0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242.0|   17.8|392.83| 4.03|34.7|[0.02729,0.0,7.07...|
|0.0323699999999999| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222.0|   18.7|394.63| 2.94|33.4|[0.03236999999999...|
|           0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222.0|   18.7| 396.9| 5.33|36.2|[0.06905,0.0,2.18...|
|           0.02985| 0.0| 2.18|   0|0.458| 6.43|

In [15]:
# create model data
# Keep only the feature vector and the label. The source column is named 'MEDV';
# it is selected here as 'Medv', and the resulting column is displayed as 'Medv',
# which is the name the regression cell's labelCol refers to.
# NOTE(review): this relies on Spark resolving column names case-insensitively (the default) - confirm.
modeldata=output.select('Features','Medv')

In [16]:
# Preview the two-column modelling frame (feature vector + label).
modeldata.show()

+--------------------+----+
|            Features|Medv|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.02731,0.0,7.07...|21.6|
|[0.02729,0.0,7.07...|34.7|
|[0.03236999999999...|33.4|
|[0.06905,0.0,2.18...|36.2|
|[0.02985,0.0,2.18...|28.7|
|[0.08829,12.5,7.8...|22.9|
|[0.14455,12.5,7.8...|27.1|
|[0.21124,12.5,7.8...|16.5|
|[0.17004,12.5,7.8...|18.9|
|[0.22489,12.5,7.8...|15.0|
|[0.11747,12.5,7.8...|18.9|
|[0.09378,12.5,7.8...|21.7|
|[0.62976,0.0,8.14...|20.4|
|[0.63796000000000...|18.2|
|[0.62739,0.0,8.14...|19.9|
|[1.05393,0.0,8.14...|23.1|
|[0.7842,0.0,8.14,...|17.5|
|[0.80271,0.0,8.14...|20.2|
|[0.7258,0.0,8.14,...|18.2|
+--------------------+----+
only showing top 20 rows



In [17]:
# split data
# Roughly 80% of rows go to training and 20% to testing (randomSplit weights are
# proportions, not exact counts). A fixed seed makes the split - and therefore the
# fitted coefficients and the metrics below - repeatable across runs.
train_data, test_data = modeldata.randomSplit([0.8, 0.2], seed=42)

In [18]:
# Preview the training split.
train_data.show()

+--------------------+----+
|            Features|Medv|
+--------------------+----+
|[0.01311,90.0,1.2...|35.4|
|[0.0136,75.0,4.0,...|18.9|
|[0.01381,80.0,0.4...|50.0|
|[0.01439,60.0,2.9...|29.1|
|[0.01778,95.0,1.4...|32.9|
|[0.01951,17.5,1.3...|33.0|
|[0.02009,95.0,2.6...|50.0|
|[0.02055,85.0,0.7...|24.7|
|[0.02176999999999...|42.3|
|[0.02187,60.0,2.9...|31.1|
|[0.02729,0.0,7.07...|34.7|
|[0.02731,0.0,7.07...|21.6|
|[0.02875,28.0,15....|25.0|
|[0.02985,0.0,2.18...|28.7|
|[0.03236999999999...|33.4|
|[0.03358999999999...|34.9|
|[0.03445,82.5,2.0...|24.1|
|[0.0351,95.0,2.68...|48.5|
|[0.03584,80.0,3.3...|23.5|
|[0.03659,25.0,4.8...|24.8|
+--------------------+----+
only showing top 20 rows



The next cells import the `LinearRegression` class from the `pyspark.ml.regression` module and use it to train a linear regression model in PySpark.



> **Note:** The model is fit on `train_data` only; `test_data` is held out and used only for evaluation.



In [19]:
# regression model
from pyspark.ml.regression import LinearRegression

In [20]:
# Configure the (not yet fitted) linear-regression estimator:
# inputs come from the 'Features' vector column, the target from the 'Medv' column.
reg = LinearRegression(
    featuresCol='Features',
    labelCol='Medv',
)

In [21]:
# Fit the model on the training split and bind the fitted model to `reg`
# (the cells below read reg.coefficients, reg.intercept and call reg.evaluate).
# The estimator is constructed here, with the same settings as the estimator cell
# above, instead of calling reg.fit(...) on the existing `reg`: because this cell
# rebinds `reg` to the fitted model, a second run of `reg = reg.fit(...)` would
# fail - the fitted model has no fit() method. Keep the settings in sync with
# the estimator cell if they are changed.
reg = LinearRegression(featuresCol='Features', labelCol='Medv').fit(train_data)

In [22]:
# Fitted weights: one value per assembled feature (13 here).
# NOTE(review): expected to follow the order of the VectorAssembler's inputCols - confirm.
reg.coefficients

DenseVector([-0.1267, 0.0446, 0.007, 2.7337, -18.0392, 3.7216, -0.0001, -1.5931, 0.3782, -0.0143, -0.941, 0.0105, -0.5473])

In [23]:
# Fitted intercept (bias) term of the linear model.
reg.intercept

37.687633765344415

In [24]:
# prediction
# Evaluate the fitted model on the held-out test split. Despite its name, `y_pred`
# is not a column of predictions: it is the evaluation result object whose
# `.predictions`, `.meanAbsoluteError` and `.r2` attributes are read in the cells below.
y_pred=reg.evaluate(test_data)

In [25]:
# Show the test rows with the model's 'prediction' column next to the true 'Medv' label.
y_pred.predictions.show()

+--------------------+----+------------------+
|            Features|Medv|        prediction|
+--------------------+----+------------------+
|[0.00632,18.0,2.3...|24.0|29.941740141476313|
|[0.01432,100.0,1....|31.6| 32.83334932570446|
|[0.02763,75.0,2.9...|30.8|31.372918458319308|
|[0.0315,95.0,1.47...|34.9|29.493962360509002|
|[0.03551,25.0,4.8...|22.9|25.101073532878395|
|[0.03768,80.0,1.5...|34.6| 34.09373980550084|
|[0.04294,28.0,15....|20.6|27.057360896821635|
|[0.04462,25.0,4.8...|23.9|26.991003956592586|
|[0.06642,0.0,4.05...|29.9|31.747744577381788|
|[0.06899,0.0,25.6...|22.0| 21.93886934147379|
|[0.08014,0.0,5.96...|21.0|23.421822649789704|
|[0.09068,45.0,3.4...|37.0| 30.44438204106371|
|[0.09252,30.0,4.9...|23.3|28.406363485983064|
|[0.10153,0.0,12.8...|20.0| 22.76553269350002|
|[0.10328,25.0,5.1...|19.6|21.215752835911882|
|[0.10612,30.0,4.9...|20.1|23.628586579698805|
|[0.11432,0.0,8.56...|26.5|25.724283664874537|
|[0.12329,0.0,10.0...|18.8|20.743222474173987|
|[0.1265,25.0

In [26]:
# Mean absolute error on the test split, in the same units as the 'Medv' label.
y_pred.meanAbsoluteError

3.288595311367124

In [27]:
# R-squared (coefficient of determination) on the test split.
y_pred.r2

0.7748418098740775

`spark.stop()` terminates the SparkSession. It is good practice to stop the session when you are finished with your work, to release its resources.

In [28]:
# close connection to spark only at the end
# NOTE(review): the DataFrames created above are presumably unusable after this call -
# run it last, and recreate the session before re-running earlier cells.
spark.stop()