In [1]:
from pyspark.sql import SparkSession

In [2]:
from pyspark.ml.regression import LinearRegression

In [3]:
from pyspark.ml.feature import VectorAssembler

In [4]:
from pyspark.ml.linalg import Vectors

In [5]:
from pyspark.sql.types import IntegerType

In [6]:
from pyspark.ml.evaluation import RegressionEvaluator

In [7]:
# Start (or reuse an already running) Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Linear Regression")
    .getOrCreate()
)

In [8]:
# Load the housing data: the first row is treated as the header and
# column types are inferred from the file contents.
df = spark.read.csv(
    'Home Price Prediction.csv',
    header=True,
    inferSchema=True,
)

In [9]:
# Echo the DataFrame so the column names and inferred types are visible.
df

DataFrame[area: int, bedroom: int, age: int, price: int]

In [10]:
# Pin the label column to an integer type. The schema echoed after loading
# already reports price as int, so this acts as a safeguard rather than a
# real conversion.
df = df.withColumn(
    "price",
    df['price'].cast(IntegerType()),
)

In [11]:
# Combine the three predictor columns into one vector column, 'features'.
assembler = VectorAssembler(
    inputCols=['area', 'bedroom', 'age'],
    outputCol='features',
)

In [12]:
# Apply the assembler and keep only the feature vector and the label.
assembled = (
    assembler
    .transform(df)
    .select('features', 'price')
)

In [13]:
# Split into train/test using 0.8 / 0.2 weights with the seed fixed at 42.
train, test = assembled.randomSplit(
    [0.8, 0.2],
    seed=42,
)

In [14]:
# Linear regression estimator: 'features' is the input vector, 'price' the label.
lr = LinearRegression(
    featuresCol='features',
    labelCol='price',
)

In [15]:
# Fit the estimator on the training split only.
model = lr.fit(dataset=train)

In [16]:
# Score the held-out split with the fitted model.
pred = model.transform(dataset=test)

In [17]:
# Show inputs, actual price and predicted price side by side.
(
    pred
    .select('features', 'price', 'prediction')
    .show()
)

+-----------------+------+-----------------+
|         features| price|       prediction|
+-----------------+------+-----------------+
|[3000.0,4.0,15.0]|565000|603578.0895181594|
|[4500.0,6.0,15.0]|800000|744943.2290567964|
|[8000.0,6.0,45.0]|895000|925147.0770719941|
+-----------------+------+-----------------+



In [18]:
# Evaluator comparing 'prediction' against 'price' using the RMSE metric.
evaluator = RegressionEvaluator(
    labelCol='price',
    predictionCol='prediction',
    metricName='rmse',
)

In [19]:
# RMSE of the predictions made on the test split.
rmse = evaluator.evaluate(dataset=pred)

In [20]:
# Display the RMSE computed from the test-split predictions.
rmse

42537.68240649797

In [21]:
# model.summary describes the fit on the TRAINING split, so this R² is an
# in-sample figure. It is not directly comparable to the RMSE, which is
# evaluated on the test-split predictions in `pred`.
r2 = model.summary.r2

# Held-out R² on the test split, to pair with the test-split RMSE.
test_r2 = model.evaluate(test).r2

In [22]:
# Display R² taken from model.summary (the summary of the fit on `train`).
r2

0.9316675725045608

In [23]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()