In [0]:
# Location of the raw housing CSV.
Filepath = "/FileStore/tables/housing_data.csv"

# Read the file using its first line as column names. No schema is supplied or
# inferred here, so the columns are expected to come back as strings.
housing_df = spark.read.csv(Filepath, header=True)

display(housing_df)

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
7237550310,20140512T000000,1230000.0,4,4.5,5420,101930,1.0,0,0,3,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930
1321400060,20140627T000000,257500.0,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
2008000270,20150115T000000,291850.0,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711
2414600126,20150415T000000,229500.0,3,1.0,1780,7470,1.0,0,0,3,7,1050,730,1960,0,98146,47.5123,-122.337,1780,8113
3793500160,20150312T000000,323000.0,3,2.5,1890,6560,2.0,0,0,3,7,1890,0,2003,0,98038,47.3684,-122.031,2390,7570


In [0]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType

# Columns whose text is not a number and must therefore stay as strings.
# `date` values look like "20141013T000000"; casting that to double cannot
# parse, which would silently null out the whole column (or raise when ANSI
# mode is enabled).
non_numeric_columns = {"date"}

# Every remaining string-typed column is treated as numeric and converted.
stringColumns = [
    field.name
    for field in housing_df.schema.fields
    if isinstance(field.dataType, StringType) and field.name not in non_numeric_columns
]

# Build the converted frame in a single projection instead of chaining one
# withColumn call per column; column order is preserved.
columns_to_cast = set(stringColumns)
doubles_df = housing_df.select(*[
    col(name).cast("double").alias(name) if name in columns_to_cast else col(name)
    for name in housing_df.columns
])

columns = "\n - ".join(stringColumns)
print(f"Columns converted from String to Double:\n - {columns}")

In [0]:
# 80/20 split with a fixed seed so the split is repeatable for a given input layout.
train_df, test_df = doubles_df.randomSplit([0.8, 0.2], seed=42)

# Cache each split and count it (the count is what triggers evaluation);
# the training split is processed first, then the test split.
train_row_count = train_df.cache().count()
test_row_count = test_df.cache().count()
print(f"There are {train_row_count} rows in the training set, and {test_row_count} in the test set")

In [0]:
# Same 80/20 split and seed as before, but applied after redistributing the
# data into 24 partitions. The resulting training count is printed, presumably
# so it can be compared with the count from the un-repartitioned split.
repartitioned_df = doubles_df.repartition(24)
train_repartition_df, test_repartition_df = repartitioned_df.randomSplit([0.8, 0.2], seed=42)

print(train_repartition_df.count())

In [0]:
# Summary statistics for the label (price) and the single
# feature (bedrooms) on the training split.
price_bedrooms_df = train_df.select("price", "bedrooms")
display(price_bedrooms_df.summary())

summary,price,bedrooms
count,17349.0,17349.0
mean,539834.6271831229,3.37241339558476
stddev,366746.4071463259,0.9334198408068192
min,78000.0,0.0
25%,320000.0,3.0
50%,450000.0,3.0
75%,645000.0,4.0
max,7060000.0,33.0


In [0]:
from pyspark.ml.feature import VectorAssembler

# Wrap the single predictor column into the vector column "features",
# which the regression stage below is configured to read.
vec_assembler = (
    VectorAssembler()
    .setInputCols(["bedrooms"])
    .setOutputCol("features")
)

vec_train_df = vec_assembler.transform(train_df)

# Show the raw column next to its assembled vector and the label.
preview_columns = ["bedrooms", "features", "price"]
vec_train_df.select(*preview_columns).show(10)

In [0]:
from pyspark.ml.regression import LinearRegression

# Estimator: predict `price` from the assembled `features` vector.
lr = LinearRegression(
    labelCol="price",
    featuresCol="features",
)

# Fit on the training data that already carries the `features` column.
lr_model = lr.fit(vec_train_df)

In [0]:
# One feature was used, so the fitted line has a single coefficient (the slope)
# plus the intercept. Both are rounded to two decimals for display only.
slope, intercept = lr_model.coefficients[0], lr_model.intercept
m, b = round(slope, 2), round(intercept, 2)

print(f"The formula for the linear regression line is price = {m}*bedrooms + {b}")

In [0]:
from pyspark.ml import Pipeline

# Chain feature assembly and regression so both are fitted and applied as one
# unit, starting from the un-assembled training frame.
pipeline_stages = [vec_assembler, lr]
pipeline = Pipeline(stages=pipeline_stages)

pipeline_model = pipeline.fit(train_df)

In [0]:
# Run the fitted pipeline over the held-out split; the selection below expects
# it to add the `features` and `prediction` columns.
pred_df = pipeline_model.transform(test_df)

# Compare actual price with the predicted value for the first few rows.
prediction_columns = ["bedrooms", "features", "price", "prediction"]
pred_df.select(*prediction_columns).show(10)
