# <center>Housing Price Prediction</center> 

In [1]:
from pyspark.sql import SparkSession  # 建立spark 之间的连接

In [2]:
spark = SparkSession.builder.appName("LinearRegressionQuiz").getOrCreate()

In [3]:
#读入数据
data = spark.read.csv('./data/boston_housing.csv', header=True, inferSchema=True)

In [4]:
data.count()

506

In [5]:
#返回的是一个列， 都有是么样的列
data.columns

['crim',
 'zn',
 'indus',
 'chas',
 'nox',
 'rm',
 'age',
 'dis',
 'rad',
 'tax',
 'ptratio',
 'b',
 'lstat',
 'medv']

In [6]:
len(data.columns)  # 求有多少列，列表的长度

14

In [7]:
data.show()

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|
|0.14455|12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|  5|311|   15.2| 396.9|19.15|27.1|
|0.21124|12.5| 7.87|   0|0.524|5.631|100.0|6.0821|  5|311|   15.2

* CRIM: per capita crime rate by town
* ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
* INDUS: proportion of non-retail business acres per town.
* CHAS: Charles River dummy variable (1 if tract bounds river; 0 otherwise)
* NOX: nitric oxides concentration (parts per 10 million)
* RM: average number of rooms per dwelling
* AGE: proportion of owner-occupied units built prior to 1940
* DIS: weighted distances to five Boston employment centres
* RAD: index of accessibility to radial highways
* TAX: full-value property-tax rate per $10,000
* PTRATIO: pupil-teacher ratio by town
* B: 1000(Bk — 0.63)² where Bk is the proportion of blacks by town
* LSTAT: lower status of the population
* MEDV: Median value of owner-occupied homes in thousand dollers

In [43]:
data.printSchema()  # 查看每一列数据的  格式

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- b: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [9]:
#feature data
feature_columns = data.columns[:-1]  #  feature 包含的数据  把第0列到第13列全部拿出来给 feature
feature_columns

['crim',
 'zn',
 'indus',
 'chas',
 'nox',
 'rm',
 'age',
 'dis',
 'rad',
 'tax',
 'ptratio',
 'b',
 'lstat']

In [10]:
# 把这些东西全部放到一个向量里面，做vector的计算，
from pyspark.ml.feature import VectorAssembler
            #这个向量数据输入inputCols=feature_columns  数据输出为 outputCol="features" 生成的新的column 叫做feature.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")  # 生成的 outputCol

In [11]:
data2 = assembler.transform(data)   # assembler做在data 2 里面做transform,产生的新的数据放进data2,
                                    # 也就比原来多了feature 一列，计算机开始计算时就只看 feature 这一列

In [14]:
data2.show(20,False)

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+----------------------------------------------------------------------------+
|crim   |zn  |indus|chas|nox  |rm   |age  |dis   |rad|tax|ptratio|b     |lstat|medv|features                                                                    |
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+----------------------------------------------------------------------------+
|0.00632|18.0|2.31 |0   |0.538|6.575|65.2 |4.09  |1  |296|15.3   |396.9 |4.98 |24.0|[0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98]     |
|0.02731|0.0 |7.07 |0   |0.469|6.421|78.9 |4.9671|2  |242|17.8   |396.9 |9.14 |21.6|[0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14]    |
|0.02729|0.0 |7.07 |0   |0.469|7.185|61.1 |4.9671|2  |242|17.8   |392.83|4.03 |34.7|[0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03]   |
|0.03237|0.0 |2.18 |0   |0.4

In [12]:
data2.show()

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|            features|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|[0.02729,0.0,7.07...|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|[0.03237,0.0,2.18...|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|[0.06905,0.0,2.18...|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|[0.02985,0.0,2.18...|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5

In [15]:
#套路，把数据分成 训练数据和测试数据  固定写法

train, test = data2.randomSplit([0.8, 0.2])

In [16]:
#把LinearRegression这个包调入，有了这个包就可以做
from pyspark.ml.regression import LinearRegression

In [17]:
# train the 1st model, regular linear regression
# 线性回归   LinearRegression是一个类，然后会实例化，变成一个对象lr
lr = LinearRegression(featuresCol="features", labelCol="medv", standardization=False)  # featuresCol="features", labelCol="medv"直接进行赋值

#standardization=False  false 是不做标准化，标准化 (x - min) / max - min

model = lr.fit(train)
model.coefficients  #  coefficients 回归系数  对应的是 beta1..... beta 13

DenseVector([-0.1158, 0.0515, 0.0217, 2.1177, -18.2743, 3.3538, 0.0111, -1.4584, 0.3654, -0.0156, -0.9725, 0.0085, -0.5612])

In [18]:
# train the 2nd model, lasso regression, add L1 regularization 
# 做拉锁回归  regParam=0.5 = lambder   elasticNetParam = 1.0 会让不重要的项的系数变成0  
#regParam=0.5 这个系数越大会减少更多的系数，系数变成0的项也越多
#当 elasticNetParam越大的时候会选择出更加重要的项

lr_lasso = LinearRegression(featuresCol="features", labelCol="medv", regParam=0.5, elasticNetParam = 1.0, standardization = True)

model_lasso = lr_lasso.fit(train)
model_lasso.coefficients

DenseVector([-0.0244, 0.0, 0.0, 1.0154, 0.0, 3.8834, 0.0, -0.1025, 0.0, 0.0, -0.7267, 0.0055, -0.5385])

In [19]:
# train the 3rd model, ridge regression, add L2 regularization
# 做 岭回归     系数会变小
lr_ridge = LinearRegression(featuresCol="features", labelCol="medv", regParam=0.2, elasticNetParam = 0.0)
model_ridge = lr_ridge.fit(train)
model_ridge.coefficients

DenseVector([-0.1063, 0.0449, -0.0111, 2.2711, -15.8437, 3.4972, 0.0076, -1.3327, 0.2741, -0.0112, -0.9263, 0.0085, -0.5387])

In [20]:
# train the 4th model, ridge regression, add L2 regularization 
# 弹性网络回归  上面两种回归的结合  另外一个 惩罚项是   1- 0.8  百分之80来自拉锁回归 百分之二十来自岭回归

lr_en = LinearRegression(featuresCol="features", labelCol="medv", regParam=0.3, elasticNetParam = 0.8)

model_en = lr_en.fit(train)
model_en.coefficients  # 回归系数

DenseVector([-0.0477, 0.0158, -0.0058, 1.8136, -7.5511, 3.8744, 0.0, -0.7248, 0.0, 0.0, -0.7941, 0.0067, -0.5389])

In [21]:
#经过上面的步骤，我们产生了4个model  线性回归  拉锁回归  岭回归  弹性网络，然后接下来就是做预测
#用第一个线性回归做的 预测
predictions = model.transform(test)
predictions.select(predictions.columns[13:]).show()

+----+--------------------+------------------+
|medv|            features|        prediction|
+----+--------------------+------------------+
|22.0|[0.01096,55.0,2.2...| 27.35239904506532|
|31.6|[0.01432,100.0,1....| 33.80514327629332|
|29.1|[0.01439,60.0,2.9...|31.615690041667715|
|23.1|[0.0187,85.0,4.15...| 25.37913735427707|
|26.6|[0.02899,40.0,1.2...| 21.76488086848516|
|34.9|[0.03359,75.0,2.9...|34.276592235090874|
|48.5|[0.0351,95.0,2.68...| 42.15704698457762|
|20.9|[0.03548,80.0,3.6...|21.871042589963647|
|45.4|[0.03578,20.0,3.3...|38.954496051877314|
|35.4|[0.03705,20.0,3.3...| 34.56717147035438|
|34.6|[0.03768,80.0,1.5...| 34.58751233899849|
|22.0|[0.03932,0.0,3.41...| 27.58054795279234|
|22.9|[0.04203,28.0,15....|29.168051832382652|
|20.6|[0.04294,28.0,15....|27.510669453486322|
|24.8|[0.04297,52.5,5.3...| 27.01490743756918|
|20.6|[0.04527,0.0,11.9...| 22.48842682268969|
|23.3|[0.0456,0.0,13.89...|26.225478784411095|
|30.3|[0.04666,80.0,1.5...| 32.57041466397976|
|24.6|[0.0542

In [22]:
#拉锁回归做预测
predictions = model_lasso.transform(test)
predictions.select(predictions.columns[13:]).show()

+----+--------------------+------------------+
|medv|            features|        prediction|
+----+--------------------+------------------+
|22.0|[0.01096,55.0,2.2...| 27.63692099792868|
|31.6|[0.01432,100.0,1....|31.382164376805488|
|29.1|[0.01439,60.0,2.9...|30.090394399013505|
|23.1|[0.0187,85.0,4.15...| 26.86032583011624|
|26.6|[0.02899,40.0,1.2...| 27.40715500735677|
|34.9|[0.03359,75.0,2.9...|31.239480230359938|
|48.5|[0.0351,95.0,2.68...| 36.10288003901449|
|20.9|[0.03548,80.0,3.6...|23.853399689344744|
|45.4|[0.03578,20.0,3.3...| 35.86949717123409|
|35.4|[0.03705,20.0,3.3...|32.084676349001995|
|34.6|[0.03768,80.0,1.5...|33.639519246069604|
|22.0|[0.03932,0.0,3.41...| 26.07486835661335|
|22.9|[0.04203,28.0,15....| 25.89856930129013|
|20.6|[0.04294,28.0,15....|23.856209406287366|
|24.8|[0.04297,52.5,5.3...|26.309122555912595|
|20.6|[0.04527,0.0,11.9...|22.269577501895647|
|23.3|[0.0456,0.0,13.89...| 23.23435077536154|
|30.3|[0.04666,80.0,1.5...| 31.71007180354019|
|24.6|[0.0542

In [23]:
#岭回归做预测
predictions = model_ridge.transform(test)
predictions.select(predictions.columns[13:]).show()

+----+--------------------+------------------+
|medv|            features|        prediction|
+----+--------------------+------------------+
|22.0|[0.01096,55.0,2.2...| 27.55344727766984|
|31.6|[0.01432,100.0,1....| 33.27969685565337|
|29.1|[0.01439,60.0,2.9...|31.493072926895014|
|23.1|[0.0187,85.0,4.15...|25.627780904715117|
|26.6|[0.02899,40.0,1.2...|22.747169398142596|
|34.9|[0.03359,75.0,2.9...|33.919672335310636|
|48.5|[0.0351,95.0,2.68...|41.331025713102676|
|20.9|[0.03548,80.0,3.6...| 22.21347247936422|
|45.4|[0.03578,20.0,3.3...| 38.37637779675731|
|35.4|[0.03705,20.0,3.3...|34.050273952782184|
|34.6|[0.03768,80.0,1.5...|  34.6579302328578|
|22.0|[0.03932,0.0,3.41...|27.553115572550904|
|22.9|[0.04203,28.0,15....| 28.49924634298668|
|20.6|[0.04294,28.0,15....|26.779653270011156|
|24.8|[0.04297,52.5,5.3...|26.820482097953583|
|20.6|[0.04527,0.0,11.9...|  22.5065079234501|
|23.3|[0.0456,0.0,13.89...|25.955084252293574|
|30.3|[0.04666,80.0,1.5...| 32.66684913351512|
|24.6|[0.0542

In [24]:
#弹性网络做预测
predictions = model_en.transform(test)
predictions.select(predictions.columns[13:]).show()

+----+--------------------+------------------+
|medv|            features|        prediction|
+----+--------------------+------------------+
|22.0|[0.01096,55.0,2.2...| 27.77766203024618|
|31.6|[0.01432,100.0,1....|31.450831111168068|
|29.1|[0.01439,60.0,2.9...|30.850682500325465|
|23.1|[0.0187,85.0,4.15...|26.220701485929634|
|26.6|[0.02899,40.0,1.2...|25.784445776174042|
|34.9|[0.03359,75.0,2.9...| 32.38043124145209|
|48.5|[0.0351,95.0,2.68...|38.058604174907934|
|20.9|[0.03548,80.0,3.6...| 23.09917595715976|
|45.4|[0.03578,20.0,3.3...| 36.67607011421642|
|35.4|[0.03705,20.0,3.3...| 32.56219045765232|
|34.6|[0.03768,80.0,1.5...| 34.23725204902557|
|22.0|[0.03932,0.0,3.41...|27.036515999790474|
|22.9|[0.04203,28.0,15....| 27.04189133537175|
|20.6|[0.04294,28.0,15....|25.034124303507504|
|24.8|[0.04297,52.5,5.3...|26.147694804771074|
|20.6|[0.04527,0.0,11.9...|22.838716246799535|
|23.3|[0.0456,0.0,13.89...| 24.55579326645227|
|30.3|[0.04666,80.0,1.5...| 32.26215293496807|
|24.6|[0.0542

In [26]:
#分别对上面4个model 做evaluate
evaluation_summary_regular = model.evaluate(test)          #  12全部评价一下 包括平方和，方差等等
evaluation_summary_lasso   = model_lasso.evaluate(test)
evaluation_summary_ridge   = model_ridge.evaluate(test)
evaluation_summary_en      = model_en.evaluate(test)

In [27]:
evaluation_summary_regular.rootMeanSquaredError  #平方和 然后开根号

4.90381184751194

In [35]:
evaluation_summary_regular.r2  # r  的值越接近 1 越好

0.7429876743659836

In [36]:
evaluation_summary_lasso.rootMeanSquaredError

5.188575457367097

In [37]:
evaluation_summary_lasso.r2

0.7122716677342387

In [38]:
evaluation_summary_ridge.rootMeanSquaredError

4.848226775853608

In [39]:
evaluation_summary_ridge.r2

0.748781160033186

In [40]:
evaluation_summary_en.rootMeanSquaredError

4.933135217852435

In [41]:
evaluation_summary_en.r2

0.7399047662996867

In [42]:
# 交叉验证

<hr/>

In [43]:
import numpy as np

# dot product. sum of squares - regular linear regression
np.dot(model.coefficients, model.coefficients)

353.21922489990465

In [44]:
# dot product. sum of squares - ridge regression
np.dot(model_ridge.coefficients, model_ridge.coefficients)

271.42570957837313

In [45]:
# sum of absolute value - lasso regression

sum = 0.0
for b in model_lasso.coefficients:
    sum += abs(b)
print(sum)

6.296435468609746
