# 案例：波士顿房价预测

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

## 1、获取数据
### 1.1 波士顿房价数据在sklearn中已经内置，可以通过load_boston()方法获得

In [2]:
boston = load_boston()

##### 特征含义
CRIM：城镇人均犯罪率。<br/>
ZN：住宅用地超过 25000 sq.ft. 的比例。<br/>
INDUS：城镇非零售商用土地的比例。<br/>
CHAS：查理斯河空变量（如果边界是河流，则为1；否则为0）。<br/>
NOX：一氧化氮浓度。<br/>
RM：住宅平均房间数。<br/>
AGE：1940 年之前建成的自用房屋比例。<br/>
DIS：到波士顿五个中心区域的加权距离。<br/>
RAD：辐射性公路的接近指数。<br/>
TAX：每 10000 美元的全值财产税率。<br/>
PTRATIO：城镇师生比例。<br/>
B：1000（Bk-0.63）^ 2，其中 Bk 指代城镇中黑人的比例。<br/>
LSTAT：人口中地位低下者的比例。<br/>
MEDV：自住房的平均房价，以千美元计。<br/>

In [3]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
print(boston)

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]]), 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 1

##### 取特征X和标签y

In [5]:
X = boston.data
y = boston.target

### 1.2 从文件读取
绝大多数情况，数据是存在文件中的，如excel。所以我们也可以从文件中读取数据。一般使用pandas读取。

In [6]:
import pandas as pd

In [7]:
df = pd.read_excel('data2/boston.xls')
df

Unnamed: 0.1,Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,price
0,0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
5,5,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.60,12.43,22.9
7,7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.90,19.15,27.1
8,8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
9,9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.10,18.9


In [8]:
X = df[df.columns[0:-1]]
y = df[df.columns[-1]]


## 2、数据预处理(数据清洗)


我们获取的数据有可能存在下面的一些情况：
  - 缺少数据值
  - 含有错误的数据值，如年龄=200
  - 数据不一致，等级编码有的是“1，2，3”有的却是“A，B，C ”
  - 重复的记录值
  
$\color{red}{注意：本门课程关注的是机器学习算法，而波士顿房价数据也是清理过得，所以该部分不用写代码进行处理}$

## 3、数据分析与可视化

$\color{red}{注意：本门课程关注的是机器学习算法，不是数据分析，因此忽略数据分析与可视化部分}$

## 4、选择合适的机器学习模型

该问题是房价预测问题，线性回归能很好的应用于预测问题，因此我们选择使用线性回归模型

In [11]:
model = linear_model.Ridge(alpha=0.1)
model.fit(X,y)
y_hat = model.predict(X)
y_hat

array([30.46316899, 25.29093644, 30.93662342, 29.051484  , 28.35468228,
       25.62154564, 23.41451671, 19.82734417, 11.748109  , 19.23990707,
       19.29269448, 21.92428678, 21.3898745 , 20.04738935, 19.69266126,
       19.8013982 , 21.13480473, 17.30846756, 16.713949  , 18.84252579,
       12.82132582, 18.03321356, 16.17772727, 14.10267919, 16.00647381,
       13.72037107, 15.79807424, 15.04108579, 19.89430115, 21.25499443,
       11.7401512 , 18.3609943 ,  9.11770364, 14.56846924, 13.98156473,
       24.15740395, 22.69663086, 23.54087565, 23.38581911, 31.88295108,
       34.78193404, 28.46558489, 25.60745714, 25.00960007, 23.20646856,
       22.36718993, 20.68648627, 18.10426604,  9.08081316, 17.34364633,
       21.53160525, 24.17356035, 28.03200117, 24.39379572, 15.85841779,
       31.66555986, 25.28148374, 33.53710379, 22.255615  , 21.48578272,
       18.18410125, 18.72624581, 24.33227609, 22.98688692, 23.57117262,
       30.82377651, 25.90858955, 21.46544562, 17.69163341, 21.08

我们如何选择参数alpha呢？

## 5、训练模型(使用交叉验证选择合适的参数)


In [10]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [12]:
from sklearn.model_selection import GridSearchCV


In [16]:
ridge_model = linear_model.Ridge()
param = {'alpha':[0.01,0.03,0.05,0.07,0.1,0.5,0.8,1],'normalize':[True,False]}
gsearch = GridSearchCV(estimator=ridge_model,param_grid=param,cv=5,scoring='neg_mean_squared_error')
gsearch.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.01, 0.03, 0.05, 0.07, 0.1, 0.5, 0.8, 1],
                         'normalize': [True, False]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [17]:
gsearch.best_params_,gsearch.best_score_

({'alpha': 0.03, 'normalize': True}, -25.894320894991033)

## 6、模型评价

In [18]:
final_model = linear_model.Ridge(alpha=0.01,normalize=True)
final_model.fit(X_train,y_train)

y_train_hat = final_model.predict(X_train)
y_test_hat = final_model.predict(X_test)

print("train-MSE=",mean_squared_error(y_train,y_train_hat))
print("test-MSE=",mean_squared_error(y_test,y_test_hat))

train-MSE= 21.682888410980553
test-MSE= 23.286464870147363


## 7、上线部署使用

1、模型保存

In [19]:
from sklearn.externals import joblib
joblib.dump(final_model,"house_train_model.m")



['house_train_model.m']

2、模型读取

In [20]:
load_model = joblib.load("house_train_model.m")

In [21]:
load_model.predict(X_test)

array([17.52591785, 24.77180856, 24.05330998,  8.56655037, 19.73574177,
       27.27240349, 36.51322446, 19.29249287, 18.41893337, 27.54979965,
       19.89892807, 23.7600317 , 35.37481278, 12.77130082, 19.57071772,
       18.73617067, 19.88966673, 12.20714162, 14.08501097, 19.70992642,
       21.97399013, 20.79108294, 31.31349186, 30.82574396, 24.31953877,
       19.15319475, 16.14222207, 16.13309618, 22.51208472, 36.30521567,
       24.74242609, 20.64004609, 21.35423697, 16.47259242, 21.86607612,
       23.79115392, 41.29850616, 27.14631047, 20.44986781, 33.55782146,
        8.84609479, 36.25302753, 19.92071434, 26.70707877, 17.45470213,
       21.57669991, 16.72062086, 19.72367981, 18.78546893, 19.01682191,
       19.96302641, 19.27201499, 17.23271772,  9.79470077, 22.09585329,
       21.31120412, 26.21262029, 26.05911971, 22.94754087, 35.88426933,
       16.37033041, 35.82997792, 20.02046467, 12.45788643, 28.73843532,
       30.18096673, 24.40934556, 21.03326934, 18.93939831, 22.92