# Boston房价预测

## 载入数据集

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import *

In [2]:
def loadDataSet(route=r'D:\CS\dataset\competition\train.csv'):
    dataSet = pd.read_csv(route)
    return dataSet

In [3]:
dataSet = loadDataSet()

In [4]:
dataSet.head()

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
3,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
4,7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9


In [5]:
X = dataSet[dataSet.columns[1:-1]]
y = dataSet['medv']

In [27]:
X.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
3,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33
4,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43


13个特征

In [24]:
y.head()

0    24.0
1    21.6
2    33.4
3    36.2
4    22.9
Name: medv, dtype: float64

## 数据预处理

In [8]:
rf = RandomForestRegressor(n_estimators=300, min_samples_split=2, n_jobs=4, oob_score=True, max_depth=20, random_state=10)
rf.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=4,
           oob_score=True, random_state=10, verbose=0, warm_start=False)

In [9]:
rf.feature_importances_

array([0.04016859, 0.00095294, 0.00538455, 0.00136589, 0.01377651,
       0.45478147, 0.01459543, 0.06868596, 0.00722839, 0.01241007,
       0.00984075, 0.01138115, 0.3594283 ])

In [10]:
rf.oob_score_

0.8698696867122027

In [11]:
feature_importance = (rf.feature_importances_ >= 0.01).astype(int)

In [12]:
feature_choosen = [X.columns[i] for i in range(0, len(X.columns)) if feature_importance[i] != 0]
print('these features are userful:', feature_choosen)

these features are userful: ['crim', 'nox', 'rm', 'age', 'dis', 'tax', 'black', 'lstat']


In [22]:
new_features = feature_choosen
new_X = dataSet[new_features]
new_X.head()

Unnamed: 0,crim,nox,rm,age,dis,tax,black,lstat
0,0.00632,0.538,6.575,65.2,4.09,296,396.9,4.98
1,0.02731,0.469,6.421,78.9,4.9671,242,396.9,9.14
2,0.03237,0.458,6.998,45.8,6.0622,222,394.63,2.94
3,0.06905,0.458,7.147,54.2,6.0622,222,396.9,5.33
4,0.08829,0.524,6.012,66.6,5.5605,311,395.6,12.43


以上步骤是去除unuseful features

## Prediction

In [14]:
gbdt = GradientBoostingRegressor(n_estimators=20, max_depth=10, learning_rate=0.4, min_samples_leaf=3)
gbdt.fit(new_X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.4, loss='ls', max_depth=10, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=20, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [15]:
testSet = loadDataSet(r'D:\CS\dataset\competition\test.csv')
testSet = testSet[new_features]
pred_y = gbdt.predict(testSet)

In [16]:
pred_y

array([33.30663458, 26.62066749, 15.0523592 , 12.06653239, 16.42421091,
       19.91437808, 20.82492663, 15.4591628 , 15.6170814 , 19.85111142,
       17.16005789, 23.72582023, 17.34758837, 14.01194381, 20.58564585,
       23.40349688, 21.46838356, 30.84453781, 14.1472635 , 23.61924279,
       21.33490491, 23.79787931, 21.13769459, 21.20209709, 20.73144863,
       20.68512034, 23.20964571, 22.38674337, 22.48730439, 27.83994731,
       43.63822628, 43.85781323, 31.0738582 , 20.15565042, 18.8585499 ,
       20.11411096, 17.88701368, 16.43595077, 18.87125175, 20.55028543,
       16.33573622, 20.2850218 , 19.32421422, 17.9490976 , 16.69212653,
       14.31545446, 13.62904966, 16.35758961, 20.95811312, 22.83361673,
       17.75146745, 28.47574168, 49.23194755, 21.53549889, 19.58445492,
       21.8945284 , 31.06472379, 47.25827801, 35.11228501, 33.33026225,
       33.04190966, 44.63785992, 19.45947987, 16.00233533, 17.56494678,
       21.08602756, 25.39286876, 22.04469772, 17.89110833, 21.82

In [17]:
submissionSet = loadDataSet(r'D:\CS\dataset\competition\submission_example.csv')

In [18]:
submissionSet['medv'] = pd.Series(pred_y)

In [19]:
submissionSet

Unnamed: 0,ID,medv
0,3,33.306635
1,6,26.620667
2,8,15.052359
3,9,12.066532
4,10,16.424211
5,18,19.914378
6,20,20.824927
7,25,15.459163
8,26,15.617081
9,27,19.851111


In [20]:
submissionSet.to_csv(r'D:\CS\dataset\competition\result.csv', index=False) 

提交结果的rmse是4.1左右