In [1]:
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.datasets import fetch_california_housing as fch  # 加利福尼亚房屋价值数据集
import pandas as pd

In [2]:
housevalue = fch()  # needs to download the data; consider running this cell ahead of time

In [3]:
housevalue.data  # feature matrix as a raw NumPy array

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [4]:
housevalue.target  # target values as a NumPy array

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [5]:
housevalue.target_names  # name of the target column

['MedHouseVal']

In [6]:
X = pd.DataFrame(housevalue.data)  # wrap in a DataFrame for easier inspection

In [7]:
X.shape  # (n_rows, n_columns)

(20640, 8)

In [8]:
housevalue.feature_names  # names of the features

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [9]:
X.head()  # columns are still labelled with positional integers at this point

Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [11]:
y = housevalue.target  # target vector the model will be fitted against

In [12]:
y.min()  # smallest observed target value

0.14999

In [13]:
y.max()  # largest observed target value

5.00001

In [14]:
y.shape  # one target value per row of X

(20640,)

MedInc：该街区住户的收入中位数  
HouseAge：该街区房屋房龄（使用年限）的中位数  
AveRooms：该街区平均每户的房间数目  
AveBedrms：该街区平均每户的卧室数目  
Population：街区人口  
AveOccup：该街区平均每户的居住人数  
Latitude：街区的纬度  
Longitude：街区的经度

In [15]:
X.columns = housevalue.feature_names  # label the columns with the feature names
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [16]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=420,
)

In [17]:
Xtest.head()  # row labels are no longer sequential after the shuffled split

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
5156,1.7656,42.0,4.144703,1.031008,1581.0,4.085271,33.96,-118.28
19714,1.5281,29.0,5.09589,1.09589,1137.0,3.115068,39.29,-121.68
18471,4.175,14.0,5.604699,1.045965,2823.0,2.883555,37.14,-121.64
16156,3.0278,52.0,5.172932,1.085714,1663.0,2.500752,37.78,-122.49
7028,4.5,36.0,4.940447,0.98263,1306.0,3.240695,33.95,-118.09


In [18]:
Xtrain.head()  # row labels are no longer sequential after the shuffled split

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
17073,4.1776,35.0,4.425172,1.030683,5380.0,3.368817,37.48,-122.19
16956,5.3261,38.0,6.267516,1.089172,429.0,2.732484,37.53,-122.3
20012,1.9439,26.0,5.768977,1.141914,891.0,2.940594,36.02,-119.08
13072,2.5,22.0,4.916,1.012,733.0,2.932,38.57,-121.31
8457,3.825,34.0,5.036765,1.098039,1134.0,2.779412,33.91,-118.35


In [19]:
# Restore a clean 0..n-1 row index on both feature splits.
for split in (Xtrain, Xtest):
    split.index = range(len(split))

In [20]:
Xtrain.shape  # the training split keeps the rows not held out by test_size=0.3

(14448, 8)

In [21]:
# If you want to standardize the data, do you remember how it should be done?
# Fit the scaler on the training set first, then use that fitted scaler to transform the training set and the test set separately.

In [22]:
reg = LR().fit(Xtrain, Ytrain)  # fit a LinearRegression model on the training split

In [23]:
yhat = reg.predict(Xtest)  # predictions for the held-out test features

In [24]:
yhat.min()  # compare with y.min(): the linear model's output is unbounded, so predictions can fall below the observed range

-0.6528439725035966

In [25]:
yhat.max()  # compare with y.max(): predictions can also exceed the largest observed target

7.146198214270861

In [26]:
reg.coef_  # w, the coefficient vector

array([ 4.37358931e-01,  1.02112683e-02, -1.07807216e-01,  6.26433828e-01,
        5.21612535e-07, -3.34850965e-03, -4.13095938e-01, -4.26210954e-01])

In [27]:
# Pair every feature name with its fitted coefficient.
list(zip(Xtrain.columns, reg.coef_))

[('MedInc', 0.43735893059684033),
 ('HouseAge', 0.010211268294493994),
 ('AveRooms', -0.10780721617317697),
 ('AveBedrms', 0.6264338275363783),
 ('Population', 5.216125353261353e-07),
 ('AveOccup', -0.003348509646333585),
 ('Latitude', -0.41309593789477195),
 ('Longitude', -0.42621095362084704)]

MedInc：该街区住户的收入中位数  
HouseAge：该街区房屋房龄（使用年限）的中位数  
AveRooms：该街区平均每户的房间数目  
AveBedrms：该街区平均每户的卧室数目  
Population：街区人口  
AveOccup：该街区平均每户的居住人数  
Latitude：街区的纬度  
Longitude：街区的经度

In [28]:
reg.intercept_  # fitted intercept term of the linear model

-36.256893229203875