In [59]:
# 环境初始化
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"  # 执行全部行输出命令

### 数据准备

In [60]:
# 加载数据
from sklearn.datasets import load_boston
data = load_boston()
print(data['DESCR'])
# data

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [61]:
# 数据集特征探索
df = pd.DataFrame(data['data'],columns=data['feature_names'])
df['target']=data['target']
df.info(); df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  target   506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


### 数据预处理
1. 数值型特征与类别型特征分离
2. 缺失值填充
3. 重复值处理
4. 异常值检测
5. 类别型特征编码
6. 数值型特征Normalization
7. 特征衍生与降维（可选）
8. 特征筛选
9. 数据集分割

In [62]:
# 数值型特征与类别型特征分离
df_cat = df[['CHAS','RAD']].astype('int').astype('category')
df_num = df.drop(columns=['CHAS','RAD','target'])
# print(df_cat.shape); print(df_num.shape)


#### 类别型特征编码主要有 ordinal encoding 和 one-hot encoding 两种方法

1. ordinal encoding编码
适用于处理类别间具有大小关系的顺序型类别特征，它按类别大小关系，给其赋予一个从1到n的正整数数值ID，将类别型特征转化成数值型哑变量。

2. one-hot encoding
适用于处理类别间不具有大小关系的分类型类别特征，它按特征值类别数量产生一个n维0-1稀疏向量，每种特征值由向量中对应维度为1、其它维度为0表示。
**使得任意两不同类别的编码向量之差相等，使得模型学习时，可以对每种类别一直同仁。**

3. 网传可以通过二进制编码的方法，转化分类型类别特征，得到与one-hot encoding同样的效果，并且向量维数少于one-hot、节省存储空间。
**这种说法是完全不对的，因为二进制编码与ordinal encoding的本质相同：编码向量间存在大小关系，任意不同类别编码向量间差值不等，模型学习时不能对特征的每种类别一视同仁**

In [63]:
# 类别型特征编码
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()
nar_cat = onehot.fit_transform(df_cat).toarray()


In [64]:
# 数值型特征描述性统计分析
df_num.describe()


Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.554695,6.284634,68.574901,3.795043,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.115878,0.702617,28.148861,2.10571,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.385,3.561,2.9,1.1296,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.449,5.8855,45.025,2.100175,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.538,6.2085,77.5,3.20745,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.624,6.6235,94.075,5.188425,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,0.871,8.78,100.0,12.1265,711.0,22.0,396.9,37.97


#### Normalization
超过

In [65]:
# 数值型特征缩放
from sklearn.preprocessing import StandardScaler
standardized = StandardScaler()
nar_num = standardized.fit_transform(df_num)

In [66]:
X = np.concatenate((nar_num, nar_cat), axis=1)
y = data['target'].reshape(len(data['target']),1)

In [None]:
col_cat = ['CHAS'+str(i) for i in df_cat.CHAS.drop_duplicates()]+['RAD'+str(i) for i in df_cat.RAD.drop_duplicates()]
col_num = list(df_num.columns)
col = col_num + col_cat + ['target']
df_rul = pd.DataFrame(np.concatenate((X, y), axis=1), columns=col)
df_rul

In [32]:
df_rul.corr()['target'].sort_values(ascending=False)

target     1.000000
RM         0.695360
ZN         0.360445
B          0.333461
DIS        0.249929
RAD7       0.190053
RAD4       0.187356
CHAS1      0.175260
RAD3       0.167352
RAD2       0.104444
RAD6       0.092802
RAD1       0.040453
RAD8      -0.039411
RAD5      -0.065711
CHAS0     -0.175260
AGE       -0.376955
CRIM      -0.388305
RAD24     -0.396297
NOX       -0.427321
TAX       -0.468536
INDUS     -0.483725
PTRATIO   -0.507787
LSTAT     -0.737663
Name: target, dtype: float64

In [33]:
df.corr()['target'].sort_values(ascending=False)

target     1.000000
RM         0.695360
ZN         0.360445
B          0.333461
DIS        0.249929
CHAS       0.175260
AGE       -0.376955
RAD       -0.381626
CRIM      -0.388305
NOX       -0.427321
TAX       -0.468536
INDUS     -0.483725
PTRATIO   -0.507787
LSTAT     -0.737663
Name: target, dtype: float64

In [37]:
X = df_rul[['RM','PTRATIO','LSTAT']].values
y = data['target'].reshape(len(data['target']),1)

In [67]:
# 数据集分割
def data_split(data, test_ratio=0.2, val_ratio=0):
    index = np.random.choice(range(len(data)), size=len(data), replace=False)
    train_index = index[:int(len(data)*(1-val_ratio-test_ratio))]
    val_index = index[int(len(data)*(1-val_ratio-test_ratio)):int(len(data)*(1-test_ratio))]
    test_index = index[int(len(data)*(1-test_ratio)):]
    return data[train_index], data[test_index], data[val_index]
#
X_train,X_test,_ = data_split(X,test_ratio=0.2)
y_train,y_test,_ = data_split(y,test_ratio=0.2)

In [68]:
# 模型定义
def LinReg_train(X,y,num_epochs,lr):
    num_sample,num_feature = X.shape
    # 初始化
    loss=[]
    W = np.random.normal(0,1,(1,num_feature))
    b = 0
    # 训练
    for i in range(num_epochs):
        y_hat = X.dot(W.T)+b
        # MSE
        ls = np.sum((y_hat-y)**2)/(2*num_sample)
        # ls = (y_hat-y).dot((y_hat-y).T)/(2*num_sample)
        loss.append(ls)
        # 优化（模型参数迭代）
        W = W-lr*(y_hat-y).T.dot(X)/num_sample
        b = b-lr*np.mean(y_hat-y)
    return loss, W, b

In [69]:
# 训练
loss_CV,W,b = LinReg_train(X_train,y_train,100,0.03)

330.0090598629688 [-0.51408222  0.24164372  0.31652053  0.56755098 -0.36144154 -1.07028281
  0.58239043  1.08848557  0.60347992 -0.72085887  1.5419302  -0.63353448
 -1.13096258  0.3470938  -0.81351526 -0.01127438  0.18206738 -0.23847872
 -0.11190008 -0.49462951  0.37449311  1.10064458] 0.7098941408514212
294.8498968445875 [-0.55460951  0.27057601  0.23434217  0.49768135 -0.27956457 -1.1093607
  0.64211189  1.01299221  0.56473471 -0.66176234  1.46316272 -0.01333568
 -1.08481431  0.38319914 -0.79000929  0.04835894  0.32340857 -0.07047701
 -0.07746153 -0.47785451  0.41658976  1.24509392] 1.3762412139933962
264.1784639187471 [-0.58809097  0.29237013  0.1637848   0.43918174 -0.20510413 -1.13757117
  0.69081859  0.94784719  0.53105305 -0.60878861  1.39436699  0.56761965
 -1.04029706  0.41727186 -0.76847198  0.10422047  0.45511188  0.08688449
 -0.04543764 -0.46254672  0.45634419  1.38294402] 2.001713792355285
237.3753385275482 [-0.61580709  0.30831624  0.10300022  0.39019579 -0.13705295 -1.15

In [70]:
# 模型效果评估
loss_CV

[330.0090598629688,
 294.8498968445875,
 264.1784639187471,
 237.3753385275482,
 213.91986052061782,
 193.3707538588421,
 175.35141807161727,
 159.5385952848973,
 145.65350410769264,
 133.454798623186,
 122.73289608897082,
 113.3053460650403,
 105.01300398325999,
 97.7168356295302,
 91.29522387354316,
 85.6416809281121,
 80.66289236434532,
 76.27703575244904,
 72.41232900929886,
 69.005772608286,
 66.00205664449723,
 63.35260897684051,
 61.01476472801512,
 58.95104062286146,
 57.12850020500662,
 55.51819804750014,
 54.094692778177105,
 52.83562015696586,
 51.72131863081678,
 50.73450079755787,
 49.85996506700294,
 49.084342542349404,
 48.39587477782328,
 47.78421861592323,
 47.240274782541476,
 46.75603733134809,
 46.324461388887705,
 45.93934696617531,
 45.595236877345194,
 45.28732704632844,
 45.01138769307042,
 44.76369407527955,
 44.54096562342894,
 44.34031244857108,
 44.15918832696055,
 43.99534937466931,
 43.846817721213185,
 43.71184957533008,
 43.58890714990313,
 43.4766339778

In [71]:
# 预测
def LinReg_price(X,y,W,b):
    num_sample,_ = X.shape
    y_hat = X.dot(W.T)+b
    # MSE
    ls = np.sum((y_hat-y)**2)/(2*num_sample)
    # ls = (y_hat-y).dot((y_hat-y).T)/(2*num_sample)
    return y_hat, ls
#
y_hat,loss = LinReg_price(X_test,y_test,W,b)

In [72]:
loss


47.01096138138392