In [80]:
# 环境初始化
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"  # 执行全部行输出命令

### 数据准备

In [81]:
# 加载数据
from sklearn.datasets import load_boston
data = load_boston()
print(data['DESCR'])
# data

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [82]:
# 数据集特征探索
df = pd.DataFrame(data['data'],columns=data['feature_names'])
df['target']=data['target']
df.info(); df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  target   506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


### 数据预处理
1. 数值型特征与类别型特征分离
2. 缺失值填充
3. 重复值处理
4. 异常值检测
5. 类别型特征编码
6. 数值型特征Normalization
7. 特征衍生与降维（可选）
8. 特征筛选
9. 数据集分割

In [83]:
# 数值型特征与类别型特征分离
df_cat = df[['CHAS','RAD']].astype('int').astype('category')
df_num = df.drop(columns=['CHAS','RAD','target'])
# print(df_cat.shape); print(df_num.shape)


#### 类别型特征编码主要有 ordinal encoding 和 one-hot encoding 两种方法

1. ordinal encoding编码
适用于处理类别间具有大小关系的顺序型类别特征，它按类别大小关系，给其赋予一个从1到n的正整数数值ID，将类别型特征转化成数值型哑变量。

2. one-hot encoding
适用于处理类别间不具有大小关系的分类型类别特征，它按特征值类别数量产生一个n维0-1稀疏向量，每种特征值由向量中对应维度为1、其它维度为0表示。
**使得任意两不同类别的编码向量之差相等，使得模型学习时，可以对每种类别一直同仁。**

3. 网传可以通过二进制编码的方法，转化分类型类别特征，得到与one-hot encoding同样的效果，并且向量维数少于one-hot、节省存储空间。
**这种说法是完全不对的，因为二进制编码与ordinal encoding的本质相同：编码向量间存在大小关系，任意不同类别编码向量间差值不等，模型学习时不能对特征的每种类别一视同仁**

In [84]:
# 类别型特征编码
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()
nar_cat = onehot.fit_transform(df_cat).toarray()


In [85]:
# 数值型特征描述性统计分析
df_num.describe()


Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.554695,6.284634,68.574901,3.795043,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.115878,0.702617,28.148861,2.10571,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.385,3.561,2.9,1.1296,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.449,5.8855,45.025,2.100175,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.538,6.2085,77.5,3.20745,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.624,6.6235,94.075,5.188425,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,0.871,8.78,100.0,12.1265,711.0,22.0,396.9,37.97


#### Normalization


In [86]:
# 数值型特征缩放
from sklearn.preprocessing import StandardScaler
standardized = StandardScaler()
nar_num = standardized.fit_transform(df_num)

In [87]:
X = np.concatenate((nar_num, nar_cat), axis=1)
y = data['target'].reshape(len(data['target']),1)

In [88]:
# 数据集分割
def data_split(data, test_ratio=0.2, val_ratio=0, index=0):
    if isinstance(index,int): index = np.random.choice(range(len(data)), size=len(data), replace=False)
    train_index = index[:int(len(data)*(1-val_ratio-test_ratio))]
    val_index = index[int(len(data)*(1-val_ratio-test_ratio)):int(len(data)*(1-test_ratio))]
    test_index = index[int(len(data)*(1-test_ratio)):]
    return data[train_index], data[test_index], data[val_index], index
#
X_train,X_test,_,X_index = data_split(X,test_ratio=0.2)
y_train,y_test,_,y_index = data_split(y,test_ratio=0.2)

In [89]:
# 模型定义
def LinReg_train(X,y,num_epochs,lr):
    num_sample,num_feature = X.shape
    # 初始化
    W = np.random.normal(0,1,(1,num_feature))
    b = 0
    loss=[0]
    # 训练
    for i in range(num_epochs):
        y_hat = X.dot(W.T)+b
        # MSE
        ls = np.sum((y_hat-y)**2)/(2*num_sample)
        # ls = (y_hat-y).dot((y_hat-y).T)/(2*num_sample)
        loss.append(ls)
        if abs(loss[-1]-loss[-2])<=0.001: return loss, W, b    # 已充分训练判定
        # 优化（模型参数迭代）
        W = W-lr*(y_hat-y).T.dot(X)/num_sample
        b = b-lr*np.mean(y_hat-y)
    return loss[1:], W, b

# 预测
def LinReg_price(X,y,W,b):
    num_sample,_ = X.shape
    y_hat = X.dot(W.T)+b
    # MSE
    ls = np.sum((y_hat-y)**2)/(2*num_sample)
    # ls = (y_hat-y).dot((y_hat-y).T)/(2*num_sample)
    return y_hat, ls

# 训练
Num=100
ls=0
loss_=0
for _ in range(Num):
    loss_CV,W,b = LinReg_train(X_train,y_train,1000000,0.01)
    # 模型效果评估
    ls+=loss_CV[-1]
    #预测
    y_hat,loss = LinReg_price(X_test,y_test,W,b)
    loss_+=loss
print(ls/Num, loss_/Num)

40.21987621936163 47.70060686342776


In [90]:
#
col_cat = ['CHAS'+str(i) for i in df_cat.CHAS.drop_duplicates()]+['RAD'+str(i) for i in df_cat.RAD.drop_duplicates()]
col_num = list(df_num.columns)
col = col_num + col_cat + ['target']
df_rul = pd.DataFrame(np.concatenate((X, y), axis=1), columns=col)
# df_rul.corr()['target'].sort_values(ascending=False)

In [91]:
X = np.concatenate((nar_num, df_rul[['RAD24']].values), axis=1)
y = data['target'].reshape(len(data['target']),1)
#
X_train,X_test,_,_ = data_split(X,test_ratio=0.2,index=X_index)
y_train,y_test,_,_ = data_split(y,test_ratio=0.2,index=y_index)
#
ls=0
loss_=0
for _ in range(Num):
    loss_CV,W,b = LinReg_train(X_train,y_train,1000000,0.03)
    # 模型效果评估
    ls+=loss_CV[-1]
    #预测
    y_hat,loss = LinReg_price(X_test,y_test,W,b)
    loss_+=loss
print(ls/Num, loss_/Num)

40.4757241977359 45.72488318492


In [92]:
X = df_rul[['RM','PTRATIO','LSTAT']].values
y = data['target'].reshape(len(data['target']),1)
#
X_train,X_test,_,_ = data_split(X,test_ratio=0.2,index=X_index)
y_train,y_test,_,_ = data_split(y,test_ratio=0.2,index=y_index)
#
ls=0
loss_=0
for _ in range(Num):
    loss_CV,W,b = LinReg_train(X_train,y_train,1000000,0.03)
    # 模型效果评估
    ls+=loss_CV[-1]
    #预测
    y_hat,loss = LinReg_price(X_test,y_test,W,b)
    loss_+=loss
print(ls/Num, loss_/Num)

41.96333020786838 44.49434677548148
