In [29]:
# 环境初始化
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"  # 执行全部行输出命令

### 数据准备

In [30]:
# 加载数据
from sklearn.datasets import load_boston
data = load_boston()
print(data['DESCR'])
# data

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [31]:
# 数据集特征探索
df = pd.DataFrame(data['data'],columns=data['feature_names'])
df.info(); df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
dtypes: float64(13)
memory usage: 51.5 KB


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


### 数据预处理
1. 数值型特征与类别型特征分离
2. 缺失值填充
3. 异常值检测
4. 类别型特征编码
5. 数值型特征缩放
6. 特征衍生与降维（可选）
7. 特征筛选
8. 数据集分割

In [32]:
# 数值型特征与类别型特征分离
df_cat = df[['CHAS','RAD']].astype('int').astype('category')
df_num = df.drop(columns=['CHAS','RAD'])
# print(df_cat.shape); print(df_num.shape)

In [33]:
# 类别型特征编码
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()
nar_cat = onehot.fit_transform(df_cat).toarray()

In [34]:
# 数值型特征描述性统计分析
df_num.describe()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.554695,6.284634,68.574901,3.795043,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.115878,0.702617,28.148861,2.10571,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.385,3.561,2.9,1.1296,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.449,5.8855,45.025,2.100175,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.538,6.2085,77.5,3.20745,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.624,6.6235,94.075,5.188425,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,0.871,8.78,100.0,12.1265,711.0,22.0,396.9,37.97


In [35]:
# 数值型特征缩放
from sklearn.preprocessing import StandardScaler
standardized = StandardScaler()
nar_num = standardized.fit_transform(df_num)


In [36]:
# 数据集分割
def data_split(data, test_ratio=0.2, val_ratio=0):
    index = np.random.choice(range(len(data)), size=len(data), replace=False)
    train_index = index[:int(len(data)*(1-val_ratio-test_ratio))]
    val_index = index[int(len(data)*(1-val_ratio-test_ratio)):int(len(data)*(1-test_ratio))]
    test_index = index[int(len(data)*(1-test_ratio)):]
    return data[train_index], data[test_index], data[val_index]
#
X = np.concatenate((nar_num, nar_cat), axis=1)
y = data['target'].reshape(len(data['target']),1)
X_train,X_test,_ = data_split(X,test_ratio=0.2)
y_train,y_test,_ = data_split(y,test_ratio=0.2)

In [37]:
# 模型定义
def LinReg_train(X,y,num_epochs,lr):
    num_sample,num_feature = X.shape
    # 初始化
    loss=[]
    W = np.random.normal(0,1,(1,num_feature))
    b = 0
    # 训练
    for i in range(num_epochs):
        y_hat = X.dot(W.T)+b
        # MSE
        ls = np.sum((y_hat-y)**2)/(2*num_sample)
        # ls = (y_hat-y).dot((y_hat-y).T)/(2*num_sample)
        loss.append(ls)
        # 优化（模型参数迭代）
        W = W-lr*(y_hat-y).T.dot(X)/num_sample
        b = b-lr*np.mean(y_hat-y)
    return loss, W, b

In [38]:
# 训练
loss_CV,W,b = LinReg_train(X_train,y_train,100,0.03)

In [39]:
# 模型效果评估
loss_CV

[273.50189872122814,
 245.74304378550056,
 221.46443016580707,
 200.2021974356224,
 181.56227028091743,
 165.20761766472776,
 150.84839981062663,
 138.23423238592292,
 127.14802223334314,
 117.40098555881336,
 108.82856878834474,
 101.28706892825808,
 94.65080423892678,
 88.80972428476842,
 83.6673757292186,
 79.13915990301463,
 75.15083247041285,
 71.63720603323472,
 68.54102434587118,
 65.81198273100243,
 63.405873820749875,
 61.28384127672471,
 59.411726929863306,
 57.75949901511875,
 56.30075099155884,
 55.01226193232477,
 53.873610712067226,
 52.86683726417361,
 51.97614506541606,
 51.18763976122282,
 50.48909949341648,
 49.86977305179137,
 49.32020245545566,
 48.832066990813736,
 48.39804609968462,
 48.01169883100003,
 47.66735784918816,
 47.36003623711002,
 47.085345545842856,
 46.83942373160708,
 46.6188717850694,
 46.42069800302349,
 46.242268979567505,
 46.08126650554924,
 45.93564966313962,
 45.803621488588114,
 45.683599651961295,
 45.57419066924406,
 45.474167220704956,
 4

In [40]:
# 预测
def LinReg_price(X,y,W,b):
    _,num_sample = X.shape
    y_hat = X.dot(W.T)+b
    # MSE
    ls = np.sum((y_hat-y)**2)/(2*num_sample)
    # ls = (y_hat-y).dot((y_hat-y).T)/(2*num_sample)
    return y_hat, ls
#
y_hat,loss = LinReg_price(X_test,y_test,W,b)

In [41]:
loss


221.45919394945267