## 使用 OLSLinearRegression/GDLinearRegression 预测红酒口感

### 准备数据

In [1]:
import numpy as np

In [2]:
# Load the wine-quality CSV: fields are separated by ';' and the single
# header row is skipped.
data = np.genfromtxt(
    'dataset/winequality-red.csv',
    delimiter=';',
    skip_header=1,
)
# Feature matrix: every column except the last one.
X = data[:, :-1]
X

array([[ 7.4  ,  0.7  ,  0.   , ...,  3.51 ,  0.56 ,  9.4  ],
       [ 7.8  ,  0.88 ,  0.   , ...,  3.2  ,  0.68 ,  9.8  ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  3.26 ,  0.65 ,  9.8  ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  3.42 ,  0.75 , 11.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  3.57 ,  0.71 , 10.2  ],
       [ 6.   ,  0.31 ,  0.47 , ...,  3.39 ,  0.66 , 11.   ]])

In [3]:
# Target vector: the last column of the loaded table.
y = data[:, -1]
y

array([5., 5., 5., ..., 6., 5., 6.])

### 模型训练与测试

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing. random_state pins the shuffle so
# that "Restart Kernel -> Run All" reproduces the same split, and therefore
# the same metrics, on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

#### 使用 OLSLinearRegression

In [5]:
from linear_regression import OLSLinearRegression
# Create the model: an OLSLinearRegression from the project-local
# linear_regression module, built with its default arguments.
ols_lr = OLSLinearRegression()

In [6]:
# Train the model on the training split.
ols_lr.train(X_train, y_train)

In [7]:
# Predict on the held-out test features and print the raw predictions.
y_pred = ols_lr.predict(X_test)
print(y_pred)

[5.19949366 5.19200469 5.37299894 5.97961538 5.70461562 5.81195376
 5.91361891 5.02260622 5.4070703  6.21315142 5.88643704 6.65692252
 5.80143157 5.68691102 6.16026974 6.45380337 5.92857293 5.28945339
 5.44424214 5.30478465 6.4061314  5.62666303 5.26706272 5.12015464
 6.01646271 5.38798096 5.68696837 5.05696541 5.93744985 5.23902573
 4.92562889 5.52809647 6.09900935 6.38338581 5.69758526 5.17890552
 5.56650569 6.20181361 5.79605017 5.57790364 5.19793443 5.84428259
 5.90756364 5.28615708 5.30922507 6.21046369 5.35427708 5.31809233
 5.60073092 5.44169969 5.60789975 6.23845527 6.20556585 5.41547264
 5.5228931  5.58753605 5.64351098 6.42287043 4.80708182 4.91860034
 5.25816384 5.82681623 5.31147294 5.23675715 5.27146928 6.42073908
 5.36541979 5.19942465 6.06268461 5.59421755 6.08814459 5.32108176
 5.1563941  5.75002904 4.95691948 5.63671849 5.0292048  5.36568679
 5.47343205 5.58528808 5.54246846 6.4327241  5.2310819  5.80203852
 6.53305061 5.40369685 5.72204852 4.8746533  5.36445471 5.5145

In [8]:
from sklearn.metrics import mean_squared_error

# Mean squared error (MSE) is the metric used to judge the regression model.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('在测试集上的MSE: {:.4f}'.format(mse))

在测试集上的MSE: 0.4345


In [9]:
# Score the model on the data it was trained on, so the training error can
# be compared with the test error.
y_train_pred = ols_lr.predict(X_train)
mse_train = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
print('在训练集上的MSE: {:.4f}'.format(mse_train))

在训练集上的MSE: 0.4114


模型在训练集与测试集性能相差不大,说明未发生过拟合

In [10]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error (MAE) of the test-set predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print('在测试集上的MAE: {:.4f}'.format(mae))

在测试集上的MAE: 0.4966


#### 使用 GDLinearRegression

In [11]:
# Create the gradient-descent model.
from linear_regression import GDLinearRegression
gd_lr = GDLinearRegression(
    n_iter=3000,
    eta=1e-3,
    tol=1e-5,
)

In [12]:
# This run is problematic: trained on the unscaled features with the model
# configured above, the logged loss rises instead of falling.
gd_lr.train(X_train, y_train)

0 Loss: 9.412393881658998
   1 Loss: 14.932911320314846


以上输出表明,Loss 不降反升,算法随即停止。这说明步长太大,已经"迈到对面山坡"上了。这时需要调小学习率,改用 eta = 0.0001 再试。

In [13]:
# Retry with a ten-times smaller learning rate; this actually still has a
# problem.
gd_lr = GDLinearRegression(
    n_iter=3000,
    eta=1e-4,
    tol=1e-5,
)
gd_lr.train(X_train, y_train)

 0.5301720344661321
2331 Loss: 0.5301511728514866
2332 Loss: 0.5301303205846931
2333 Loss: 0.5301094776613979
2334 Loss: 0.5300886440772495
2335 Loss: 0.5300678198278984
2336 Loss: 0.5300470049089975
2337 Loss: 0.5300261993162008
2338 Loss: 0.5300054030451654
2339 Loss: 0.52998461609155
2340 Loss: 0.5299638384510155
2341 Loss: 0.5299430701192247
2342 Loss: 0.529922311091843
2343 Loss: 0.5299015613645371
2344 Loss: 0.5298808209329762
2345 Loss: 0.5298600897928317
2346 Loss: 0.5298393679397767
2347 Loss: 0.5298186553694868
2348 Loss: 0.529797952077639
2349 Loss: 0.5297772580599133
2350 Loss: 0.5297565733119909
2351 Loss: 0.5297358978295553
2352 Loss: 0.5297152316082924
2353 Loss: 0.52969457464389
2354 Loss: 0.5296739269320376
2355 Loss: 0.5296532884684273
2356 Loss: 0.5296326592487527
2357 Loss: 0.5296120392687099
2358 Loss: 0.5295914285239971
2359 Loss: 0.5295708270103139
2360 Loss: 0.5295502347233626
2361 Loss: 0.5295296516588475
2362 Loss: 0.5295090778124746
2363 Loss: 0.5294885131799

这次虽然损失随着迭代下降了,但是迭代到3000次,算法依然没有收敛,最终损失为0.53+,距离之前用最小二乘法算出来的0.417还差很远,并且后面每次迭代的损失下降非常小.这主要是由于 X 中各个特征尺寸相差较大造成的

In [14]:
# Inspect the per-feature (column-wise) means of X.
np.mean(X, axis=0)

array([ 8.31963727,  0.52782051,  0.27097561,  2.5388055 ,  0.08746654,
       15.87492183, 46.46779237,  0.99674668,  3.3111132 ,  0.65814884,
       10.42298311])

![image-20200626221932131](http://qn-noter.yunxi.site/imagehost/81iwx.png-style1)

![image-20200626222009284](http://qn-noter.yunxi.site/imagehost/224z6.png-style1)

In [15]:
# Scale every feature of X to the same scale, then retrain.
# The scaler is fitted on the training split only.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [16]:
# Apply the fitted scaler to both the training and the test features.
X_train_std = ss.transform(X_train)
X_test_std = ss.transform(X_test)
# Show the first three standardized training rows.
X_train_std[:3]

array([[ 1.16300518, -0.56109349,  1.18960418, -0.15343992, -0.42920622,
        -0.24552277, -0.81884274, -0.40393429, -1.3585264 ,  1.32787083,
         0.9143143 ],
       [-0.08834366, -1.22357159,  0.67611587, -0.23539661, -0.65555137,
        -0.93017207, -1.09707163, -0.74654696,  0.00621419,  0.15291404,
         0.72814057],
       [-0.71401809,  1.70237334, -1.27513967,  1.56765061,  0.20456018,
        -1.02797912, -1.00432867,  0.13139801,  0.65609066, -0.5273241 ,
         0.54196685]])

In [17]:
# Show the first three standardized test rows.
X_test_std[:3]

array([[-0.82777707,  0.7638627 , -1.06974435, -0.3173533 , -0.15759205,
        -0.73455799, -0.38604224,  0.90762984,  2.41075713,  0.33843354,
        -0.76124922],
       [-0.20210265, -0.56109349,  0.47072055, -0.15343992, -0.27076462,
        -0.5389439 ,  0.07767258,  0.45259738,  0.20117713, -1.20756224,
        -0.94742294],
       [ 0.30981279, -1.22357159,  1.54904599, -0.39930999,  0.5440779 ,
         1.02596878,  1.22150247,  0.18493124, -0.31872404, -0.77468343,
        -0.66816236]])

In [18]:
# Retrain the model, this time on the scaled training features.
gd_lr = GDLinearRegression(
    n_iter=3000,
    eta=0.05,
    tol=1e-5,
)
gd_lr.train(X_train_std, y_train)

0 Loss: 31.993245084268874
   1 Loss: 28.89944144724334
   2 Loss: 26.11002262672163
   3 Loss: 23.594806091990637
   4 Loss: 21.326648554724795
   5 Loss: 19.28113018678798
   6 Loss: 17.436273814703593
   7 Loss: 15.772294717022172
   8 Loss: 14.271377312722926
   9 Loss: 12.917475565744875
  10 Loss: 11.696134371002017
  11 Loss: 10.594329551910782
  12 Loss: 9.600324404436593
  13 Loss: 8.703540979998284
  14 Loss: 7.894444518465321
  15 Loss: 7.1644396300480295
  16 Loss: 6.505776986646802
  17 Loss: 5.9114694235427505
  18 Loss: 5.375216474634894
  19 Loss: 4.891336471530726
  20 Loss: 4.454705430926804
  21 Loss: 4.060702037714348
  22 Loss: 3.7051581046344144
  23 Loss: 3.384313954357132
  24 Loss: 3.09477822763802
  25 Loss: 2.833491672616632
  26 Loss: 2.597694516141412
  27 Loss: 2.384897058893454
  28 Loss: 2.192853172614025
  29 Loss: 2.0195364104139655
  30 Loss: 1.8631184703911892
  31 Loss: 1.7219497789852907
  32 Loss: 1.5945419839891604
  33 Loss: 1.4795521682100663
 

现在 eta 大幅提高到了0.05, 经过136次迭代后算法收敛,损失约 0.4

In [20]:
# Finally, predict the test-set instances with the trained model and
# evaluate its performance. The scaled test features are used here.
y_pred = gd_lr.predict(X_test_std)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print('在测试集上的MSE: {:.4f}'.format(mse))
print('在测试集上的MAE: {:.4f}'.format(mae))

在测试集上的MSE: 0.4336
在测试集上的MAE: 0.4965
