In [2]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler 
# Show which NumPy version the kernel is running.
print(np.__version__)

1.24.2


维基百科定义: 梯度下降是一种用于寻找可微函数局部最小值的一阶迭代优化算法。  
我们的定义: 梯度下降是机器学习和深度学习中常用的一种迭代技术。它从一个初始猜测(通常是随机的)出发,针对给定的模型、数据点和损失函数,逐步找到尽可能好的参数/系数。
# 可视化
## 模型
从最简单的模型入手 -- 简单线性回归  
$ y = b + wx + \epsilon $  
在这个模型里,我们用 feature(x) 来预测 label(y)。参数 b 为 bias(intercept,截距),参数 w 为 weight(slope,斜率),$\epsilon$ 为 noise(error,噪声)。  
我们可以很快构造出这样的一个模型结构，比如  
    工资 = 底薪 + 年增长 * 工作年限 + noise  
## 数据生成
取b=1,w=2,用np.random生成随机误差$\epsilon$

In [None]:
# Ground-truth parameters of the synthetic model y = b + w * x + noise.
true_b = 1
true_w = 2
N = 100  # number of synthetic data points

# Seed the global NumPy RNG so the generated data is reproducible.
np.random.seed(42)
x = np.random.rand(N, 1)              # features, shape (N, 1)
epsilon = .1 * np.random.randn(N, 1)  # standard-normal noise scaled by 0.1
y = true_b + true_w * x + epsilon     # labels

# Show the shapes and only the first few rows of each array instead of
# printing all 3 * N values.
print(x.shape, epsilon.shape, y.shape)
print(x[:5], epsilon[:5], y[:5], sep="\n")


接下来先打乱数据的索引,再按八二开划分为训练数据集和验证(测试)数据集。

In [17]:
# Shuffle the row indices in place, then cut them 80/20 into a training part
# and a validation part.
idx = np.arange(N)
np.random.shuffle(idx)
train_idx, val_idx = idx[:int(N * .8)], idx[int(N * .8):]

# Select the matching rows of features and labels for each split.
x_train, x_val = x[train_idx], x[val_idx]
y_train, y_val = y[train_idx], y[val_idx]
x_train


array([[0.29122914],
       [0.54671028],
       [0.66252228],
       [0.73199394],
       [0.60111501],
       [0.37454012],
       [0.5612772 ],
       [0.0884925 ],
       [0.06355835],
       [0.51423444],
       [0.18485446],
       [0.63755747],
       [0.80219698],
       [0.72900717],
       [0.18340451],
       [0.07455064],
       [0.12203823],
       [0.28093451],
       [0.61185289],
       [0.4937956 ],
       [0.02058449],
       [0.11586906],
       [0.72960618],
       [0.77127035],
       [0.43194502],
       [0.25877998],
       [0.92187424],
       [0.96990985],
       [0.38867729],
       [0.10789143],
       [0.94888554],
       [0.03438852],
       [0.86617615],
       [0.81546143],
       [0.76078505],
       [0.36636184],
       [0.89482735],
       [0.60754485],
       [0.83244264],
       [0.19598286],
       [0.52006802],
       [0.9093204 ],
       [0.52273283],
       [0.70807258],
       [0.95071431],
       [0.13949386],
       [0.71324479],
       [0.706

## Step0 - 随机初始化
现实生活中,b、w这些参数的真实值是永远不会知道的。(如果知道了,我们就不需要大费周章地训练模型了) 为了训练模型,我们需要随机初始化parameters/weights(b/w)。

In [16]:
# Start from a random guess: one standard-normal draw per parameter
# (b is drawn first, then w).
b, w = np.random.randn(1), np.random.randn(1)
print(b, w)

[0.76284729] [0.6379667]


## Step1 - 计算Prediction
一开始，由于b,w随机生成，我们会得到bad prediction

In [18]:
# Forward pass: the model's output for every training point under the
# current values of b and w.
yhat = w * x_train + b
yhat

array([[0.94864178],
       [1.11163024],
       [1.18551445],
       [1.22983505],
       [1.14633865],
       [1.00179141],
       [1.12092345],
       [0.81930256],
       [0.8033954 ],
       [1.09091174],
       [0.88077828],
       [1.16958773],
       [1.27462225],
       [1.22792959],
       [0.87985326],
       [0.81040812],
       [0.84070362],
       [0.94207415],
       [1.15318906],
       [1.07787244],
       [0.77597951],
       [0.83676789],
       [1.22831174],
       [1.25489209],
       [1.03841383],
       [0.9279403 ],
       [1.35097235],
       [1.38161748],
       [1.01081046],
       [0.83167843],
       [1.36820466],
       [0.78478602],
       [1.31543883],
       [1.28308453],
       [1.24820282],
       [0.99657395],
       [1.33371734],
       [1.15044067],
       [1.29391797],
       [0.88787783],
       [1.09463337],
       [1.34296343],
       [1.09633343],
       [1.21457402],
       [1.36937136],
       [0.85183973],
       [1.21787371],
       [1.213

## Step2 - 计算Loss
loss和error不一样，error用于计算单个点的误差(真实值(label)和预测值的差异)，比如第i个点的error  
$ error_i = \hat y_i - y_i $  
loss,则是这些误差的聚合。
对于回归问题，loss=mean squared error(MSE)  
$MSE=\frac{1}{n}\sum\limits_{i=1}^nerror_i^2=\frac{1}{n}\sum\limits_{i=1}^n(\hat y_i-y_i)^2=\frac{1}{n}\sum\limits_{i=1}^n(b+wx_i-y_i)^2$ 

In [20]:
# Per-sample residuals: prediction minus label.
error = yhat - y_train
# MSE loss: the mean of the squared residuals.
loss = (error ** 2).mean()
print(loss)

0.9712093272791501


In [22]:
# Build a 101 x 101 grid of candidate (b, w) pairs centred on the true
# parameters and spanning 3 units to either side of each.
b_range = np.linspace(true_b - 3, true_b + 3, num=101)
w_range = np.linspace(true_w - 3, true_w + 3, num=101)
# meshgrid expands the two 1-D ranges into matching 2-D coordinate arrays.
bs, ws = np.meshgrid(b_range, w_range)
ws

array([[-1.  , -1.  , -1.  , ..., -1.  , -1.  , -1.  ],
       [-0.94, -0.94, -0.94, ..., -0.94, -0.94, -0.94],
       [-0.88, -0.88, -0.88, ..., -0.88, -0.88, -0.88],
       ...,
       [ 4.88,  4.88,  4.88, ...,  4.88,  4.88,  4.88],
       [ 4.94,  4.94,  4.94, ...,  4.94,  4.94,  4.94],
       [ 5.  ,  5.  ,  5.  , ...,  5.  ,  5.  ,  5.  ]])

In [25]:
# Predictions of every (b, w) grid point for a single training sample.
dummy_x = x_train[0]
dummy_yhat = bs + ws * dummy_x

# The same computation for all training samples at once: reshaping x_train to
# (n_samples, 1, 1) lets it broadcast against the 2-D grids, giving one grid
# of predictions per sample without a Python-level loop over the rows.
all_predictions = bs + ws * x_train.reshape(-1, 1, 1)
all_predictions

array([[[-2.29122914, -2.23122914, -2.17122914, ...,  3.58877086,
          3.64877086,  3.70877086],
        [-2.27375539, -2.21375539, -2.15375539, ...,  3.60624461,
          3.66624461,  3.72624461],
        [-2.25628164, -2.19628164, -2.13628164, ...,  3.62371836,
          3.68371836,  3.74371836],
        ...,
        [-0.5788018 , -0.5188018 , -0.4588018 , ...,  5.3011982 ,
          5.3611982 ,  5.4211982 ],
        [-0.56132805, -0.50132805, -0.44132805, ...,  5.31867195,
          5.37867195,  5.43867195],
        [-0.5438543 , -0.4838543 , -0.4238543 , ...,  5.3361457 ,
          5.3961457 ,  5.4561457 ]],

       [[-2.54671028, -2.48671028, -2.42671028, ...,  3.33328972,
          3.39328972,  3.45328972],
        [-2.51390766, -2.45390766, -2.39390766, ...,  3.36609234,
          3.42609234,  3.48609234],
        [-2.48110505, -2.42110505, -2.36110505, ...,  3.39889495,
          3.45889495,  3.51889495],
        ...,
        [ 0.66794616,  0.72794616,  0.78794616, ...,  

In [28]:
# Give the labels two trailing singleton axes so that they broadcast against
# the per-sample prediction grids.
all_labels = y_train.reshape((-1, 1, 1))
all_labels.shape

(80, 1, 1)

In [29]:
# Residual of every grid prediction against its sample's label; the labels
# broadcast across each sample's grid.
all_errors = all_predictions - all_labels
all_errors.shape

(80, 101, 101)

In [30]:
# Average the squared errors over the sample axis, leaving one MSE value per
# (b, w) grid point.
all_losses = (all_errors * all_errors).mean(axis=0)
all_losses.shape

(101, 101)

## Step3 - 计算梯度
梯度是由损失函数对各个参数的偏导数组成的向量;这里有 b 和 w 两个参数,因此需要计算两个偏导数。
> Gradient = how much the loss changes if ONE parameter changes a bit!

\begin{align} \frac{\partial MSE}{\partial b} &= 2\frac{1}{n}\sum\limits_{i=1}^n(\hat y_i-y_i) \\ 
              \frac{\partial MSE}{\partial w} &= 2\frac{1}{n}\sum\limits_{i=1}^nx_i(\hat y_i-y_i) \end{align}

In [31]:
# Partial derivatives of the MSE with respect to b and to w.
b_grad = 2 * error.mean()
w_grad = 2 * (error * x_train).mean()
print(b_grad, w_grad)

-1.7960531165831015 -1.0982700796216076


## Step4 - 更新参数

\begin{align} b &= b - \eta\frac{\partial MSE}{\partial b} \\ 
w &= w - \eta\frac{\partial MSE}{\partial w} \end{align}

In [32]:
# Learning rate: the factor applied to each gradient before stepping.
lr = 0.1
print(b, w)  # parameters before the update
# Move both parameters against their gradients.
b, w = b - lr * b_grad, w - lr * w_grad
print(b, w)  # parameters after the update

[0.76284729] [0.6379667]
[0.9424526] [0.74779371]


In [None]:
# Standardize the feature to zero mean and unit standard deviation.
# The scaler is fitted on the training split only and then applied to both
# the training and the validation split.
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(x_train)
scaled_x_train, scaled_x_val = (scaler.transform(split) for split in (x_train, x_val))

## Step5 - 漂洗和重复
> An epoch is complete whenever every point in the training set(N) has already been used in all steps: forward pass, computing loss, computing gradients, and updating parameters.

batch, mini-batch, stochastic gradient descent