In [1]:
import numpy as np

class Momentum:
    """SGD with momentum.

    Each call to :meth:`update` applies, per parameter key::

        v <- momentum * v - lr * grad
        W <- W + v

    Attributes:
        lr: Learning rate that scales the gradient.
        momentum: Coefficient applied to the previous velocity.
        v: Per-key velocity buffers; ``None`` until the first ``update``.
    """

    def __init__(self, lr=0.01, momentum=0.9) -> None:
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def update(self, params, grads):
        """Update ``params`` in place using the gradients in ``grads``.

        Args:
            params: Mapping from parameter name to value (presumably NumPy
                arrays -- the velocity buffers are built with
                ``np.zeros_like``). Entries are modified via ``+=``.
            grads: Mapping with the same keys as ``params`` holding the
                gradient for each parameter.
        """
        # Velocity buffers are created lazily because the parameter shapes
        # are only known once the first batch of parameters is seen.
        if self.v is None:
            self.v = {key: np.zeros_like(val) for key, val in params.items()}

        # Iterate the dict directly (dicts have no ``key()`` method).
        for key in params:
            self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
            params[key] += self.v[key]


#### Momentum 方法

$$ v \leftarrow \alpha v-\eta\frac{\partial L}{\partial W} $$
$$ W \leftarrow W+v$$

In [1]:
class AdaGrad:
    """AdaGrad optimizer: per-element learning-rate scaling.

    For every parameter key, :meth:`update` accumulates the element-wise
    squared gradient in ``h`` and divides the step by ``sqrt(h) + 1e-7``.

    Attributes:
        lr: Base learning rate.
        h: Per-key accumulators of squared gradients; ``None`` until the
            first ``update`` call.
    """

    def __init__(self, lr=0.01) -> None:
        self.lr = lr
        self.h = None

    def update(self, params, grads):
        """Apply one AdaGrad step to ``params`` using ``grads``.

        Args:
            params: Mapping from parameter name to value; each entry is
                updated through ``-=``.
            grads: Mapping with the same keys as ``params`` holding the
                gradient for each parameter.
        """
        # Accumulators are allocated on first use, shaped like the parameters.
        if self.h is None:
            self.h = {
                name: np.zeros_like(value) for name, value in params.items()
            }

        for name in params:
            grad = grads[name]
            self.h[name] += grad * grad
            # The small constant keeps the division finite while h is zero.
            denom = np.sqrt(self.h[name]) + 1e-7
            params[name] -= self.lr * grad / denom

##### AdaGrad
- 为参数的每个元素适当地调整学习率，并同时进行学习
- 即adaptive gradient
$$ h \leftarrow h+\frac{\partial L}{\partial W}\odot \frac{\partial L}{\partial W}$$
$$ W \leftarrow W-\eta\frac{1}{\sqrt{h}}\frac{\partial L}{\partial W}$$

$\odot$ 表示矩阵对应元素的乘法（逐元素相乘）