In [1]:
# study website http://karpathy.github.io/2015/05/21/rnn-effectiveness/

![种类](img/classifier.png)

- 一对一：如图片分类（固定大小的输入对应固定大小的输出）
- 一对多：如图像描述、图像中的物体识别（输入一张图片，输出一个序列）
- 多对一：如情感分析（输入一个序列，输出一个类别）
- 多对多：如机器翻译（读完整个输入序列之后再输出一个序列）
- 同步的多对多：如实时翻译（每个输入时刻都对应一个输出）

![char](img/mini-char-example.png)

 $$\begin{split}h_{t} &= \tanh(W_{hx}x_{t}+W_{hh}h_{t-1}) \\ y_{t} &= \mathrm{softmax}(W_{yh}h_{t})\end{split}$$
 - 定义好损失函数后，训练会让下一个字符对应位置的输出值尽可能大；这样在预测时，就可以取输出向量中最大值的索引作为预测字符的索引

## 交叉熵损失函数
$$ J = - \sum_{c=1}^{M}y_{c}\log(p_{c})$$
- M——类别的数量；
- y——指示变量（0或1）,如果该类别和样本的类别相同就是1，否则是0；
- p——对于观测样本属于类别c的预测概率。

### [DL 最全的优化方法](https://zhuanlan.zhihu.com/p/22252270)

![adagrad](img/adagrad.jpg)
- 采用累积平方梯度
- 优点是使得更新得更加平缓，收敛速度更快
- 缺点是平方梯度不断累积，分母越来越大，导致有效学习率持续衰减并最终趋近于 0

### mnist_实验, numpy

In [158]:
#### 定义一些参数
import numpy as np
# Optimisation settings shared by the training and evaluation cells.
batch_size = 10 ** 4      # number of samples requested from next_batch() per call
learning_rate = 0.0001    # step size of the plain gradient-descent update
# NOTE: a "unite_size" setting was stubbed out here and never assigned.
from tensorflow.examples.tutorials.mnist import input_data

In [159]:
# Load the MNIST splits from the MNIST_DATA directory with one-hot encoded labels.
mn = input_data.read_data_sets("MNIST_DATA", one_hot=True)

Extracting MNIST_DATA/train-images-idx3-ubyte.gz
Extracting MNIST_DATA/train-labels-idx1-ubyte.gz
Extracting MNIST_DATA/t10k-images-idx3-ubyte.gz
Extracting MNIST_DATA/t10k-labels-idx1-ubyte.gz


In [165]:
# Each flattened sample is reshaped to (time_step, feature_size) and fed to the
# RNN one row per time step.
time_step, feature_size = 28, 28
hidden_size = 200      # width of the recurrent hidden state
num_iterator = 100     # number of gradient steps run by the training loop

In [166]:
# def tanh(X):
#     Y = (np.exp(X) - np.exp(-X)) / (np.exp(X) + np.exp(-X))
#     return Y

In [167]:
def softmax(X):
    """Column-wise softmax of a matrix of class scores.

    The previous body, ``.5 * (1 + np.tanh(.5 * X))``, is the element-wise
    logistic sigmoid: every score was squashed independently and the class
    scores of one sample did not sum to 1. This version normalises over
    axis 0, which is the class axis for the ``[class, sample]`` logits
    produced by ``np.dot(Wy, h) + by`` in this notebook.

    Parameters
    ----------
    X : array_like
        Scores with classes along axis 0 (a 1-D vector is treated as the
        scores of a single sample).

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``X`` whose entries along axis 0 are
        non-negative and sum to 1.
    """
    X = np.asarray(X, dtype=float)
    # Subtracting the per-column maximum leaves the result unchanged but keeps
    # np.exp from overflowing on large scores.
    shifted = X - np.max(X, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=0, keepdims=True)

In [168]:
### How large should each parameter matrix be? Every weight matrix multiplies
### column vectors, so it has one row per output unit and one column per input.
### Input-to-hidden weights and bias: [hidden_size, feature_size]
Wx = np.random.randn(hidden_size, feature_size)
bx = np.zeros((hidden_size, 1))
### Hidden-to-hidden (recurrent) weights and bias: [hidden_size, hidden_size]
Wh = np.random.randn(hidden_size, hidden_size)
bh = np.zeros((hidden_size, 1))
### Hidden-to-output weights and bias: [class_size, hidden_size], with 10 classes
Wy = np.random.randn(10, hidden_size)
by = np.zeros((10, 1))
def forward(X,label):
    """Run one training step: forward pass, backprop through time, SGD update.

    Despite its name this function also computes the gradients and updates the
    module-level parameters ``Wx, bx, Wh, bh, Wy, by`` in place.

    Changes from the previous version:
      * the ``bx`` gradient summed ``dbx`` (always zero) instead of ``dht``,
        so ``bx`` was never trained;
      * hidden states are kept in a local list instead of a global ``params``
        dict that this notebook never creates;
      * the batch size is taken from ``X`` rather than the global
        ``batch_size``, so a batch of any size works;
      * ``1.0 / n`` is used so the averaging factor is a float on Python 2 too.

    Parameters
    ----------
    X : numpy.ndarray
        Batch of inputs; reshaped to ``(n, time_step, feature_size)``.
    label : numpy.ndarray
        Targets with one row per sample; the true class is taken as
        ``label.argmax(axis=1)`` (i.e. one-hot rows).

    Returns
    -------
    None
    """
    global Wx,bx,Wh,bh,Wy,by
    X = X.reshape(-1,time_step,feature_size)
    n = X.shape[0]
    if n == 0:
        # Nothing to learn from; also avoids dividing by zero below.
        return
    inv_n = 1.0 / n

    # ---- forward pass: hs[t] is the hidden state after t steps, [hidden_size, n]
    hs = [np.zeros([hidden_size, n])]
    for i in range(time_step):  ## at step i, feed the i-th row of every sample
        x = X[:,i,:]
        hs.append(np.tanh(np.dot(Wh, hs[i]) + bh + np.dot(Wx, x.transpose()) + bx))
    y = softmax(np.dot(Wy, hs[time_step]) + by)

    # ---- output layer: gradient w.r.t. the scores is (prediction - one_hot)
    d_scores = np.array(y, dtype=float)
    index = label.argmax(axis = 1)
    d_scores[index, np.arange(n)] -= 1.0
    dwy = inv_n * np.dot(d_scores, hs[time_step].transpose())
    dby = inv_n * np.sum(d_scores, axis = 1, keepdims = True)
    dht = np.dot(Wy.transpose(), d_scores)

    # ---- backprop through time
    dwh = np.zeros_like(Wh)
    dbh = np.zeros_like(bh)
    dwx = np.zeros_like(Wx)
    dbx = np.zeros_like(bx)
    for i in reversed(range(time_step)):
        x = X[:,i,:]
        # tanh'(a) = 1 - tanh(a)^2, evaluated with the state produced at step i
        dht = (1 - (hs[i + 1] ** 2)) * dht
        dwh += inv_n * np.dot(dht, hs[i].transpose())
        dbh += inv_n * np.sum(dht, axis = 1, keepdims = True)
        dwx += inv_n * np.dot(dht, x)
        dbx += inv_n * np.sum(dht, axis = 1, keepdims = True)
        # propagate to the previous hidden state
        dht = np.dot(Wh.transpose(), dht)

    # ---- plain gradient-descent update
    Wx -= learning_rate * dwx
    bx -= learning_rate * dbx
    Wh -= learning_rate * dwh
    bh -= learning_rate * dbh
    Wy -= learning_rate * dwy
    by -= learning_rate * dby
    
def predict(X):
    """Run the RNN forward pass on a batch and return the class scores.

    Reads the module-level parameters ``Wx, bx, Wh, bh, Wy, by`` without
    modifying them. The hidden state is a local variable sized from ``X``,
    so any number of samples can be scored. The previous version sized it
    from the global ``batch_size`` and wrote every state into a global
    ``params`` dict that this notebook never creates.

    Parameters
    ----------
    X : numpy.ndarray
        Batch of inputs; reshaped to ``(n, time_step, feature_size)``.

    Returns
    -------
    numpy.ndarray
        ``softmax(Wy . h_T + by)`` with one row per class and one column per
        sample.
    """
    X = X.reshape(-1,time_step,feature_size)
    h = np.zeros([hidden_size, X.shape[0]])
    for i in range(time_step):  ## at step i, feed the i-th row of every sample
        x = X[:,i,:]
        h = np.tanh(np.dot(Wh, h) + bh + np.dot(Wx, x.transpose()) + bx)
    return softmax(np.dot(Wy, h) + by)

In [169]:
# Training: every iteration draws a fresh mini-batch from the training split
# and lets forward() perform one parameter update on it.
for iterator in range(num_iterator):
    train_data, train_label = mn.train.next_batch(batch_size)
    forward(X=train_data, label=train_label)

In [170]:
# Draw batch_size samples from the test split and score them with the trained RNN.
test_x, test_y = mn.test.next_batch(batch_size)
y = predict(X=test_x)

In [172]:
# Shape of the score matrix returned by predict() for the test batch.
y.shape

(10, 10000)

In [173]:
# Accuracy: fraction of samples whose highest-scoring class (rows of y) matches
# the position of the maximum in the corresponding label row.
(y.argmax(axis=0) == test_y.argmax(axis=1)).mean()

array([False,  True, False, ..., False, False, False])