# 1. RNN概念
计算公式:
$$h_t = \tanh(W_{xh} \cdot X + W_{hh} \cdot h_{t-1} + b_h)$$
$$o_t = W_{hq} \cdot h_t + b_q$$
其中,X是(bs,emb),h为(bs,hiddens),b_h为(hiddens,)并通过广播相加,隐藏层输出h_t为(bs,hiddens),输出层o_t为(bs,emb)
定义参数:

In [5]:
import torch
from torch import nn

def get_params(emb_size, num_hiddens):
    """Create the trainable tensors of a single-layer RNN written from scratch.

    Both the input width and the output width equal ``emb_size``.

    Args:
        emb_size: Size of each input (and output) feature vector.
        num_hiddens: Size of the hidden state.

    Returns:
        A list ``[W_hh, W_hq, W_xh, b_h, b_q]`` whose tensors all have
        ``requires_grad`` enabled. Shapes are ``(h, h)``, ``(h, emb)``,
        ``(emb, h)``, ``(h,)`` and ``(emb,)`` respectively.
    """
    num_inputs = emb_size
    num_outputs = emb_size

    def normal(shape):
        # Standard-normal samples scaled by 0.01, i.e. mean 0 and std 0.01.
        return torch.randn(shape).mul(0.01)

    # Hidden-layer parameters.
    W_xh = normal((num_inputs, num_hiddens))  # (emb, h)
    W_hh = normal((num_hiddens, num_hiddens))  # (h, h)
    b_h = torch.zeros(num_hiddens)

    # Output-layer parameters.
    W_hq = normal((num_hiddens, num_outputs))  # (h, emb)
    b_q = torch.zeros(num_outputs)

    # requires_grad_ works in place and returns the tensor itself, so the
    # list holds the very tensors created above, now tracked by autograd.
    return [
        tensor.requires_grad_(True)
        for tensor in (W_hh, W_hq, W_xh, b_h, b_q)
    ]

定义初始 ```H_0``` :  形状为(bs,h)

In [6]:
def init_rnn_state(batch_size, num_hiddens):
    """Return the all-zero initial hidden state H_0, shape (batch_size, num_hiddens)."""
    state_shape = (batch_size, num_hiddens)
    return torch.zeros(state_shape)

计算步骤: 此时送进去的是(T,bs,emb),最后得到的是(T*bs,emb)
$$h_t = \tanh(W_{xh} \cdot X + W_{hh} \cdot h_{t-1} + b_h)$$
$$o_t = W_{hq} \cdot h_t + b_q$$

In [7]:
def rnn(inputs, state, params):
    """Run a vanilla RNN over a sequence, one time step at a time.

    Args:
        inputs: Sequence iterated along its first axis; expected shape
            (T, bs, emb), so each step sees a (bs, emb) slice.
        state: Initial hidden state, expected shape (bs, hiddens).
        params: ``[W_hh, W_hq, W_xh, b_h, b_q]`` in exactly that order.

    Returns:
        A tuple ``(outputs, H)``: the per-step outputs concatenated along
        dim 0, shape (T*bs, emb), and the hidden state after the last
        step, shape (bs, hiddens).
    """
    W_hh, W_hq, W_xh, b_h, b_q = params
    hidden = state
    step_outputs = []
    for x_t in inputs:  # x_t: (bs, emb)
        # Hidden update: (bs, emb) --> (bs, hiddens).
        pre_activation = x_t @ W_xh + hidden @ W_hh + b_h
        hidden = torch.tanh(pre_activation)
        # Output projection: (bs, hiddens) --> (bs, emb).
        step_outputs.append(hidden @ W_hq + b_q)
    # Stack every step's output along dim 0: (T*bs, emb).
    return torch.cat(step_outputs, dim=0), hidden

构造Model:  
先将X进行emb嵌入，最终再映射回词表的长度。

In [8]:
class RNNModel:
    """Wrapper that turns a from-scratch RNN step function into a token model.

    Token indices are embedded, fed through ``forward_fn`` and finally
    projected to ``vocab_size`` logits by a linear layer.

    NOTE: ``self.params`` holds only what ``get_params`` returns; the
    embedding and linear layers keep their own parameters separately.
    """

    def __init__(
        self, vocab_size, emb_size, num_hiddens, get_params, init_state, forward_fn):
        """Store the configuration and build the parameters and layers.

        Args:
            vocab_size: Number of distinct token indices.
            emb_size: Width of the token embeddings.
            num_hiddens: Width of the hidden state.
            get_params: Callable ``(emb_size, num_hiddens) -> params``.
            init_state: Callable ``(batch_size, num_hiddens) -> state``.
            forward_fn: Callable ``(inputs, state, params) -> (outputs, state)``.
        """
        self.vocab_size = vocab_size
        self.num_hiddens = num_hiddens
        self.params = get_params(emb_size, num_hiddens)
        self.init_state = init_state
        self.forward_fn = forward_fn
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.Linear = nn.Linear(emb_size, vocab_size)

    def __call__(self, X, state):
        """Embed ``X``, run the recurrent function and project to logits.

        ``X`` is transposed before embedding so that the time axis comes
        first, which is the layout ``forward_fn`` iterates over.
        """
        embedded = self.embedding(X.T).float()
        outputs, H = self.forward_fn(embedded, state, self.params)
        logits = self.Linear(outputs)
        return logits, H

    def begin_state(self, batch_size):
        """Return the initial state produced by ``init_state`` for this batch size."""
        return self.init_state(batch_size, self.num_hiddens)

In [9]:
# Hyper-parameters for the demo.
num_hiddens = 512
vocab_size = 28
emb_size = 20

# Toy batch of token indices: bs=2, T=5.
X = torch.arange(10).reshape((2, 5))
net = RNNModel(
    vocab_size, emb_size, num_hiddens, get_params, init_rnn_state, rnn
)

state = net.begin_state(X.shape[0])
# net takes (bs, T) indices and returns (T*bs, vocab) logits together
# with the final hidden state of shape (bs, hiddens).
Y, new_state = net(X, state)
print(Y.shape, len(new_state), new_state.shape)

"""
torch.Size([10, 28]) 2 torch.Size([2, 512])
"""

torch.Size([10, 28]) 2 torch.Size([2, 512])


'\ntorch.Size([10, 28]) 2 torch.Size([2, 512])\n'

## 1.2 调用PyTorch内置的nn.RNN(emb, num_hiddens)

In [10]:
# Same toy batch as above, but using PyTorch's built-in recurrent layer.
emb = nn.Embedding(vocab_size, emb_size)
net2 = nn.RNN(emb_size, num_hiddens)
# The initial hidden state has a leading layer axis (1 here). Its batch
# axis is taken from X rather than hard-coded, matching how the
# from-scratch model's state is created.
state2 = torch.zeros((1, X.shape[0], num_hiddens))
Y, new_state = net2(emb(X.T), state2)  # (T, bs, emb) --> (T, bs, hiddens)
print(Y.shape, len(new_state), new_state.shape)

"""
torch.Size([5, 2, 512]) 1 torch.Size([1, 2, 512])
"""

torch.Size([5, 2, 512]) 1 torch.Size([1, 2, 512])


'\ntorch.Size([5, 2, 512]) 1 torch.Size([1, 2, 512])\n'

- Y是每一个时间步的隐状态H，相当于从零实现的rnn中全部的```H_t```！！！  
- state是最后一个时间步的。  
- torch里面的rnnlayer只包括隐藏层，不包括输出层。所以调用rnn_layer的时候，要构造一个输出层Linear.如下图:  
  
  <center>

![img](./pic/RNN.png)
</center>

- 得到输出之后为(T,bs,hiddens),相当于H,此时再接一个nn.Linear(h,vocab),再reshape,得到最终的(T*bs,vocab)
- nn.RNN的第一个参数是输入X最后一维(即emb)的大小。比如使用nn.Embedding(vocab,emb)将X从(bs,T)嵌入到(bs,T,emb)后，填入nn.RNN的第一个参数就是emb的大小！！！
- 对于nn.Embedding(vocab,emb): 
  - nn.Embedding(vocab,emb)不是Linear,vocab表示的是索引的最大值为vocab-1,输入进去的还都是单词的索引。