# LSTM


### 官方API

In [36]:
import torch
import torch.nn as nn

# input_size 输入序列的特征大小, seq_len：时间  hidden_size 网络的细胞状态的大小
batch_size, seq_len, input_size,hidden_size = 2,3,4,5
input = torch.randn(batch_size, seq_len, input_size)
c0 = torch.randn(batch_size,hidden_size) # 初始值，随机得到，不需要训练
h0 = torch.randn(batch_size,hidden_size) # 


# 调用官方LSTM API
lstm_layer = nn.LSTM(input_size, hidden_size,batch_first=True)
output,(h_final,c_final) = lstm_layer(input,(h0.unsqueeze(0),c0.unsqueeze(0))) # 扩充一维，变成三维

print(output)

# 可以看lstm网络有哪些权重，有哪些张量
for p,name in lstm_layer.named_parameters():  
    # print(p,name)
    print(p,name.shape)

"""
weight_ih_l0 torch.Size([20, 4]) 这里的20是5*4,4是因为w要和input进行相乘，所以是input_size
weight_hh_l0 torch.Size([20, 5]) 4是因为w要和隐含状态进行相乘，所以是hidden_size 
bias_ih_l0 torch.Size([20]) 这里的20是4*5， 4个bias
bias_hh_l0 torch.Size([20])

"""


tensor([[[-1.0193e-01,  4.7390e-01, -2.5187e-01, -4.2785e-03, -1.7096e-01],
         [-1.0270e-01,  7.0782e-02, -4.1513e-02, -1.4709e-01,  2.0677e-01],
         [-1.8238e-01,  1.3335e-01,  3.8524e-02, -1.3338e-01,  3.5900e-01]],

        [[-5.8291e-01,  4.3477e-01, -3.3951e-01, -3.2018e-01,  1.3939e-01],
         [-1.8615e-01,  2.1970e-01, -1.1651e-01, -4.0408e-01, -4.7944e-04],
         [-2.4394e-01,  2.5711e-01, -1.3780e-01, -4.6653e-02,  1.1308e-01]]],
       grad_fn=<TransposeBackward0>)
weight_ih_l0 torch.Size([20, 4])
weight_hh_l0 torch.Size([20, 5])
bias_ih_l0 torch.Size([20])
bias_hh_l0 torch.Size([20])


'\nweight_ih_l0 torch.Size([20, 4]) 这里的20是5*4,4是因为w要和input进行相乘，所以是input_size\nweight_hh_l0 torch.Size([20, 5]) 4是因为w要和隐含状态进行相乘，所以是hidden_size \nbias_ih_l0 torch.Size([20]) 这里的20是4*5， 4个bias\nbias_hh_l0 torch.Size([20])\n\n'

### 源码实现

In [37]:


def lstm_forward(input,initial_states, w_ih,w_hh,b_ih,b_hh):
    h0,c0 = initial_states 
    batch_size, seq_len, input_size = input.shape
    hidden_size = w_ih.shape[0]//4

    prev_h = h0
    prev_c = c0  # c_t

    output_size = hidden_size
    output = torch.zeros(batch_size, seq_len,output_size) # 初始化输出序列

    # w_ih的维度(4*hidden_size,input_size),w_hh的维度(4*hidden_size,hidden_size)，需要扩维，把batch_size扩充出来
    batch_w_ih = w_ih.unsqueeze(0).tile(batch_size,1,1) # 先扩充0维，再对第0维复制batch_size遍,此时w_ih的维度(batch_size, 4*hidden_size,input_size)
    batch_w_hh = w_hh.unsqueeze(0).tile(batch_size,1,1) # 此时w_hh的维度(batch_size, 4*hidden_size,hidden_size)



    for t in range(seq_len):
        x = input[:,t,:]  # 当前时刻的输入向量 (batch_size, input_size)
        # 带batch的矩阵相乘 bmm
        w_times_x = torch.bmm(batch_w_ih, x.unsqueeze(-1)) # (batch_size, 4*hidden_size, 1)
        w_times_x = w_times_x.squeeze(-1) # 把最后一维去掉 (batch_size, 4*hidden_size)

        w_times_h_prev = torch.bmm(batch_w_hh, prev_h.unsqueeze(-1)) 
        w_times_h_prev = w_times_h_prev.squeeze(-1)  #  (batch_size, 4*hidden_size)

        # 分别计算输入门(i)、遗忘门(f)、cell(g)、输出门(o)
        #  w_times_x[:,:hidden_size]取前四分之一 遗忘门取四分之一到二分之一
        i_t = torch.sigmoid(w_times_x[:,:hidden_size] + w_times_h_prev[:,:hidden_size] + b_ih[:hidden_size] + b_hh[:hidden_size]) 
        f_t = torch.sigmoid(w_times_x[:,hidden_size:hidden_size*2] + w_times_h_prev[:,hidden_size:hidden_size*2] + b_ih[hidden_size:2*hidden_size] + b_hh[hidden_size:2*hidden_size]) 
        g_t = torch.tanh(w_times_x[:,2*hidden_size:hidden_size*3] + w_times_h_prev[:,2*hidden_size:hidden_size*3] + b_ih[hidden_size*2:3*hidden_size] + b_hh[hidden_size*2:3*hidden_size]) 
        o_t = torch.sigmoid(w_times_x[:,3*hidden_size:hidden_size*4] + w_times_h_prev[:,3*hidden_size:hidden_size*4] + b_ih[hidden_size*3:4*hidden_size] + b_hh[hidden_size*3]) 
   
        prev_c = f_t * prev_c + i_t * g_t
        prev_h = o_t * torch.tanh(prev_c)

        output[:,t,:] = prev_h

    return output, (prev_h, prev_c)


custom_output, (h_final_custom, c_final_custom) = lstm_forward(input, (h0,c0), 
             lstm_layer.weight_ih_l0,lstm_layer.weight_hh_l0,
             lstm_layer.bias_ih_l0, lstm_layer.bias_hh_l0)

custom_output

        

tensor([[[-0.1019,  0.4231, -0.2686, -0.0042, -0.1567],
         [-0.1010,  0.0623, -0.0429, -0.1449,  0.1853],
         [-0.1797,  0.1242,  0.0420, -0.1281,  0.3195]],

        [[-0.5829,  0.3668, -0.3588, -0.3171,  0.1132],
         [-0.1831,  0.1788, -0.1254, -0.4000, -0.0014],
         [-0.2320,  0.2242, -0.1408, -0.0470,  0.0979]]], grad_fn=<CopySlices>)

# LSTM Projection

对hidden_size进行压缩

### 官方API

In [38]:
import torch
import torch.nn as nn

# input_size 输入序列的特征大小, seq_len：时间  hidden_size 网络的细胞状态的大小
batch_size, seq_len, input_size,hidden_size = 2,3,4,5
proj_size = 3
input = torch.randn(batch_size, seq_len, input_size)
c0 = torch.randn(batch_size,hidden_size) # 初始值，随机得到，不需要训练
h0 = torch.randn(batch_size,proj_size) # 改



# 调用官方LSTM API
lstm_layer_p = nn.LSTM(input_size, hidden_size,batch_first=True, proj_size=proj_size)
output,(h_final,c_final) = lstm_layer_p(input,(h0.unsqueeze(0),c0.unsqueeze(0))) # 扩充一维，变成三维

print(output)
print(output.shape)

# 可以看lstm网络有哪些权重，有哪些张量
for p,name in lstm_layer_p.named_parameters():  
    # print(p,name)
    print(p,name.shape)

"""
weight_ih_l0 torch.Size([20, 4]) 这里的20是5*4,4是因为w要和input进行相乘，所以是input_size
weight_hh_l0 torch.Size([20, 5]) 4是因为w要和隐含状态进行相乘，所以是hidden_size 
bias_ih_l0 torch.Size([20]) 这里的20是4*5， 4个bias
bias_hh_l0 torch.Size([20])
weight_hr_l0 torch.Size([3, 5])  对Hiddenstate进行压缩的参数
"""






tensor([[[ 0.3685, -0.1899,  0.0366],
         [ 0.2518, -0.1133, -0.0095],
         [ 0.1749, -0.1180, -0.0171]],

        [[-0.1184, -0.1185,  0.0494],
         [-0.0045, -0.1582, -0.0012],
         [-0.0664, -0.1176, -0.0316]]], grad_fn=<TransposeBackward0>)
torch.Size([2, 3, 3])
weight_ih_l0 torch.Size([20, 4])
weight_hh_l0 torch.Size([20, 3])
bias_ih_l0 torch.Size([20])
bias_hh_l0 torch.Size([20])
weight_hr_l0 torch.Size([3, 5])


'\nweight_ih_l0 torch.Size([20, 4]) 这里的20是5*4,4是因为w要和input进行相乘，所以是input_size\nweight_hh_l0 torch.Size([20, 5]) 4是因为w要和隐含状态进行相乘，所以是hidden_size \nbias_ih_l0 torch.Size([20]) 这里的20是4*5， 4个bias\nbias_hh_l0 torch.Size([20])\nweight_hr_l0 torch.Size([3, 5])  对Hiddenstate进行压缩的参数\n'

### projection 源码实现

In [None]:
def lstm_forward_proj(input,initial_states, w_ih,w_hh,b_ih,b_hh, w_hr=None):
    h0,c0 = initial_states 
    batch_size, seq_len, input_size = input.shape
    hidden_size = w_ih.shape[0]//4

    prev_h = h0
    prev_c = c0  # c_t

    # w_ih的维度(4*hidden_size,input_size),w_hh的维度(4*hidden_size,hidden_size)，需要扩维，把batch_size扩充出来
    batch_w_ih = w_ih.unsqueeze(0).tile(batch_size,1,1) # 先扩充0维，再对第0维复制batch_size遍,此时w_ih的维度(batch_size, 4*hidden_size,input_size)
    batch_w_hh = w_hh.unsqueeze(0).tile(batch_size,1,1) # 此时w_hh的维度(batch_size, 4*hidden_size,hidden_size)


    if w_hr is not None:
        proj_size, _ = w_hr.shape
        output_size = proj_size
        batch_w_hr = w_hr.unsqueeze(0).tile(batch_size,1,1)
    else:
        output_size = hidden_size

    output = torch.zeros(batch_size, seq_len,output_size) # 初始化输出序列


    for t in range(seq_len):
        x = input[:,t,:]  # 当前时刻的输入向量 (batch_size, input_size)
        # 带batch的矩阵相乘 bmm
        w_times_x = torch.bmm(batch_w_ih, x.unsqueeze(-1)) # (batch_size, 4*hidden_size, 1)
        w_times_x = w_times_x.squeeze(-1) # 把最后一维去掉 (batch_size, 4*hidden_size)

        w_times_h_prev = torch.bmm(batch_w_hh, prev_h.unsqueeze(-1)) 
        w_times_h_prev = w_times_h_prev.squeeze(-1)  #  (batch_size, 4*hidden_size)

        # 分别计算输入门(i)、遗忘门(f)、cell(g)、输出门(o)
        #  w_times_x[:,:hidden_size]取前四分之一 遗忘门取四分之一到二分之一
        i_t = torch.sigmoid(w_times_x[:,:hidden_size] + w_times_h_prev[:,:hidden_size] + b_ih[:hidden_size] + b_hh[:hidden_size]) 
        f_t = torch.sigmoid(w_times_x[:,hidden_size:hidden_size*2] + w_times_h_prev[:,hidden_size:hidden_size*2] + b_ih[hidden_size:2*hidden_size] + b_hh[hidden_size:2*hidden_size]) 
        g_t = torch.tanh(w_times_x[:,2*hidden_size:hidden_size*3] + w_times_h_prev[:,2*hidden_size:hidden_size*3] + b_ih[hidden_size*2:3*hidden_size] + b_hh[hidden_size*2:3*hidden_size]) 
        o_t = torch.sigmoid(w_times_x[:,3*hidden_size:hidden_size*4] + w_times_h_prev[:,3*hidden_size:hidden_size*4] + b_ih[hidden_size*3:4*hidden_size] + b_hh[hidden_size*3]) 
   
        prev_c = f_t * prev_c + i_t * g_t
        prev_h = o_t * torch.tanh(prev_c) #  [batch_size, hidden_size]

        if w_hr is not None:  # 对prev_h进行压缩
            prev_h = torch.bmm(batch_w_hr, prev_h.unsqueeze(-1))
            prev_h = prev_h.squeeze(-1) # [batch_size, proj_sieze ]
            # print(prev_h.shape)

        output[:,t,:] = prev_h

    return output, (prev_h, prev_c)


custom_output, (h_final_custom, c_final_custom) = lstm_forward_proj(input, (h0,c0), 
             lstm_layer_p.weight_ih_l0,lstm_layer_p.weight_hh_l0,
             lstm_layer_p.bias_ih_l0, lstm_layer_p.bias_hh_l0,
             lstm_layer_p.weight_hr_l0)

custom_output

torch.Size([2, 3])
torch.Size([2, 3])
torch.Size([2, 3])


tensor([[[ 0.3719, -0.2033,  0.0343],
         [ 0.2523, -0.1192, -0.0108],
         [ 0.1727, -0.1119, -0.0173]],

        [[-0.0977, -0.1338,  0.0609],
         [ 0.0043, -0.1583,  0.0062],
         [-0.0611, -0.1174, -0.0264]]], grad_fn=<CopySlices>)