In [12]:
"""
载入需要用到的包和数据
"""
import numpy as np

In [13]:
np.random.seed(1)

In [14]:
with open("./DataSets/shakespeare.txt", 'r') as file:
    data = file.read()
print(data[0:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [15]:
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('Data has %d characters, and %d unique ones.' % (data_size, vocab_size))
char_to_index = {ch:i for i, ch in enumerate(chars)}
index_to_char = {i:ch for i, ch in enumerate(chars)}

Data has 1115394 characters, and 65 unique ones.


In [16]:
dict(list(char_to_index.items()))
dict(list(char_to_index.items())[0:10])

{'M': 0,
 'T': 1,
 'b': 2,
 'p': 3,
 'q': 4,
 'm': 5,
 'V': 6,
 'J': 7,
 'O': 8,
 'Z': 9}

In [17]:
"""
参数初始化
w_ih: 从输入层到隐藏层的权重矩阵
w_hh: 从隐藏层到隐藏层的权重矩阵
w_ho: 从隐藏层到输出层的权重矩阵
b_hh: 从隐藏层到隐藏层的横截距
b_ho: 从隐藏层到输出层的横截距
"""

hidden_size = 20
w_ih = np.random.randn(vocab_size, hidden_size)*0.01
w_hh = np.random.randn(hidden_size, hidden_size)*0.01
w_ho = np.random.randn(hidden_size, vocab_size)*0.01
b_hh = np.zeros((1, hidden_size))
b_ho = np.zeros((1, vocab_size))

In [43]:
"""
inputs, targets: 输入数据，目标数据，都是整数序列（字符编码）
prev_hidden: 隐藏层初始值
返回损失函数，参数梯度，最后一个隐藏层
"""
def lossFun(inputs, targets, prev_hidden):
    
    input_states, hidden_size, output_states = {}, {}, {}
    #hidden_states[-1] = np.copy(prev_hidden)
    hidden_states = []
    hidden_states.append(np.copy(prev_hidden))
    #hidden_states[-1] = np.copy(prev_hidden)
    loss = 0
    
    # 第一部分：正向传播算法
    for t in range(len(inputs)):
        # 字符的独热码
        input_states[t] = np.zeros((1,vocab_size))
        input_states[t][0,inputs[t]] = 1
        # 计算隐藏层的值
        hidden_states[t] = np.tanh(np.dot(input_states[t], w_ih) + (np.dot(hidden_states[t-1],w_hh) + b_hh))
        # 计算输出层加权值
        logits = np.dot(hidden_states[t],w_ho) + b_ho
        # 计算输出层的值
        output_states[t] = np.exp(logits) / np.sum(np.exp(logits))
        # 预测误差
        loss += -np.log(output_states[t][0,targets[t]])
        
    # 第二部分：反向传播算法
    grad_w_ih, grad_w_hh, grad_w_ho = np.zeros_like(w_ih), np.zeros_like(w_hh), np.zeros_like(w_ho)
    grad_b_hh, grad_b_ho = np.zeros_like(b_hh), np.zeros_like(b_ho)
    grad_hidden_next = np.zeros_like(hidden_states[0])
    for t in reversed(range(len(inputs))):
        delta_output = np.copy(output_states[t])
        delta_output[0,targets[t]] -= 1
        grad_w_ho += np.dot(hidden_states[t].T, delta_output)
        grad_b_ho += delta_output
        grad_hidden = np.dot(delta_output, w_ho.T) + grad_hidden_next
        delta_hidden = (1 - hidden_states[t]*hidden_states[t])*grad_hidden
        
        grad_b_hh += delta_hidden
        grad_w_ih += np.dot(input_states[t].T, delta_hidden)
        grad_w_hh += np.dot(hidden_states[t-1].T, delta_hidden)
        grad_hidden_next = np.dot(delta_hidden, w_hh.T)
        
    for grad_param in [grad_w_ih, grad_w_hh, grad_w_ho, grad_b_hh, grad_b_ho]:
        np.clip(grad_param, -2, 2, out=grad_param)
        
    return loss, grad_w_ih, grad_w_hh, grad_w_ho, grad_b_hh,grad_b_ho, hidden_states[len(inputs)-1]

In [44]:
# 函数sample()，使用正向传播算法，通过随机抽样得到预测字符
def sample(init_chars, n):
    """
        从模型中随机抽样，得到一个正数序列
        h是隐藏层状态
    """
    hidden = np.zeros((1, hidden_size))
    s = []
    for t in range(len(init_chars) + n):
        if t < (len(init_chars)):
            ix = char_to_index[init_chars[t]]
            input = np.zeros((1, vocab_size))
            input[0, ix] = 1
        else:
            logits = np.dot(hidden, w_ho) + b_ho
            prob = np.exp(logits) / np.sum(np.exp(logits))
            ix = np.random.choice(range(vocab_size), p=prob.ravel())
            input = np.zeros((1, vocab_size))
            input[0,ix] = 1
            
        hidden = np.tanh(np.dot(input, w_ih) + (np.dot(hidden, w_hh) + b_hh))
        s.append(ix)
        
    return s

In [None]:
epochs = 30
# 超参数设置：隐藏层长度、输入序列长度、学习步长
hidden_size, seq_length, lr = 20, 25, 0.1
seq_num = int((len(data)-1)/seq_length)

# 设置累计梯度初始值
mem_w_ih, mem_w_hh, mem_w_ho = np.zeros_like(w_ih), np.zeros_like(w_hh), np.zeros_like(w_ho)
mem_b_hh, mem_b_ho = np.zeros_like(b_hh), np.zeros_like(b_ho)

smooth_loss = -np.log(1.0/vocab_size)*seq_length

for e in range(epochs):
    prev_hidden = np.zeros((1, hidden_size))
    for i in range(seq_num):
        # 准备输入数据、输出数据
        seq_start, seq_end = i*seq_length, (i+1)*seq_length
        inputs = [char_to_index[ch] for ch in data[seq_start:seq_end]]
        targets = [char_to_index[ch] for ch in data[seq_start+1:seq_end+1]]
        
        # 调用函数lossFun，得到预测误差、参数梯度、隐藏层
        loss, grad_w_ih, grad_w_hh, grad_w_ho, grad_b_hh, grad_b_ho, prev_hidden = lossFun(inputs, targets, prev_hidden)
        smooth_loss = smooth_loss*0.999 + loss*0.001
    
        for paran, grad_param, mem in zip([w_ih, w_hh, w_ho, b_hh, b_ho],\
                                         [grad_w_ih, grad_w_hh, grad_w_ho, grad_b_hh, grad_b_ho],\
                                         [mem_w_ih, mem_w_hh, mem_w_ho, mem_b_hh, mem_b_ho]):
            mem += np.abs(grad_param)
            param -= lr * grad_param/np.sqrt(mem + le -8)
        
        # 随机抽样，得到预测误差
        if (e % 5 == 0 or e == (epochs - 1)):
            print('Epoch %d, loss: %f' % (e, smooth_loss))
            sample_ix = sample(data[0:14],200)
            s = ''.join(index_to_char[ix] for ix in sample_ix)
            print('----\n %s \n----' % (s))    