文本生成的实现

In [2]:
import cppath

In [6]:
import numpy as np
from Chap6 import RnnLM
from Chap6 import BetterRnnLM
from common.functions import softmax

class RnnlmGen(RnnLM):
    """Language model that can sample a word-ID sequence from its own predictions."""

    def generate(self, start_id, skip_ids=None, sample_size=100):
        """Sample a sequence of word IDs, one word at a time.

        Args:
            start_id: ID of the first word; it is always the first element
                of the returned list.
            skip_ids: optional collection of word IDs that must never be
                emitted (e.g. the pre-processed PTB tokens ``<unk>`` / ``N``).
                ``None`` disables filtering.
            sample_size: total length of the returned list, including
                ``start_id``.

        Returns:
            list of word IDs. Every sampled entry is a plain Python ``int``.
        """
        word_ids = [start_id]

        x = start_id
        while len(word_ids) < sample_size:
            # predict() works on mini-batches, so even a single word ID is
            # fed as a 1x1 array (batch size 1, one time step).
            x = np.array(x).reshape(1, 1)
            score = self.predict(x)      # raw score for every word
            p = softmax(score.flatten())  # normalise into a probability distribution

            # np.random.choice(..., size=1) returns a shape-(1,) array. Pull
            # the scalar out explicitly: calling int() on an array with
            # ndim > 0 is deprecated since NumPy 1.25.
            sampled = int(np.random.choice(len(p), size=1, p=p)[0])

            # A rejected sample leaves x unchanged, so the next iteration
            # calls predict() again with the same input word.
            if (skip_ids is None) or (sampled not in skip_ids):
                x = sampled
                word_ids.append(sampled)
        return word_ids

生成文本

In [8]:
from dataset import ptb

# Load the PTB training split together with its word <-> ID lookup tables.
corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus_size = len(corpus)
vocab_size = len(word_to_id)

# No saved parameters are loaded in this cell.
model = RnnlmGen()

# Choose the start word and the words that must never be sampled.
start_word = 'you'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = list(map(word_to_id.__getitem__, skip_words))

# Generate text: generate() returns a list of word IDs.
word_ids = model.generate(start_id, skip_ids)
# Map the IDs back to words and join them with single spaces.
txt = ' '.join(id_to_word[word_id] for word_id in word_ids)
# Render each end-of-sentence marker as a full stop followed by a newline.
txt = txt.replace(' <eos>', '.\n')
print(txt + '.')

you named essentially sells curtailed marriage unique wages andreas investments sheets citing concrete gridlock demonstrate beverage chuck sitting xtra feed delayed portable foreseeable microprocessors battled rumored reducing legal raised stress parental mega-issues dr. beyond connolly solved outfit p&g fired competent admitting greatest pretrial trump riding dinkins syndicated feb. navigation overbuilt reverse newsletters equipment much environmentalists hacker eroded cancel tentatively suddenly sung termination elections complex cautious outlook ferdinand cans wellington appointments afterward opinions stayed reminded intimate cafeteria turnaround loath carson shelter towers new amazing clear borough stakes kangyo tremendous programming pesticides ranking bonuses deviation lawsuits permitted voter guards lloyd fat themselves.


使用更好的权重来进行文本生成

In [9]:
from dataset import ptb

# Load the PTB training split together with its word <-> ID lookup tables.
corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus_size = len(corpus)
vocab_size = len(word_to_id)

# Build the generator and restore parameters saved in 'Rnnlm.pkl'.
model = RnnlmGen()
model.load_params('Rnnlm.pkl')

# Choose the start word and the words that must never be sampled.
start_word = 'you'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = list(map(word_to_id.__getitem__, skip_words))

# Generate text: generate() returns a list of word IDs.
word_ids = model.generate(start_id, skip_ids)
# Map the IDs back to words and join them with single spaces.
txt = ' '.join(id_to_word[word_id] for word_id in word_ids)
# Render each end-of-sentence marker as a full stop followed by a newline.
txt = txt.replace(' <eos>', '.\n')
print(txt + '.')

you classes tires easy gates always nashua insurance bipartisan growing avon embarrassing unspecified face municipalities acceptance merchandise portfolios trader diesel customers feed newspaper talks owner ag followed redemption expire links pricing guinea fluor adults speaking beginning equaling hair decent analysis ehrlich ai memories evenly offer minimal gillette constitution searching scams climb root apparel ted d.c. access cabernet owns vegas kemp screens norwegian riskier carl kobe jr. donaldson pearce thief revco concentrating across differences beam ltd. embarrassment mac caller asarco pick winners bart go fournier agricultural fibers fertilizer shell consideration measures advent getting disabilities prior casting omitted dinner shore one-time gillette.


seq2seq处理加法数据集

In [10]:
from dataset import sequence
# load_data reads the given text file, converts it to character IDs and
# returns the training data and the test data.
train_pair, test_pair = sequence.load_data('addition.txt', seed=1984)
x_train, t_train = train_pair
x_test, t_test = test_pair

# get_vocab returns the character <-> ID mapping dictionaries.
char_to_id, id_to_char = sequence.get_vocab()

# Array shapes of the training and test splits.
print(x_train.shape, t_train.shape)
print(x_test.shape, t_test.shape)

# First training sample as raw character IDs ...
first_question, first_answer = x_train[0], t_train[0]
print(first_question)
print(first_answer)

# ... and decoded back into text.
print(''.join(id_to_char[char_id] for char_id in first_question))
print(''.join(id_to_char[char_id] for char_id in first_answer))

(45000, 7) (45000, 5)
(5000, 7) (5000, 5)
[ 3  0  2  0  0 11  5]
[ 6  0 11  7  5]
71+118 
_189 


seq2seq的实现

In [11]:
class Encoder:
    """seq2seq encoder: an embedding layer followed by an LSTM.

    forward() returns only the LSTM hidden state at the last time step.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the layers and their initial weights.

        Args:
            vocab_size: number of distinct characters (vocabulary size).
            wordvec_size: dimension of each character vector.
            hidden_size: dimension of the LSTM hidden state.
        """
        # Imported here so the class works on a fresh kernel: no earlier cell
        # of the notebook brings these layer classes into scope.
        from common.time_layers import TimeEmbedding, TimeLSTM

        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')

        self.embed = TimeEmbedding(embed_W)
        # stateful=False: the LSTM hidden state does not need to be kept
        # between forward() calls.
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

        self.params = self.embed.params + self.lstm.params
        self.grads = self.embed.grads + self.lstm.grads
        self.hs = None  # LSTM outputs of the last forward(); read by backward()

    def forward(self, xs):
        """Encode a batch of ID sequences.

        Args:
            xs: input passed to the embedding layer
                (assumed to be an (N, T) array of IDs -- TODO confirm).

        Returns:
            The slice ``hs[:, -1, :]`` of the LSTM output, i.e. the hidden
            state at the final time step.
        """
        xs = self.embed.forward(xs)
        hs = self.lstm.forward(xs)
        self.hs = hs
        # Only the last time step of the LSTM output is exposed to the caller.
        return hs[:, -1, :]

    def backward(self, dh):
        """Back-propagate the gradient of the value returned by forward().

        Args:
            dh: gradient with respect to the last hidden state.

        Returns:
            Whatever the embedding layer's backward() returns.
        """
        # Earlier time steps were never exposed, so their upstream gradient
        # is zero; only the last time step receives dh.
        dhs = np.zeros_like(self.hs)
        dhs[:, -1, :] = dh
        dout = self.lstm.backward(dhs)
        dout = self.embed.backward(dout)
        return dout

Decoder层的实现

In [22]:
class Decoder:
    """seq2seq decoder: embedding -> LSTM -> affine, seeded with the encoder state."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the layers and their initial weights.

        Args:
            vocab_size: number of distinct characters (vocabulary size).
            wordvec_size: dimension of each character vector.
            hidden_size: dimension of the LSTM hidden state.
        """
        # Imported here so the class works on a fresh kernel: no earlier cell
        # of the notebook brings these layer classes into scope.
        from common.time_layers import TimeEmbedding, TimeLSTM, TimeAffine

        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        # stateful=True: the state installed via set_state() is kept by the LSTM.
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs, h):
        """Training-time forward pass.

        Args:
            xs: decoder input IDs, passed to the embedding layer.
            h: hidden state used to initialise the LSTM.

        Returns:
            The scores produced by the affine layer.
        """
        self.lstm.set_state(h)
        out = self.embed.forward(xs)
        out = self.lstm.forward(out)
        score = self.affine.forward(out)
        return score

    def backward(self, dscore):
        """Back-propagate through affine, LSTM and embedding.

        Args:
            dscore: gradient with respect to the scores returned by forward().

        Returns:
            ``self.lstm.dh`` as it stands after the LSTM backward pass.
        """
        dout = self.affine.backward(dscore)
        dout = self.lstm.backward(dout)
        dout = self.embed.backward(dout)
        dh = self.lstm.dh
        return dh

    def generate(self, h, start_id, sample_size):
        """Generation-time decoding: greedily pick the best ID at every step.

        Args:
            h: hidden state used to initialise the LSTM.
            start_id: ID fed to the decoder at the first step.
            sample_size: number of IDs to generate.

        Returns:
            list of ``sample_size`` Python ints (``start_id`` is not included).
        """
        sampled = []
        sample_id = start_id
        self.lstm.set_state(h)

        for _ in range(sample_size):
            # One ID per step, shaped as a 1x1 batch.
            x = np.array(sample_id).reshape((1, 1))
            out = self.embed.forward(x)
            out = self.lstm.forward(out)
            score = self.affine.forward(out)

            # Deterministic choice: the highest-scoring ID becomes the next input.
            sample_id = np.argmax(score.flatten())
            sampled.append(int(sample_id))

        return sampled

seq2seq类

In [16]:
from common.base_model import BaseModel
class Seq2seq(BaseModel):
    """Encoder-decoder model trained with a time-distributed softmax loss."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Build the encoder, the decoder and the loss layer.

        Args:
            vocab_size: number of distinct characters (vocabulary size).
            wordvec_size: dimension of each character vector.
            hidden_size: dimension of the LSTM hidden state.
        """
        # Imported here so the class works on a fresh kernel: no earlier cell
        # of the notebook brings this layer class into scope.
        from common.time_layers import TimeSoftmaxWithLoss

        V, D, H = vocab_size, wordvec_size, hidden_size
        self.encoder = Encoder(V, D, H)
        self.decoder = Decoder(V, D, H)
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads

    def forward(self, xs, ts):
        """Compute the training loss for a batch.

        Args:
            xs: encoder input.
            ts: target sequences; indexed as a 2-D array.

        Returns:
            The loss returned by the softmax-with-loss layer.
        """
        # The decoder reads every target position except the last and is
        # scored against the same sequence shifted left by one position.
        decoder_xs, decoder_ts = ts[:, :-1], ts[:, 1:]

        h = self.encoder.forward(xs)
        score = self.decoder.forward(decoder_xs, h)
        loss = self.softmax.forward(score, decoder_ts)
        return loss

    def backward(self, dout=1):
        """Back-propagate from the loss through the decoder and then the encoder.

        Args:
            dout: upstream gradient of the loss (defaults to 1).

        Returns:
            The value returned by the encoder's backward().
        """
        dout = self.softmax.backward(dout)
        dh = self.decoder.backward(dout)
        dout = self.encoder.backward(dh)
        return dout

    def generate(self, xs, start_id, sample_size):
        """Encode ``xs`` and let the decoder generate ``sample_size`` IDs.

        Args:
            xs: encoder input.
            start_id: ID fed to the decoder at the first step.
            sample_size: number of IDs to generate.

        Returns:
            The list returned by the decoder's generate().
        """
        h = self.encoder.forward(xs)
        sampled = self.decoder.generate(h, start_id, sample_size)
        return sampled

seq2seq的评价

In [23]:
import sys
sys.path.append('..')
import numpy as np
import matplotlib.pyplot as plt
from dataset import sequence
from common.optimizer import Adam
from common.trainer import Trainer
from common.util import eval_seq2seq

# Load the dataset
(x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt')
char_to_id, id_to_char = sequence.get_vocab()

# Hyperparameters
vocab_size = len(char_to_id)
wordvec_size = 16
hidden_size = 128
batch_size = 128
max_epoch = 25
max_grad = 5.0

# Build the model, the optimizer and the trainer
model = Seq2seq(vocab_size, wordvec_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

acc_list = []  # one accuracy value per completed epoch
for epoch in range(max_epoch):
    # Train one epoch per call so the model can be evaluated in between.
    trainer.fit(x_train, t_train, max_epoch=1, batch_size=batch_size, max_grad=max_grad)

    correct_num = 0
    for i in range(len(x_test)):
        # Index with a list so each sample keeps its leading batch axis.
        question, correct = x_test[[i]], t_test[[i]]
        # Verbose flag is on for the first 10 test samples only.
        verbose = i < 10
        # Fixed: the original passed the undefined name `verbos`, which raised
        # NameError. The flag stays positional on purpose.
        correct_num += eval_seq2seq(model, question, correct, id_to_char, verbose)
    acc = float(correct_num) / len(x_test)
    acc_list.append(acc)
    print('val acc %.3f%%' % (acc * 100))

| epoch 1 |  iter 1 / 351 | time 9[s] | loss 2.56
| epoch 1 |  iter 21 / 351 | time 198[s] | loss 2.53


KeyboardInterrupt: 

In [None]:
# Plot the validation accuracy recorded after each epoch.
plt.title("The Change of Acc")
plt.xlabel('epochs')
plt.ylabel('Acc')
# Derive the x axis from the data. The original hard-coded 26 points while
# acc_list holds one entry per epoch (max_epoch = 25), and plt.plot requires
# x and y to have the same length.
epochs = list(range(len(acc_list)))
plt.plot(epochs, acc_list)
plt.show()

seq2seq的改进:Peeky

In [42]:
# coding: utf-8
import sys
sys.path.append('..')
from common.time_layers import *

class PeekyDecoder:
    """Decoder variant that feeds the encoder state ``h`` into every time step.

    ``h`` is concatenated in front of both the LSTM input and the affine
    input, in addition to being used as the initial LSTM state.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the layers and their initial weights.

        Args:
            vocab_size: vocabulary size (V).
            wordvec_size: embedding dimension (D).
            hidden_size: LSTM hidden-state dimension (H).
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        # The LSTM input is [h, embedding], hence H + D input features.
        lstm_Wx = (rn(H + D, 4 * H) / np.sqrt(H + D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        # The affine input is [h, lstm_output], hence H + H input features.
        affine_W = (rn(H + H, V) / np.sqrt(H + H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads
        self.cache = None  # H from the last forward(); read by backward()

    def forward(self, xs, h):
        """Training-time forward pass.

        Args:
            xs: decoder input IDs of shape (N, T).
            h: encoder state of shape (N, H).

        Returns:
            The scores produced by the affine layer.
        """
        N, T = xs.shape
        N, H = h.shape

        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        # Copy h along the time axis so it can be attached to every step.
        hs = np.repeat(h, T, axis=0).reshape(N, T, H)
        out = np.concatenate((hs, out), axis=2)

        out = self.lstm.forward(out)
        out = np.concatenate((hs, out), axis=2)

        score = self.affine.forward(out)
        self.cache = H
        return score

    def backward(self, dscore):
        """Back-propagate and collect every gradient that flows into ``h``.

        Args:
            dscore: gradient with respect to the scores returned by forward().

        Returns:
            Gradient with respect to ``h``: the LSTM's ``dh`` plus the
            gradients of the two concatenated copies, summed over time.
        """
        H = self.cache

        dout = self.affine.backward(dscore)
        # The first H features belong to the peeked h, the rest to the LSTM output.
        dout, dhs0 = dout[:, :, H:], dout[:, :, :H]
        dout = self.lstm.backward(dout)
        # Same split for the LSTM input: [h, embedding].
        dembed, dhs1 = dout[:, :, H:], dout[:, :, :H]
        self.embed.backward(dembed)

        dhs = dhs0 + dhs1
        dh = self.lstm.dh + np.sum(dhs, axis=1)
        return dh

    def generate(self, h, start_id, sample_size):
        """Generation-time decoding: greedily pick the best ID at every step.

        Args:
            h: encoder state; reshaped to (1, 1, H), so a single sequence
                is expected.
            start_id: ID fed to the decoder at the first step.
            sample_size: number of IDs to generate.

        Returns:
            list of ``sample_size`` Python ints (``start_id`` is not included).
        """
        sampled = []
        char_id = start_id
        self.lstm.set_state(h)

        H = h.shape[1]
        peeky_h = h.reshape(1, 1, H)
        for _ in range(sample_size):
            x = np.array([char_id]).reshape((1, 1))
            out = self.embed.forward(x)

            out = np.concatenate((peeky_h, out), axis=2)
            out = self.lstm.forward(out)
            out = np.concatenate((peeky_h, out), axis=2)
            score = self.affine.forward(out)

            # Convert to a plain int so the result matches Decoder.generate,
            # which returns Python ints rather than NumPy scalars.
            char_id = int(np.argmax(score.flatten()))
            sampled.append(char_id)

        return sampled

In [41]:
class PeekySeq2seq(Seq2seq):
    """Seq2seq variant whose decoder is a PeekyDecoder.

    Only construction differs; all other methods are inherited from Seq2seq.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Build the encoder, the peeky decoder and the loss layer.

        Args:
            vocab_size: vocabulary size.
            wordvec_size: embedding dimension.
            hidden_size: LSTM hidden-state dimension.
        """
        dims = (vocab_size, wordvec_size, hidden_size)

        # Encoder first, then decoder: same construction order as before.
        self.encoder = Encoder(*dims)
        self.decoder = PeekyDecoder(*dims)
        self.softmax = TimeSoftmaxWithLoss()

        # Encoder entries come first in both lists.
        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads