In [21]:
import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random

# Whether a CUDA device is available in this environment.
USE_CUDA = torch.cuda.is_available()

# A single seed shared by every RNG the notebook touches (Python, NumPy,
# PyTorch CPU and, when present, PyTorch CUDA), kept in one place so it can
# be changed without hunting for repeated literals.
SEED = 9567
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if USE_CUDA:
    torch.cuda.manual_seed(SEED)

# Hyper-parameters and data limits used by the cells below.
BATCH_SIZE = 32          # batch_size handed to the BPTT iterators
EMBEDDING_SIZE = 128     # embed_size handed to RNNModel
HIDDEN_SIZE = 100        # hidden_size handed to RNNModel
MAX_VOCAB_SIZE = 10000   # max_size handed to TEXT.build_vocab



In [5]:
# Text field for the corpus; lower=True asks torchtext to lower-case the text.
TEXT = torchtext.data.Field(lower=True)

# Load the three text8 splits as language-modelling datasets.
# Forward slashes keep the relative path valid on Windows as well as on
# POSIX systems (a backslash-separated path only resolves on Windows).
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path='../../',
    train='text8.train.txt',
    validation='text8.dev.txt',
    test='text8.test.txt',
    text_field=TEXT,
)

# Build the vocabulary from the training split only, capped at MAX_VOCAB_SIZE.
# The recorded run prints 10002, i.e. two entries more than the cap
# (presumably torchtext's special tokens such as <unk> -- verify if it matters).
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
VOCAB_SIZE = len(TEXT.vocab)
print('vocabulary size:{}'.format(VOCAB_SIZE))

device = torch.device('cuda' if USE_CUDA else 'cpu')

# BPTTIterator.splits builds the train / validation / test iterators in one call.
# bptt_len is how many time steps are back-propagated through, i.e. the
# length of each text chunk in a batch.
# The iterator API has not been very stable; check the official torchtext
# documentation for the installed version before changing these arguments.
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test),
    batch_size=BATCH_SIZE,
    device=device,
    bptt_len=16,
    repeat=False,
    shuffle=True,
)

vocabulary size:10002


In [6]:
# Create an iterator over the training batches; later cells keep pulling
# further batches from this same `it`.
it = iter(train_iter)
# Fetch the first batch so it can be inspected below.
batch = next(it)

In [17]:
# Inspect one batch: print column 0 of `text`, then column 0 of `target`.
# The target is the text shifted by one token, i.e. the next word to predict.
for column in (batch.text[:, 0], batch.target[:, 0]):
    print(" ".join(TEXT.vocab.itos[token_id] for token_id in column.data.cpu()))

anarchism originated as a term of abuse first used against early working class <unk> including the
originated as a term of abuse first used against early working class <unk> including the <unk>


In [18]:
# Look at a few more batches.
# Each batch continues where the previous one stopped, so the words are
# predicted as one continuous stream of text.
for i in range(5):
    batch = next(it)
    print(i)
    for column in (batch.text[:, 0], batch.target[:, 0]):
        print(" ".join(TEXT.vocab.itos[token_id] for token_id in column.data.cpu()))

0
<unk> of the english revolution and the <unk> <unk> of the french revolution whilst the term
of the english revolution and the <unk> <unk> of the french revolution whilst the term is
1
is still used in a <unk> way to describe any act that used violent means to
still used in a <unk> way to describe any act that used violent means to destroy
2
destroy the organization of society it has also been taken up as a positive label by
the organization of society it has also been taken up as a positive label by self
3
self defined anarchists the word anarchism is derived from the greek without <unk> ruler chief king
defined anarchists the word anarchism is derived from the greek without <unk> ruler chief king anarchism
4
anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished
as a political philosophy is the belief that rulers are unnecessary and should be abolished although


#### 定义模型 

In [45]:
import torch.nn as nn
class RNNModel(nn.Module):
    """LSTM language model: embedding -> LSTM -> linear projection to vocabulary logits."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        """Create the three layers of the model.

        Args:
            vocab_size: number of token ids; sizes the embedding table and
                the output logits.
            embed_size: dimensionality of each token embedding.
            hidden_size: size of the LSTM hidden and cell states.
        """
        super(RNNModel, self).__init__()
        # Embedding layer: every incoming token id becomes an embed_size vector.
        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size)
        # Projects each LSTM output to vocab_size scores so the predicted
        # word at every position can be read off (e.g. with argmax).
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.hidden_size = hidden_size

    def forward(self, text, hidden):
        """Run one chunk of text through the model.

        Args:
            text: token ids laid out as (seq_len, batch_size), the layout
                nn.LSTM expects by default (sequence first, batch second).
            hidden: (h, c) tuple of LSTM states, each (1, batch_size, hidden_size).

        Returns:
            (out_vocab, hidden): logits of shape (seq_len, batch_size, vocab_size)
            and the updated (h, c) state tuple.
        """
        emb = self.encoder(text)  # (seq_len, batch_size, embed_size)
        # Feed the embeddings straight into the LSTM.
        # output: (seq_len, batch_size, hidden_size)
        # hidden: two tensors of shape (1, batch_size, hidden_size)
        output, hidden = self.lstm(emb, hidden)
        # Merge the first two dimensions for the projection:
        # (seq_len * batch_size, hidden_size) -> (seq_len * batch_size, vocab_size)
        out_vocab = self.linear(output.view(-1, output.shape[2]))
        # Restore (seq_len, batch_size, vocab_size) so each position keeps
        # its own prediction.
        out_vocab = out_vocab.view(output.size(0), output.size(1), out_vocab.size(-1))
        return out_vocab, hidden

    def init_hidden(self, bsz, requires_grad=True):
        """Return zero-filled initial (h, c) states for a batch of size `bsz`.

        The states are created from an existing parameter so they share the
        model's dtype and device.

        Args:
            bsz: batch size.
            requires_grad: whether the returned tensors should track gradients.
                Pass False for evaluation.

        Returns:
            Tuple of two tensors, each of shape (1, bsz, hidden_size).
        """
        weight = next(self.parameters())
        # An LSTM needs two states (hidden and cell), hence the pair.
        # `requires_grad` is forwarded instead of being hard-coded to True.
        return (weight.new_zeros((1, bsz, self.hidden_size), requires_grad=requires_grad),
                weight.new_zeros((1, bsz, self.hidden_size), requires_grad=requires_grad))

In [46]:
# Build the language model and move it to `device`, the same device the
# BPTT iterators were created with. Without the move, a CUDA machine gets
# GPU batches fed into CPU parameters and the first forward pass fails.
# NOTE: if this cell is re-run, re-create the optimizer afterwards; an
# optimizer built earlier keeps updating the previous model's parameters.
model = RNNModel(
    vocab_size=len(TEXT.vocab),
    embed_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
)
model = model.to(device)

In [34]:
# Display the first parameter tensor the model yields. RNNModel registers the
# embedding layer first, so this is presumably the embedding weight matrix
# (verify if it matters).
next(model.parameters())

Parameter containing:
tensor([[-1.3284, -2.5221, -0.7626,  ...,  1.2552,  0.3371,  0.8519],
        [ 1.0139, -1.3351,  1.0443,  ...,  0.8238,  0.8505, -0.2647],
        [-0.8093,  2.1890,  0.2798,  ...,  0.3279, -0.3946,  1.0357],
        ...,
        [ 0.1942,  0.6266, -1.0434,  ..., -1.2402,  1.2847, -0.6255],
        [ 0.0097,  1.6791, -0.4318,  ..., -1.1566,  0.9146,  0.3966],
        [ 1.0256, -1.3016, -0.7302,  ...,  0.0186, -1.7787, -0.1317]],
       requires_grad=True)

#### 训练模型 

In [29]:
def repackage_hidden(h):
    """Detach a hidden state from the graph that produced it.

    A tensor is returned detached: it keeps its values but carries no
    autograd history. Any other input is treated as an iterable of hidden
    states and rebuilt as a tuple of detached states, recursively.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()


In [28]:
learning_rate = 0.001
# Cross-entropy between the predicted vocabulary logits and the target token ids.
loss_fn = nn.CrossEntropyLoss()
# Hand every model parameter to Adam for optimisation.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
def evaluate(model, data):
    """Return the average per-token loss of `model` over `data`.

    The model is switched to evaluation mode while the batches are scored
    and switched back to training mode before returning.

    Args:
        model: language model exposing `init_hidden(bsz, requires_grad)` and
            returning `(logits, hidden)` when called as `model(text, hidden)`.
        data: iterable of batches that carry `.text` and `.target` tensors,
            e.g. `val_iter` or `test_iter`.

    Returns:
        float: the sum of per-batch losses weighted by their token counts,
        divided by the total number of tokens; NaN when `data` yields no
        batches.
    """
    model.eval()
    total_loss = 0.
    total_count = 0
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        for batch in data:
            text, target = batch.text, batch.target
            # The hidden state is carried from batch to batch (only language
            # models need this), but it is detached first so no history from
            # earlier batches is kept alive.
            hidden = repackage_hidden(hidden)
            output, hidden = model(text, hidden)

            # Flatten to (tokens, vocab) logits against (tokens,) targets.
            loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
            # loss_fn averages over the tokens of the batch, so weight it by
            # the token count to accumulate a proper corpus-level sum.
            n_tokens = text.size(0) * text.size(1)
            total_loss += loss.item() * n_tokens
            total_count += n_tokens
    model.train()
    if total_count == 0:
        return float('nan')
    return total_loss / total_count

In [47]:
NUM_EPOCHS = 2
VOCAB_SIZE = len(TEXT.vocab)
GRAD_CLIP = 5.
# TODO: validation losses are not tracked yet.
for epoch in range(NUM_EPOCHS):
    # A torch module is either in training mode or in evaluation mode and
    # several things behave differently between the two; make sure training
    # mode is active at the start of every epoch.
    model.train()
    it = iter(train_iter)
    hidden = model.init_hidden(BATCH_SIZE)
    for i, batch in enumerate(it):
        optimizer.zero_grad()

        data, target = batch.text, batch.target
        # The hidden state is handed on from one batch to the next, which
        # only language models need. Back-propagating through every earlier
        # iteration would exhaust memory quickly, so the state is detached
        # from its history first.
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)

        # Flatten to (tokens, vocab) logits against (tokens,) targets.
        flat_logits = output.view(-1, VOCAB_SIZE)
        flat_targets = target.view(-1)
        loss = loss_fn(flat_logits, flat_targets)

        loss.backward()
        # Clip the gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()

        if not i % 100:
            print('loss', loss.item())
            

loss 9.199448585510254
loss 9.20296573638916
loss 9.202123641967773


KeyboardInterrupt: 

In [None]:
# How to load a saved model:
# first build a model with the same architecture, then restore its weights.
# NOTE(review): no cell shown here writes 'lm.pth'; the checkpoint must already
# exist (e.g. saved with torch.save(model.state_dict(), 'lm.pth')).
best_model = RNNModel(
    vocab_size=len(TEXT.vocab),
    embed_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
).to(device)
# map_location lets a checkpoint written on a GPU be restored on a CPU-only
# machine (and vice versa). Only load checkpoints you trust: torch.load
# unpickles the file.
best_model.load_state_dict(torch.load('lm.pth', map_location=device))