In [2]:
import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random

In [3]:
# Run on the GPU when one is available; later cells branch on this flag.
USE_CUDA = torch.cuda.is_available()

# One seed shared by every random number generator, so that a fresh
# "Restart & Run All" reproduces the same results.
SEED = 53113
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if USE_CUDA:
    # Seed every visible GPU, not only the currently selected one.
    torch.cuda.manual_seed_all(SEED)

# Hyperparameters.
BATCH_SIZE = 32         # number of sentences (columns) in one batch
EMBEDDING_SIZE = 650    # word-embedding width (100 was noted as an alternative)
HIDDEN_SIZE = 100       # LSTM hidden-state width
MAX_VOCAB_SIZE = 50000  # cap passed to the vocabulary builder
NUM_EPOCHS = 5          # passes over the training iterator
GRAD_CLIP = 5.0         # maximum gradient norm used when clipping

In [4]:
# Field that lower-cases the text; the vocabulary is built on it in a later cell.
TEXT = torchtext.data.Field(lower=True)

# File name of each text8 split inside the local data directory.
split_files = {
    "train": "text8.train.txt",
    "validation": "text8.dev.txt",
    "test": "text8.test.txt",
}
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path="E:\\Dvlp\\NLPData\\text8",
    text_field=TEXT,
    **split_files
)

In [5]:
# Build the vocabulary from the training split only, capped at MAX_VOCAB_SIZE
# tokens. The reported size below is 50002, i.e. two entries beyond the cap
# (presumably torchtext's special tokens such as <unk> -- confirm).
TEXT.build_vocab(train,max_size=MAX_VOCAB_SIZE)

In [6]:
# Sanity check: vocabulary size and the class of the vocabulary object.
print(len(TEXT.vocab),"|",type(TEXT.vocab))

50002 | <class 'torchtext.vocab.Vocab'>


In [7]:
# Display the class of the vocabulary object.
type(TEXT.vocab)

torchtext.vocab.Vocab

In [8]:
# Device used for the batches and the model: the GPU when CUDA is available,
# otherwise the CPU.
device=torch.device("cuda" if USE_CUDA else "cpu")

In [9]:
# Show which device was selected.
device

device(type='cuda')

In [10]:
# Wrap the three splits in BPTT iterators whose batches are created on
# `device`; every sequence in a batch is 50 tokens long.
iterator_options = dict(
    batch_size=BATCH_SIZE,
    device=device,
    bptt_len=50,
    repeat=False,
    shuffle=True,
)
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), **iterator_options
)

In [11]:
# Keep the iterator handle in `it`: a later cell keeps pulling batches from it.
it=iter(train_iter)
# First batch of the training stream.
batch=next(it)

In [12]:
# Decode column 0 of the input batch back into words. `tolist()` copies the
# ids to the host as plain Python ints, replacing the legacy `.data` accessor.
" ".join(TEXT.vocab.itos[idx] for idx in batch.text[:,0].tolist())

'anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the'

In [13]:
# Decode column 0 of the target batch back into words. The ids are converted
# to plain Python ints first so the vocabulary list is not indexed with 0-d
# device tensors.
" ".join(TEXT.vocab.itos[idx] for idx in batch.target[:,0].tolist())

'originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization'

In [14]:
# Shape of the input ids; the output below shows [50, 32], matching
# bptt_len=50 and BATCH_SIZE=32.
batch.text.shape

torch.Size([50, 32])

In [15]:
# Shape of the target ids; the output below shows the same [50, 32] as the inputs.
batch.target.shape

torch.Size([50, 32])

In [16]:
# Confirm that the inputs are delivered as a torch.Tensor.
type(batch.text)

torch.Tensor

In [17]:
# Confirm that the targets are delivered as a torch.Tensor.
type(batch.target)

torch.Tensor

In [18]:
# The corpus is cut into consecutive windows: each batch continues where the
# previous one stopped, and batch.target is batch.text shifted by one token.
for i in range(5):
    print("第",i,"组:")
    batch=next(it)
    # `tok` (not `i`) keeps the generator from shadowing the loop counter;
    # `tolist()` hands plain Python ints to the vocabulary lookup for both
    # the inputs and the targets.
    print(" ".join(TEXT.vocab.itos[tok] for tok in batch.text[:,0].tolist()))
    print("///")
    print(" ".join(TEXT.vocab.itos[tok] for tok in batch.target[:,0].tolist()))

第 0 组:
organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing
///
of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations
第 1 组:
interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or <unk> but rather a harmonious anti authoritarian society in place of what are regarded
///
of what this means anarchism als

In [19]:
import torch.nn as nn
class RNN(nn.Module):
    """Word-level language model: embedding -> LSTM -> linear projection to vocabulary logits."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        """Create the three layers.

        Args:
            vocab_size: number of rows in the embedding table and number of
                logits produced per position.
            embed_size: width of each word embedding (the LSTM input size).
            hidden_size: width of the LSTM hidden and cell states.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, embed_size)
        # Sequence-first layout: batch_first is left at its default.
        self.lstm = nn.LSTM(embed_size, hidden_size)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, text, hidden):
        """Score every vocabulary entry at every position.

        Args:
            text: token ids, [seq_len, batch_size].
            hidden: (h0, c0) pair, each [1, batch_size, hidden_size].

        Returns:
            (logits, hidden): logits of shape [seq_len, batch_size, vocab_size]
            and the state pair returned by the LSTM.
        """
        embedded = self.embed(text)                     # [seq_len, batch_size, embed_size]
        lstm_out, hidden = self.lstm(embedded, hidden)  # [seq_len, batch_size, hidden_size]
        steps, batch_size, width = lstm_out.shape
        # Merge the time and batch axes for the projection, then split them again.
        logits = self.linear(lstm_out.view(-1, width))  # [seq_len*batch_size, vocab_size]
        return logits.view(steps, batch_size, logits.size(-1)), hidden

    def init_hidden(self, bsz, requires_grad=True):
        """Return an all-zero (h0, c0) pair for a batch of `bsz` sequences.

        Both tensors are created from the model's first parameter via
        `new_zeros`, so they share that parameter's dtype and device.
        """
        reference = next(self.parameters())
        state_shape = (1, bsz, self.hidden_size)
        h0 = reference.new_zeros(state_shape, requires_grad=requires_grad)
        c0 = reference.new_zeros(state_shape, requires_grad=requires_grad)
        return h0, c0

In [20]:
# Instantiate the language model, sized by the vocabulary and the configured
# embedding and hidden widths.
model = RNN(len(TEXT.vocab), EMBEDDING_SIZE, HIDDEN_SIZE)
# Only move the parameters when a GPU is actually available.
if USE_CUDA:
    model = model.to(device)

In [21]:
# Display the layer summary of the model.
model

RNN(
  (embed): Embedding(50002, 650)
  (lstm): LSTM(650, 100)
  (linear): Linear(in_features=100, out_features=50002, bias=True)
)

In [22]:
# Peek at the first registered parameter (the embedding matrix, per the layer
# order in RNN.__init__); init_hidden uses this same tensor as its
# dtype/device reference.
next(model.parameters())

Parameter containing:
tensor([[-0.4862, -0.1841,  1.3934,  ...,  0.2260,  0.2548,  0.7861],
        [-0.1809,  0.2096, -0.6301,  ...,  1.3684, -0.0779, -0.6317],
        [-2.0087,  3.0926, -1.4976,  ...,  0.6606,  0.3425,  1.6121],
        ...,
        [ 0.7753, -1.0300, -1.3122,  ...,  0.6954,  1.1550, -1.9044],
        [-1.2658, -0.9943,  0.9980,  ..., -0.9651,  0.5646,  0.0829],
        [-0.4681, -1.6434, -1.0714,  ...,  0.1356, -0.7706,  1.2884]],
       device='cuda:0', requires_grad=True)

In [23]:
def repackage_hidden(h):
    """Return `h` cut off from the autograd history that produced it.

    A tensor is detached directly; any other iterable is rebuilt as a tuple
    whose items are repackaged recursively.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

In [24]:
# Cross-entropy loss over the vocabulary logits, optimised with Adam.
learning_rate = 1e-3
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [25]:
# Vocabulary size; the training loop uses it to flatten the logits.
VOCAB_SIZE=len(TEXT.vocab)

In [29]:
# Train the language model. Consecutive batches are adjacent slices of the
# corpus, so the LSTM state is carried from one batch to the next.
for epoch in range(NUM_EPOCHS):
    model.train()
    it=iter(train_iter)
    hidden=model.init_hidden(BATCH_SIZE)
    for i,batch in enumerate(it):
        data,target=batch.text,batch.target
        # Detach the carried state so back-propagation stops at the batch
        # boundary; otherwise the graph would keep growing across batches.
        hidden=repackage_hidden(hidden)
        output,hidden=model(data,hidden)
        # output: [seq_len, batch_size, VOCAB_SIZE]; target: [seq_len, batch_size]
        loss=loss_fn(output.view(-1,VOCAB_SIZE),target.view(-1))
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),GRAD_CLIP)
        optimizer.step()
        if i%100==0:
            print("loss",loss.item())
        # Drop the references to this step's logits and loss before the next
        # forward pass. While the names stay bound, the previous logits tensor
        # (seq_len * batch_size * VOCAB_SIZE floats) remains allocated while
        # the next one is being built, which raises peak GPU memory.
        # NOTE(review): the recorded run ended in a CUDA out-of-memory error on
        # an allocation of about that size; confirm this removes it on a 4 GiB GPU.
        del output,loss
        

loss 10.825766563415527
loss 7.169068813323975
loss 6.89164924621582
loss 6.93277645111084
loss 6.8612060546875
loss 6.728265762329102
loss 6.498837471008301
loss 6.479005813598633
loss 6.628740310668945
loss 6.466805458068848
loss 6.257306098937988
loss 6.663792133331299
loss 6.258430004119873
loss 6.445359706878662
loss 6.482875823974609
loss 6.056692123413086
loss 6.2551045417785645
loss 6.1858978271484375
loss 6.357419013977051
loss 6.122409820556641
loss 6.310276985168457
loss 6.306948661804199
loss 6.042327404022217
loss 5.848077774047852
loss 6.184263229370117
loss 6.026011943817139
loss 6.008089542388916
loss 6.361509799957275
loss 6.161179065704346
loss 6.125494003295898
loss 6.147653102874756
loss 6.271442890167236
loss 6.118351459503174
loss 6.0424981117248535
loss 5.8611626625061035
loss 5.901005268096924
loss 6.0171332359313965
loss 6.3321452140808105
loss 6.029512882232666
loss 5.9797844886779785
loss 5.8587965965271
loss 6.010201930999756
loss 5.68246603012085
loss 6.176

RuntimeError: CUDA out of memory. Tried to allocate 305.25 MiB (GPU 0; 4.00 GiB total capacity; 2.30 GiB already allocated; 294.51 MiB free; 450.39 MiB cached)

In [None]:
# Index of the currently selected CUDA device.
torch.cuda.current_device()

In [None]:
# Number of CUDA devices available to PyTorch.
torch.cuda.device_count()