In [1]:
from torchtext.legacy import data
import jieba

def tokenizer(text):
    """Segment ``text`` with jieba and return the resulting tokens as a list."""
    return list(jieba.cut(text))

# A Field describes how one text column is processed: jieba tokenisation,
# lower-casing, <sos>/<eos> wrapping and batch-first tensors.
TEXT = data.Field(
    batch_first=True,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
    tokenize=tokenizer,
)

# Tell the Field which columns to process. Both columns use the same TEXT
# field, so source and target share a single vocabulary.
# Recorded sizes: len(train) = 49000, len(val) = 1000 (TabularDataset objects).
train, val = data.TabularDataset.splits(
    fields=[('trg', TEXT), ('src', TEXT)],
    skip_header=True,
    format='tsv',
    validation='dev.tsv',
    train='train.tsv',
    path='./data/',
)

# train_iter shuffles automatically; val_iter is ordered by sort_key.
# With batch size 64 the recorded len(train_iter) is 766
# (val_iter should be ceil(1000 / 64) = 16 -- TODO confirm).
train_iter, val_iter = data.BucketIterator.splits(
    (train, val),
    sort_key=lambda example: len(example.src),
    batch_sizes=(64, 64),
)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.728 seconds.
Prefix dict has been built successfully.


In [2]:
# Build the shared vocabulary from the training split only (min_freq=2).
TEXT.build_vocab(train, min_freq=2)
# Recorded vocabulary size: 249536 entries.
# NOTE(review): a vocabulary this large makes the embedding and output layers
# huge; it is a likely contributor to the out-of-memory failure recorded when
# training -- consider passing max_size to build_vocab.
vocab = TEXT.vocab
# Attributes carried by the Vocab object:
# dict_keys(['freqs', 'itos', 'unk_index', 'stoi', 'vectors'])
vars(vocab).keys()

dict_keys(['freqs', 'itos', 'unk_index', 'stoi', 'vectors'])

In [3]:
# id2vocab (itos): tokens only, position = id       -- 249536 entries recorded
# vocab2id (stoi): (token, id) pairs                -- 249536 entries recorded
id2vocab, vocab2id = TEXT.vocab.itos, TEXT.vocab.stoi

In [4]:
# Ids of the four special tokens in the shared vocabulary.
# Recorded values: unk = 0, pad = 1, init (<sos>) = 2, eos (<eos>) = 3.
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = (
    vocab2id[special]
    for special in (TEXT.unk_token, TEXT.pad_token, TEXT.init_token, TEXT.eos_token)
)

In [5]:
# Length of the training iterator (766 recorded; consistent with 49000 examples at batch size 64).
len(train_iter)

766

In [6]:
# Inspect the first 10 batches of ONE pass over the training iterator.
# The previous form called next(iter(train_iter)) inside the loop, which
# starts a brand-new (reshuffled) pass on every step and only ever looks at
# the first batch of each pass. zip() stops as soon as range(10) is
# exhausted, so exactly 10 batches are drawn.
for i, train_batch in zip(range(10), train_iter):
    print('train_batch' + str(i), train_batch.trg.shape)

train_batch0 torch.Size([64, 37])
train_batch1 torch.Size([64, 40])
train_batch2 torch.Size([64, 41])
train_batch3 torch.Size([64, 38])
train_batch4 torch.Size([64, 49])
train_batch5 torch.Size([64, 38])
train_batch6 torch.Size([64, 37])
train_batch7 torch.Size([64, 37])
train_batch8 torch.Size([64, 37])
train_batch9 torch.Size([64, 38])


In [20]:
# Count how many steps a loop over range(len(train_iter)) performs.
j = sum(1 for _ in range(len(train_iter)))
print(j)

766


In [26]:
# Draw a single batch from a fresh pass over the iterator and display the
# first 30 positions of every target sequence (ids 2 / 3 / 1 are the
# <sos> / <eos> / padding indices looked up earlier).
train_batch = next(iter(train_iter))
train_batch.trg[:,:30]

tensor([[    2,   278,    21,  ...,  2088,     6,     3],
        [    2,  8863,    88,  ...,     1,     1,     1],
        [    2,  2714,   101,  ...,    95,   589,    59],
        ...,
        [    2, 22988, 12035,  ...,     1,     1,     1],
        [    2,  3011,    14,  ...,   472,  1435,   742],
        [    2,  1483,    88,  ...,     1,     1,     1]])

In [23]:
# Source-side token ids of the batch currently bound to train_batch.
train_batch.src

tensor([[    2,   135,   904,  ...,     1,     1,     1],
        [    2,  1747,  7424,  ...,     1,     1,     1],
        [    2,   142,    18,  ...,     1,     1,     1],
        ...,
        [    2,   751,   162,  ...,     1,     1,     1],
        [    2, 60161,   303,  ...,  2214, 22848,     3],
        [    2,   385, 19670,  ...,     1,     1,     1]])

## **定义模型**

In [7]:
import random
import torch.nn as nn
import torch

In [8]:
class Encoder(nn.Module):
    """GRU encoder: embeds source token ids and feeds them through a stacked GRU."""

    def __init__(self, vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim, self.n_layers = hid_dim, n_layers
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=emb_dim)
        # batch_first=True keeps the batch dimension in front.
        self.rnn = nn.GRU(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: token ids, shape [batch size, src len].

        Returns:
            ``(outputs, state)`` exactly as produced by the GRU:
            outputs has shape [batch size, src len, hid dim] (top layer only),
            state has shape [n layers, batch size, hid dim].
        """
        # [batch size, src len] -> [batch size, src len, emb dim]
        token_vectors = self.embedding(src)
        rnn_input = self.dropout(token_vectors)
        # The GRU already returns the (outputs, state) pair callers expect.
        return self.rnn(rnn_input)

In [9]:
# Smoke test on a toy configuration: batch_size = 4, seq_len = 7.
encoder = Encoder(
    vocab_size=10,
    emb_dim=8,
    hid_dim=16,
    n_layers=2,
    dropout=0.5,
)
encoder.eval()  # evaluation mode, so dropout is inactive
X = torch.zeros(4, 7, dtype=torch.long)
output, state = encoder(X)
output.shape

torch.Size([4, 7, 16])

In [10]:
# Final hidden state of the toy encoder: (n_layers, batch_size, hid_dim) = (2, 4, 16).
state.shape

torch.Size([2, 4, 16])

In [11]:
class Decoder(nn.Module):
    """GRU decoder for sequence-to-sequence learning."""

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embed_size)
        self.rnn = nn.GRU(
            input_size=embed_size,
            hidden_size=num_hiddens,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        # Projects every GRU output onto vocabulary scores.
        self.dense = nn.Linear(in_features=num_hiddens, out_features=vocab_size)

    def init_state(self, enc_outputs):
        """Pick the decoder's initial state: element 1 of the encoder result."""
        initial_state = enc_outputs[1]
        return initial_state

    def forward(self, X, state):
        """Run the decoder over ``X`` starting from ``state``.

        Args:
            X: token ids, shape (`batch_size`, `num_steps`).
            state: GRU hidden state, shape
                (`num_layers`, `batch_size`, `num_hiddens`).

        Returns:
            ``(output, state)`` where output has shape
            (`batch_size`, `num_steps`, `vocab_size`) and state keeps the
            shape of the incoming state.
        """
        # (`batch_size`, `num_steps`) -> (`batch_size`, `num_steps`, `embed_size`)
        embedded = self.embedding(X)
        hidden_seq, state = self.rnn(embedded, state)
        scores = self.dense(hidden_seq)
        return scores, state

In [12]:
# Smoke test: feed the toy encoder's final state into a matching toy decoder.
decoder = Decoder(
    vocab_size=10,
    embed_size=8,
    num_hiddens=16,
    num_layers=2,
)
decoder.eval()
state = decoder.init_state(encoder(X))
output, state = decoder(X, state)
(output.shape, state.shape)

(torch.Size([4, 7, 10]), torch.Size([2, 4, 16]))

In [13]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper.

    The encoder result is turned into the decoder's initial state through
    ``decoder.init_state``; the decoder then consumes the whole ``trg``
    sequence in one call.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        """Return the decoder output for ``trg`` conditioned on ``src``.

        Args:
            src: batch passed to the encoder.
            trg: batch passed to the decoder as its input sequence.

        Returns:
            The first element of the decoder's result (its per-step output);
            the decoder's final state is discarded.
        """
        # Use the encoder owned by this module. The previous code called the
        # module-level name `encoder`, so the model silently depended on
        # whatever object that global happened to be bound to at call time.
        enc_outputs = self.encoder(src)
        state = self.decoder.init_state(enc_outputs)
        output, _ = self.decoder(trg, state)
        return output


In [14]:
def xavier_init_weights(m):
    """Xavier-uniform initialise the weight matrices of Linear and GRU modules.

    Meant to be used as ``module.apply(xavier_init_weights)``. Bias vectors
    and every other module type are left untouched.

    Args:
        m: the module visited by ``apply``.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
    elif isinstance(m, nn.GRU):
        # Walk the GRU's own parameters through the public API instead of the
        # private `_flat_weights_names` / `_parameters` attributes.
        for name, param in m.named_parameters(recurse=False):
            if "weight" in name:
                nn.init.xavier_uniform_(param)

In [15]:
# Cross-entropy over vocabulary entries; targets equal to the padding index
# do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
# Lists reserved for training / evaluation loss history.
loss_vals, loss_vals_eval = [], []


def train_seq2seq(net, train_iter, lr, num_epochs):
    """Train a sequence-to-sequence model with teacher forcing.

    Args:
        net: model called as ``net(src, decoder_input)`` and returning scores
            of shape (batch_size, num_steps, vocab_size).
        train_iter: iterable of batches exposing ``.src`` and ``.trg``
            (batch-first token-id tensors); it is iterated once per epoch.
        lr: learning rate for Adam.
        num_epochs: number of passes over ``train_iter``.

    Side effects:
        Re-initialises ``net`` with ``xavier_init_weights``, prints the loss
        of every batch, and appends each epoch's mean batch loss to the
        module-level ``loss_vals`` list.
    """
    net.apply(xavier_init_weights)
    # net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    net.train()

    for epoch in range(num_epochs):
        epoch_loss = []
        # Iterate the iterator itself so each epoch is one real pass over the
        # data. Calling next(iter(train_iter)) per step restarts (and
        # reshuffles) the pass every time and only ever uses its first batch.
        for i, batch in enumerate(train_iter):
            # Gradients accumulate in .grad, so they must be cleared before
            # every step, not once per epoch.
            optimizer.zero_grad()
            # Teacher forcing: the decoder reads tokens [0, T-1) and is
            # scored against tokens [1, T). Without this shift the model is
            # trained to echo the token it was just given.
            decoder_input = batch.trg[:, :-1]
            target = batch.trg[:, 1:]
            output = net(batch.src, decoder_input)
            # CrossEntropyLoss expects the class dimension second:
            # (batch_size, vocab_size, num_steps) vs (batch_size, num_steps).
            loss = criterion(output.permute(0, 2, 1), target)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), 1)
            optimizer.step()
            batch_loss = loss.item()
            epoch_loss.append(batch_loss)
            print('epoch' + str(epoch) + '样本' + str(i) + '==========>', batch_loss)
        if epoch_loss:
            loss_vals.append(sum(epoch_loss) / len(epoch_loss))

In [16]:
# Hyper-parameters.
# Source and target share one vocabulary, hence identical dimensions.
input_dim = output_dim = len(id2vocab)
emd_size, num_hids = 256, 512   # embedding width, GRU hidden width
n_layers = 2
dropout = 0.5
num_epoches = 10
# NOTE(review): train_seq2seq hard-codes its clipping norm to 1 and does not read `clip`.
clip = 1
lr = 0.005

In [17]:
# Full-size encoder / decoder over the shared vocabulary.
encoder = Encoder(
    vocab_size=len(id2vocab),
    emb_dim=emd_size,
    hid_dim=num_hids,
    n_layers=n_layers,
    dropout=dropout,
)
decoder = Decoder(
    vocab_size=len(id2vocab),
    embed_size=emd_size,
    num_hiddens=num_hids,
    num_layers=n_layers,
    dropout=dropout,
)

In [18]:
# Initialise: wrap the encoder and decoder in a single Seq2Seq model.
net = Seq2Seq(encoder, decoder)

In [19]:
# Train. (The recorded run of this cell ended in a CPU out-of-memory RuntimeError.)
train_seq2seq(net, train_iter, lr, num_epoches)



RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:76] data. DefaultCPUAllocator: not enough memory: you tried to allocate 131072 bytes.

In [None]:
# Save the model's state_dict to 'Seq2Seq.pt' in the working directory.
torch.save(net.state_dict(), 'Seq2Seq.pt')   