# **载入包**

In [1]:
import math
import time
from collections import Counter
import random
import jieba
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence,pack_padded_sequence,pad_packed_sequence

# **设置随机种子**

In [2]:
# Seed both Python's `random` module and PyTorch's global generator with the
# same fixed value so that repeated runs draw the same random numbers.
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1f4d96b1e10>

# **设定超参数**

In [3]:
BATCH_SIZE = 4  # batch size handed to every DataLoader
MAX_VOCAB_SIZE = 30000  # default cap on the number of corpus words kept by build_voc
PAD_IDX = 0  # id assigned to "<pad>"; also the ignore_index of the loss
UNK_IDX = 1  # id assigned to "<unk>"
EMB_DIM = 256  # embedding size passed to Encoder and Decoder
HIDDEN_SIZE = 512  # hidden size passed to Encoder, Attention and Decoder
DROPOUT = 0.5  # dropout probability passed to Encoder and Decoder
EPOCHS = 2  # number of iterations of the training loop

# **加载数据**

In [5]:
def load_data(in_file):
    """Read a tab-separated parallel corpus and tokenize both sides.

    Every non-empty line must hold at least two tab-separated fields: the
    source sentence followed by the target sentence.  Blank lines are skipped
    and any further fields on a line (for example an attribution column) are
    ignored instead of aborting the whole load.

    Args:
        in_file: Path of the UTF-8 encoded corpus file.

    Returns:
        A ``(contexts, targets)`` pair of equally long lists.  Each entry is a
        list of string tokens wrapped in ``"<bos>"`` / ``"<eos>"``.  Source
        sentences are split on whitespace; target sentences are segmented
        with ``jieba.cut``.

    Raises:
        ValueError: If a non-empty line has fewer than two fields.
    """
    contexts = []
    targets = []
    with open(in_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # A blank (or whitespace-only) line carries no sentence pair.
                continue
            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    f"{in_file}: line {line_no} has no tab-separated target sentence"
                )
            # Only the first two columns are the sentence pair.
            context, target = fields[0], fields[1]
            context_list = context.split()
            target_list = list(jieba.cut(target, cut_all=False, HMM=True))
            contexts.append(["<bos>"] + context_list + ["<eos>"])
            targets.append(["<bos>"] + target_list + ["<eos>"])
    return contexts, targets

In [6]:
# Absolute local path of the corpus file; adjust it to wherever cmn.txt lives.
file_path = "C:/Users/Administrator.DESKTOP-10M2D22/Desktop/nlp-beginner/cmn.txt"
# Tokenized source sentences (contexts) and target sentences (targets).
contexts,targets= load_data(file_path)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1.DES\AppData\Local\Temp\jieba.cache
Loading model cost 0.768 seconds.
Prefix dict has been built succesfully.


# **构建单词表**

In [8]:
def build_voc(sentences, max_words=MAX_VOCAB_SIZE):
    """Build word <-> id lookup tables from tokenized sentences.

    The ``max_words`` most frequent tokens are kept (all of them when the
    corpus has fewer distinct tokens).  Ids 0 and 1 are reserved for the
    special tokens, so corpus words receive ids starting at 2 in order of
    decreasing frequency.

    Args:
        sentences: Iterable of token sequences.
        max_words: Maximum number of corpus tokens to keep.

    Returns:
        A tuple ``(id2wd, wd2id, total_wds)`` where ``id2wd`` is a list with
        ``id2wd[wd2id[w]] == w`` for every entry, ``wd2id`` maps a token to
        its id, and ``total_wds`` is the number of kept corpus words plus the
        two special tokens ``"<unk>"`` and ``"<pad>"``.
    """
    word_count = Counter()
    for sentence in sentences:
        word_count.update(sentence)

    # most_common(n) returns the n most frequent items in descending order,
    # or every item when n exceeds the number of distinct tokens.
    words = [w for w, _ in word_count.most_common(max_words)]
    total_wds = len(words) + 2  # plus <unk> and <pad>

    wd2id = {w: i + 2 for i, w in enumerate(words)}
    wd2id["<unk>"] = UNK_IDX
    wd2id["<pad>"] = PAD_IDX

    # The special tokens must occupy the leading slots so that list position
    # and id agree; appending them at the end would shift every word by two.
    # (The +2 offset above already assumes PAD_IDX and UNK_IDX are 0 and 1.)
    specials = sorted((("<pad>", PAD_IDX), ("<unk>", UNK_IDX)), key=lambda kv: kv[1])
    id2wd = [token for token, _ in specials] + words
    return id2wd, wd2id, total_wds

In [9]:
# One vocabulary per side: built from the source sentences (contexts) and
# from the target sentences (targets) respectively.
id2en,en2id,total_en = build_voc(contexts)
id2cn,cn2id,total_cn = build_voc(targets)

# **Dataloader**

In [10]:
class MyDataset(Dataset):
    """Parallel-sentence dataset that stores every sentence as a LongTensor of ids.

    Args:
        contexts: List of tokenized source sentences.
        targets: List of tokenized target sentences (same length as ``contexts``).
        en2id: Token -> id mapping used for ``contexts``.
        cn2id: Token -> id mapping used for ``targets``.
        sort_by_len: When true, sentence pairs are reordered by increasing
            source length and the permutation is kept in ``self.sorted_idx``.
    """

    def __init__(self, contexts, targets, en2id, cn2id, sort_by_len=True):
        super().__init__()
        self.en2id = en2id
        self.cn2id = cn2id
        self.sort_by_len = sort_by_len

        self.contexts = [self._encode(sent, en2id) for sent in contexts]
        self.targets = [self._encode(sent, cn2id) for sent in targets]

        if self.sort_by_len:
            self.sorted_idx = sorted(range(len(self.contexts)), key=lambda x: len(self.contexts[x]))
            self.contexts = [self.contexts[i] for i in self.sorted_idx]
            self.targets = [self.targets[i] for i in self.sorted_idx]

    @staticmethod
    def _encode(sentence, vocab):
        """Map tokens to ids, sending out-of-vocabulary tokens to ``<unk>``."""
        # Unknown words must not be encoded as id 0: that id is the padding
        # token, so they would be embedded as padding and skipped by a loss
        # that ignores the padding index.  Vocabularies without an "<unk>"
        # entry keep the former fallback of 0.
        unk_id = vocab.get("<unk>", 0)
        return torch.LongTensor([vocab.get(w, unk_id) for w in sentence])

    def __len__(self):
        return len(self.contexts)

    def __getitem__(self, idx):
        """Return one sentence pair together with the two sequence lengths."""
        context = self.contexts[idx]
        target = self.targets[idx]
        return {
            'context': context,
            'target': target,
            'context_len': torch.numel(context),
            'target_len': torch.numel(target),
        }

In [11]:
def my_collate_fn(batch):
    """Merge a list of dataset samples into one batch dictionary.

    ``batch`` is a list of per-sample dicts that all share the keys of the
    first sample.  The values stored under ``"context"`` and ``"target"`` are
    zero-padded into one batch-first tensor each; the values of every other
    key are returned as plain lists, one entry per sample.
    """
    padded_fields = ("context", "target")
    collated = {}
    for field in batch[0]:
        values = [sample[field] for sample in batch]
        if field in padded_fields:
            collated[field] = pad_sequence(values, batch_first=True)
        else:
            collated[field] = values
    return collated

In [12]:
# Encode the whole corpus once, then split it 60% / 20% / 20% into train,
# validation and test subsets (the test subset takes the rounding remainder).
full_dataset = MyDataset(contexts, targets, en2id, cn2id)
train_size = int(0.6 * len(full_dataset))
valid_size = int(0.2 * len(full_dataset))
test_size = len(full_dataset) - train_size - valid_size
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    full_dataset, [train_size, valid_size, test_size]
)

# Only the training batches are reshuffled every epoch.  The evaluation
# loaders keep a fixed order: the reported loss is an average of per-batch
# means, so a stable batch composition keeps it comparable between epochs.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=my_collate_fn)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=my_collate_fn)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=my_collate_fn)

# **Encoder**

In [13]:
class Encoder(nn.Module):
    """Single-layer GRU encoder over embedded, padded token-id batches.

    Args:
        num_emb: Vocabulary size of the embedding table.
        emb_dim: Embedding dimension.
        hidden_size: GRU hidden size.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, num_emb, emb_dim, hidden_size, dropout):
        super().__init__()
        self.num_emb = num_emb
        self.emb_dim = emb_dim
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(num_emb, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, x_lengths):
        """Encode a padded batch.

        Args:
            x: Token ids, ``[batch_size, batch_len]``.
            x_lengths: True length of every sequence in ``x``.

        Returns:
            ``(outputs, hidden)`` where ``outputs`` holds the GRU output of
            every time step, ``[batch_size, batch_len, hidden_size]``, and
            ``hidden`` is the final state of the last GRU layer,
            ``[batch_size, hidden_size]``.
        """
        # token_vectors = [batch_size, batch_len, emb_dim]
        token_vectors = self.dropout(self.embedding(x))

        # Packing lets the GRU stop at each sequence's true length;
        # enforce_sorted=False accepts batches in any length order.
        packed_input = pack_padded_sequence(
            token_vectors, x_lengths, batch_first=True, enforce_sorted=False
        )
        packed_states, final_state = self.rnn(packed_input)

        # Unpack straight into batch-first layout:
        # outputs = [batch_size, batch_len, hidden_size]
        outputs, _ = pad_packed_sequence(packed_states, batch_first=True)

        # final_state = [num_layers * num_directions, batch_size, hidden_size];
        # its last entry is the final time step of the topmost layer.
        return outputs, final_state[-1]

# **Attention**

In [14]:
class Attention(nn.Module):
    """Additive attention that scores every encoder step against a decoder state.

    Args:
        hidden_size: Size of both the decoder state and the encoder outputs.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(hidden_size + hidden_size, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the source positions.

        Args:
            hidden: Decoder state, either ``[batch_size, hidden_size]`` or
                ``[batch_size, 1, hidden_size]``.
            encoder_outputs: Encoder outputs of every time step,
                ``[batch_size, batch_len, hidden_size]``.

        Returns:
            Tensor ``[batch_size, batch_len]`` whose rows sum to 1.
        """
        if hidden.dim() == 2:
            # Accept the plain [batch_size, hidden_size] state as well.
            hidden = hidden.unsqueeze(1)
        batch_len = encoder_outputs.shape[1]
        # Broadcast the state along the time axis without copying it.
        # hidden = [batch_size, batch_len, hidden_size]
        hidden = hidden.expand(-1, batch_len, -1)

        # energy = [batch_size, batch_len, hidden_size]
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))

        # Dot every energy vector with v: [batch_size, batch_len]
        attention = torch.matmul(energy, self.v)
        return F.softmax(attention, dim=1)

# **Decoder**

In [15]:
class Decoder(nn.Module):
    """One-step GRU decoder that owns the attention module it is given.

    Args:
        num_emb: Target vocabulary size (also the size of the prediction).
        emb_dim: Embedding dimension.
        hidden_size: GRU hidden size.
        dropout: Dropout probability applied to the embeddings.
        attention: Callable returning ``[batch_size, batch_len]`` weights for
            a decoder state and the encoder outputs.
    """

    def __init__(self, num_emb, emb_dim, hidden_size, dropout, attention):
        super().__init__()
        self.num_emb = num_emb
        self.emb_dim = emb_dim
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(dropout)
        self.attention = attention

        self.embedding = nn.Embedding(num_emb, emb_dim)
        # batch_first only affects the input/output layout, not the state.
        self.rnn = nn.GRU(emb_dim + hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(emb_dim + hidden_size + hidden_size, num_emb)

    def forward(self, y_0, h_0, encoder_outputs):
        """Decode a single time step.

        Args:
            y_0: Input token ids, ``[batch_size]``.
            h_0: Previous decoder state, ``[batch_size, hidden_size]``.
            encoder_outputs: ``[batch_size, batch_len, hidden_size]``.

        Returns:
            ``(prediction, hidden)`` with ``prediction`` of shape
            ``[batch_size, num_emb]`` and the new state ``hidden`` of shape
            ``[batch_size, hidden_size]``.
        """
        # The attention module receives the state as [batch_size, 1, hidden_size].
        attn_weights = self.attention(h_0.unsqueeze(1), encoder_outputs).unsqueeze(1)
        # attn_weights = [batch_size, 1, batch_len]
        # Attention-weighted sum of the encoder outputs: [batch_size, 1, hidden_size]
        context = torch.bmm(attn_weights, encoder_outputs)

        # token_vec = [batch_size, 1, emb_dim]
        token_vec = self.dropout(self.embedding(y_0.unsqueeze(1)))

        # The GRU wants its state as [1, batch_size, hidden_size].
        _, state = self.rnn(torch.cat((token_vec, context), dim=-1), h_0.unsqueeze(0))
        new_hidden = state[-1]  # [batch_size, hidden_size]

        # Predict from the embedding, the new state and the context together:
        # features = [batch_size, emb_dim + hidden_size + hidden_size]
        features = torch.cat(
            (token_vec.squeeze(1), new_hidden, context.squeeze(1)), dim=1
        )
        return self.out(features), new_hidden

# **Seq2Seq**

In [16]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target with teacher forcing.

    Args:
        encoder: Callable ``(x, x_lengths) -> (encoder_outputs, hidden)``.
        decoder: Callable ``(y_0, h_0, encoder_outputs) -> (prediction, hidden)``
            exposing the prediction size as ``num_emb``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y):
        """Compute the predictions for every target position.

        Args:
            x: Source token ids, ``[batch_size, context_len]``.
            x_lengths: True length of every source sequence.
            y: Target token ids, ``[batch_size, target_len]``.

        Returns:
            ``(outputs, None)`` where ``outputs`` has shape
            ``[batch_size, target_len, num_emb]``.  Position 0 is left at
            zero because nothing is predicted for the first target token.
        """
        encoder_outputs, hidden = self.encoder(x, x_lengths)
        # hidden = [batch_size, hidden_size]; it seeds the decoder.

        batch_size, target_len = y.shape[0], y.shape[-1]
        # Allocate on the same device as the targets so the per-step
        # assignment below also works when the batch is not on the CPU.
        outputs = torch.zeros(
            batch_size, target_len, self.decoder.num_emb, device=y.device
        )

        for t in range(1, target_len):
            # Teacher forcing: the gold token of step t-1 is the input.  The
            # state returned by the previous step is fed back in; passing the
            # encoder state every time would make the decoder stateless.
            prediction, hidden = self.decoder(y[:, t - 1], hidden, encoder_outputs)
            # prediction = [batch_size, num_emb]
            outputs[:, t, :] = prediction
        return outputs, None

# **初始化**

In [17]:
# Assemble the network.  The construction order (encoder, attention, decoder)
# is kept because each constructor draws its initial weights from the global
# random generator.
encoder = Encoder(total_en, EMB_DIM, HIDDEN_SIZE, DROPOUT)
attn = Attention(hidden_size=HIDDEN_SIZE)
decoder = Decoder(total_cn, EMB_DIM, HIDDEN_SIZE, DROPOUT, attn)
model = Seq2Seq(encoder=encoder, decoder=decoder)

# Padding positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(model.parameters())

# **定义训练**

In [18]:
def train(model, dataloader, optimizer, criterion):
    """Run one optimisation pass over ``dataloader``.

    Every batch is a dict with the keys ``"context"``, ``"context_len"`` and
    ``"target"``.  The model is called as ``model(context, context_len,
    target)`` and must return ``(prediction, _)`` with ``prediction`` of shape
    ``[batch_size, target_len, num_emb]``.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0
    for batch in dataloader:
        gold = batch["target"]  # [batch_size, target_len]
        logits, _ = model(batch["context"], batch["context_len"], gold)

        # Drop position 0 (nothing is predicted for the first token) and
        # flatten batch and time into one axis for the criterion.
        flat_gold = gold.contiguous()[:, 1:].reshape(-1)
        flat_logits = logits[:, 1:, :].reshape(-1, logits.shape[-1])
        # flat_gold = [batch_size * (target_len - 1)]
        # flat_logits = [batch_size * (target_len - 1), num_emb]
        batch_loss = criterion(flat_logits, flat_gold)

        optimizer.zero_grad()
        batch_loss.backward()
        # Rescale gradients whose global norm exceeds 5.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(dataloader)

# **定义测试**

In [19]:
def evaluate(model, dataloader, criterion):
    """Compute the average batch loss over ``dataloader`` without training.

    The model is switched to evaluation mode and gradients are disabled.
    Batches and the model's return value have the same layout as in
    ``train``: dicts with ``"context"``, ``"context_len"`` and ``"target"``,
    and a ``(prediction, _)`` result.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in dataloader:
            gold = batch["target"]  # [batch_size, target_len]
            logits, _ = model(batch["context"], batch["context_len"], gold)

            # Skip position 0 and flatten batch and time into one axis.
            flat_gold = gold.contiguous()[:, 1:].reshape(-1)
            flat_logits = logits[:, 1:, :].reshape(-1, logits.shape[-1])
            # flat_gold = [batch_size * (target_len - 1)]
            # flat_logits = [batch_size * (target_len - 1), num_emb]
            batch_losses.append(criterion(flat_logits, flat_gold).item())
    return sum(batch_losses) / len(dataloader)

# **每个epoch所用时间**

In [25]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
# Train for EPOCHS passes over the training data, validating after each pass.
best_valid_loss = float("inf")
for epoch in range(EPOCHS):
    # The measured time covers both the training pass and the validation pass.
    start_time = time.time()
    train_loss = train(model,train_dataloader,optimizer,criterion)
    valid_loss = evaluate(model, valid_dataloader,criterion)
    end_time = time.time()
    epoch_mins,epoch_secs = epoch_time(start_time,end_time)

    # Checkpoint only when the validation loss improves, so model.pt always
    # holds the weights of the best epoch seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(),"./model.pt")

    # Perplexity (PPL) is reported as exp(loss).
    print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [None]:
# Restore the weights from model.pt (written by the training loop whenever the
# validation loss improved) and report loss and perplexity on the test split.
model.load_state_dict(torch.load('model.pt'))
test_loss = evaluate(model, test_dataloader, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')