## 说明

这是一个基于seq2seq的，将中文句子翻译成英文的Encoder-Decoder模型

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from torchtext.legacy.data import Field, TabularDataset, Iterator, Example

import spacy
import numpy as np

import random
import math
import time

import pandas as pd
import csv

import os

In [2]:
# 使用tensorboard 生成可视化界面
# 会在项目目录下生成 runs/ 文件夹, 使用命令 tensorboard --logdir=./runs/ 即可查看图表

summaryWriter = SummaryWriter()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')

In [3]:
print(torch.cuda.is_available())
print(torch.version.cuda)

True
11.1


## 数据集
使用的是paws-x数据集

https://github.com/google-research-datasets/paws/tree/master/pawsx

tsv使用 \t 作为分隔符，csv使用的是逗号

In [4]:
# 注：由于双引号在csv中有特殊含义，但语料数据中有些又有双引号，故需加上 quoting=csv.QUOTE_NONE 参数

# 训练数据集
if not os.path.exists(r'x-final\translated_zh2en_train.tsv'):
    src_zh_train = pd.read_csv(r'x-final\zh\translated_train.tsv', delimiter='\t', quoting=csv.QUOTE_NONE)
    trg_en_train = pd.read_csv(r'x-final\en\train.tsv', delimiter='\t', quoting=csv.QUOTE_NONE)

    # 将中文和英文的句子放在一个文件中便于后续处理
    src_zh_train["sentence3"] = trg_en_train['sentence1']
    src_zh_train["sentence4"] = trg_en_train['sentence2']
    src_zh_train = src_zh_train.dropna(how='any')[:49100]
    src_zh_train.to_csv(r'x-final\translated_zh2en_train.tsv', sep='\t', encoding='utf-8', index=False)

# 测试数据集
if not os.path.exists(r'x-final\translated_zh2en_test.tsv'):
    src_zh_test = pd.read_csv(r'x-final\zh\test_2k.tsv', delimiter='\t', quoting=csv.QUOTE_NONE)
    trg_en_test = pd.read_csv(r'x-final\en\test_2k.tsv', delimiter='\t', quoting=csv.QUOTE_NONE)
    src_zh_test["sentence3"] = trg_en_test['sentence1']
    src_zh_test["sentence4"] = trg_en_test['sentence2']
    src_zh_test = src_zh_test.dropna(how='any')
    src_zh_test.to_csv(r'x-final\translated_zh2en_test.tsv', sep='\t', encoding='utf-8', index=False)

### 分词器
例如 “我今天中午吃包子”，调用分词器就可以拆分得到 ['我','今天','中午','吃','包子']

对于英语来说，就直接是按空格分单词即可

为什么要分词？

因为对于一个句子来说，它是由词语而并非一个一个的汉字组成，例如上面例子中，若以汉字拆分就会将“天”这个字单独拆分出来，但事实上它必须和“今”组合在一起才能表示它在句中真正的意思

In [5]:
# 分词器
spacy_zh = spacy.load('zh_core_web_sm')
spacy_en = spacy.load('en_core_web_sm')
def tokenize_zh(text):
    tokens = list(map(lambda doc: doc.text, spacy_zh.tokenizer(text)))
    return tokens

def tokenize_en(text):
    tokens = list(map(lambda doc: doc.text, spacy_en.tokenizer(text)))
    return tokens

## 构建数据集对象

In [6]:
SRC = Field(tokenize=tokenize_zh, init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)
fields = [('id', None), ('sentence1', SRC), ('sentence2', None), ('label', None), ('sentence3', TRG), ('sentence4', None)]
train_dataset = TabularDataset(path=r'x-final\translated_zh2en_train.tsv', format='TSV',fields=fields, skip_header=True)
test_dataset = TabularDataset(path=r'x-final\translated_zh2en_test.tsv', format='TSV',fields=fields, skip_header=True)

In [7]:
len(train_dataset)

49100

### 构建词汇表

即将原本的单词或词语使用一个int的整数代替，便于后续处理

`min_freq = 2` 的意思是：只构建出现次数大于等于2的词汇，只出现一次的均使用`<unk>`(unknown)代替

构建之后，Field就有vocab属性了

In [8]:
SRC.build_vocab(train_dataset, min_freq = 2)
TRG.build_vocab(train_dataset, min_freq = 2)

In [9]:
print(f'src vocab len: {len(SRC.vocab)}')
print(f'trg vocab len: {len(TRG.vocab)}')

src vocab len: 36673
trg vocab len: 28660


## 构建模型
seq2seq模型，也是encoding-decoding 模型

### Encoder
Encoder的输入是整个句子，他会自动完成时序训练。最终输出一个向量（lstm是两个，但可以将它们合起来看成一个），该向量就可以看成是对整个句子的抽象，即：它里面包含了`整个`句子的特征信息

### Decoder
Decoder首先需要Encoder的输出作为一个输入（lstm是两个），另外还需要将每次上次时序的output作为下一次的一个输入

### Embedding
embedding层将原本的一个单词（事实上是单词的一个int类型的索引）转换为一个向量。

为什么要这样做？

一个单词或一个词语，例如“番茄”和“西红柿”，它俩的索引值肯定完全不同，但是可以通过训练得到一个相同的向量用以说明它们是同一个东西

nn.Embedding()的第一个参数是有多少单词需要转换，第二个参数是每个单词转换成多少维的向量

### Seq2Seq
需要说明的是：
1. 对于Encoder是直接将整个完整的句子（即拆分后的词组）传进去的，它自己内部会一个一个词地送给LSTM模型。但对于Decoder来说，我们就需要手动将输入一个一个遍历送进LSTM，因为Decoder的input是上一个时序的output
2. 上面说的“Decoder的input是上一个时序的output”，但实际训练过程中，会按照一定概率将“上一个时序的output”替换成ground truth，即真实的值（上一个时序的output是预测值）。这种做法叫做 `teacher force`

In [10]:
class Encoder(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.input_dim = input_dim
        self.embedding = nn.Embedding(num_embeddings=self.input_dim, embedding_dim=256)
        self.lstm = nn.LSTM(input_size=256, hidden_size=512, num_layers=1)
    def forward(self, x):
        x = self.embedding(x)
        output, (h_n, c_n) = self.lstm(x)
        return h_n, c_n

class Decoder(nn.Module):
    def __init__(self, output_dim):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(num_embeddings=self.output_dim, embedding_dim=256)
        self.lstm = nn.LSTM(input_size=256, hidden_size=512, num_layers=1)
        self.fc = nn.Linear(512, self.output_dim)
    def forward(self, x, h_n, c_n):
        # 对于Decoder来说，它的输入只是一个一个的单词（单个元素）而并非一整个句子（数组），故需要在外部增加一个维度，表示它是一个长度为1的句子
        x = x.unsqueeze(0)
        x = self.embedding(x)
        output, (h_n, c_n) = self.lstm(x, (h_n, c_n))
        
        output = output.squeeze(0)
        output = self.fc(output)
        return output, (h_n, c_n)
    
class TranslateModule(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    def forward(self, x, y):
        batch_size = y.shape[1]
        output_dim = self.decoder.output_dim
        
        res = torch.zeros(len(y), batch_size, output_dim).to(device) # some times mem out, depend on len(y) & batch_size
        
        h_n, c_n = self.encoder(x)
        next_word = y[0, :]
        for i in range(1, len(y)):
            output, (h_n, c_n) = self.decoder(next_word, h_n, c_n)
            res[i] = output
            next_word = output.argmax(1)
        return res

## 模型训练

In [11]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

epoch = 5
# 11*3*3*499 = 49401
# 73*673 = 49129
batch_size = 50

encoder_model = Encoder(INPUT_DIM)
decoder_model = Decoder(OUTPUT_DIM)
model = TranslateModule(encoder_model, decoder_model).to(device)

optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss()

In [12]:
count_train = 0
print(f'Start at {time.ctime()}')
for e in range(epoch):
    print(f'epoch: {e}')
    # 训练
    model.train()
    for idx, i in enumerate(Iterator(dataset=train_dataset, batch_size=batch_size)):
        count_train+=1
        optimizer.zero_grad()
        x = i.sentence1.to(device)
        y = i.sentence3.to(device)
        pred = model(x, y)
        loss = loss_fn(pred.permute(1,2,0), y.permute(1,0))
        summaryWriter.add_scalar(r'Loss/train', loss.item(), count_train)
        loss.backward()
        optimizer.step()
        if idx % int((len(train_dataset)/batch_size)/10) == 0:
            print(f'{time.ctime()} {count_train} of {int(len(train_dataset)/batch_size)}, cuda_mem: {torch.cuda.memory_allocated()}, max_cuda_mem: {torch.cuda.max_memory_allocated()}, reserved: {torch.cuda.memory_reserved()}')
    # 测试
    model.eval()
    with torch.no_grad():
        loss_sum = 0
        for idx, i in enumerate(Iterator(dataset=test_dataset, batch_size=batch_size)):
            x = i.sentence1.to(device)
            y = i.sentence3.to(device)
            pred = model(x, y)
            loss = loss_fn(pred.permute(1,2,0), y.permute(1,0))
            loss_sum += loss.item()
        summaryWriter.add_scalar(r'Loss/test', loss_sum/int((len(test_dataset)/batch_size)), e)

torch.save(model.state_dict(), 'translate.pth')
print(f'Done at {time.ctime()}')

Start at Sat Jul 16 11:10:07 2022
epoch: 0
Sat Jul 16 11:10:12 2022 1 of 982, cuda_mem: 737880064, max_cuda_mem: 921824768, reserved: 1151336448
Sat Jul 16 11:16:34 2022 99 of 982, cuda_mem: 737881600, max_cuda_mem: 1490021376, reserved: 2881486848
Sat Jul 16 11:22:52 2022 197 of 982, cuda_mem: 743612416, max_cuda_mem: 1490021376, reserved: 2881486848
Sat Jul 16 11:29:06 2022 295 of 982, cuda_mem: 743614464, max_cuda_mem: 1490021376, reserved: 2881486848
Sat Jul 16 11:35:02 2022 393 of 982, cuda_mem: 755079168, max_cuda_mem: 1490021376, reserved: 2881486848
Sat Jul 16 11:40:33 2022 491 of 982, cuda_mem: 749347328, max_cuda_mem: 1490021376, reserved: 2881486848
Sat Jul 16 11:46:00 2022 589 of 982, cuda_mem: 743611904, max_cuda_mem: 1490021376, reserved: 2881486848
Sat Jul 16 11:51:31 2022 687 of 982, cuda_mem: 755078144, max_cuda_mem: 1490021376, reserved: 2881486848
Sat Jul 16 11:57:04 2022 785 of 982, cuda_mem: 737880064, max_cuda_mem: 1510753280, reserved: 2074083328
Sat Jul 16 12:02

## 模型的使用

In [13]:
@torch.no_grad()
def translate(sentence, max_len):
    model.eval()
    
    # 分词并数字化
    sentence = torch.tensor(list(map(lambda w: SRC.vocab.stoi[w], tokenize_zh(sentence))))
    
    # 翻译
    sentence = sentence.unsqueeze(1).to(device)
    
    # 编码
    h_n, c_n = model.encoder(sentence)
    
    # 解码
    target = ""
    next_word = torch.tensor([SRC.vocab.stoi[SRC.init_token]]).to(device)
    for i in range(max_len):
        output, (h_n, c_n) = model.decoder(next_word, h_n, c_n)
        next_word = output.argmax(1)
        real_word = TRG.vocab.itos[next_word[0] if next_word[0] < OUTPUT_DIM else 0]
        target += (' ' + real_word)
        if real_word == TRG.eos_token:
            break
    return target

In [17]:
translate('Bouchier与Dorothy Britton结婚，后者将一些日本书籍翻译成英文。', 80)

' he married with in the , , with , , , , , " " " . . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'