In [2]:
import math  # 导入 math 模块，用于数学函数
import torchtext  # 导入 torchtext 库，用于文本处理工具
import torch  # 导入 PyTorch 深度学习框架
import torch.nn as nn  # 从 PyTorch 导入神经网络模块
from torch import Tensor  # 从 torch 导入 Tensor 类
from torch.nn.utils.rnn import pad_sequence  # 导入用于批量填充序列的函数
from torch.utils.data import DataLoader  # 从 PyTorch 导入 DataLoader 类，用于处理数据集
from collections import Counter  # 从 collections 模块导入 Counter 类，用于计数可哈希对象
from torchtext.vocab import Vocab  # 从 torchtext.vocab 导入 Vocab 类，用于处理词汇表
from torch.nn import TransformerEncoder, TransformerDecoder, TransformerEncoderLayer, TransformerDecoderLayer  # 从 PyTorch 导入 Transformer 模型架构的类和函数
import io  # 导入 io 模块，用于处理流
import time  # 导入 time 模块，用于处理时间相关的函数
import pandas as pd  # 导入 pandas 库，用于数据处理和分析
import numpy as np  # 导入 numpy 库，用于数值操作
import pickle  # 导入 pickle 模块，用于序列化和反序列化 Python 对象
import tqdm  # 导入 tqdm 库，用于显示进度条
import sentencepiece as spm  # 导入 sentencepiece 库，用于分词
 
# Fix the global PyTorch RNG seed so repeated runs are reproducible.
torch.manual_seed(0)
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
device  # Display the selected device to check whether CUDA is in use

device(type='cuda')

In [4]:
# Load the parallel corpus: a tab-separated text file with no header row.
df = pd.read_csv('zh-ja.bicleaner05.txt', sep='\\t', engine='python', header=None)
trainen = df[2].values.tolist()  # Column 2; despite the "en" in the name, the sample printed below is Chinese text
trainja = df[3].values.tolist()  # Column 3; the Japanese side of each sentence pair

# Remove the pair at list index 5972 from both lists so they stay aligned.
# NOTE(review): why this particular row is dropped is not shown here — presumably it is malformed; confirm.
trainen.pop(5972)
trainja.pop(5972)

'2014年と2017年のサンデータイムズ紙によってイギリス国内で生活に最も適した街と名付けられ、またヨーロッパグリーンキャピタルの賞も受賞しています。'

In [5]:
# Print the Chinese side of sample pair 500.
print(trainen[500])
# Print the Japanese side of the same pair.
print(trainja[500])

Chinese HS Code Harmonized Code System < HS编码 2905 无环醇及其卤化、磺化、硝化或亚硝化衍生物 HS Code List (Harmonized System Code) for US, UK, EU, China, India, France, Japan, Russia, Germany, Korea, Canada ...
Japanese HS Code Harmonized Code System < HSコード 2905 非環式アルコール並びにそのハロゲン化誘導体、スルホン化誘導体、ニトロ化誘導体及びニトロソ化誘導体 HS Code List (Harmonized System Code) for US, UK, EU, China, India, France, Japan, Russia, Germany, Korea, Canada ...


In [6]:
# Load the SentencePiece model applied to the `trainen` side (the Chinese text in this notebook).
# NOTE(review): the model file is named "spm.en..." — confirm it is the intended tokenizer for Chinese input.
en_tokenizer = spm.SentencePieceProcessor(model_file='spm.en.nopretok.model')
# Load the SentencePiece model applied to the Japanese side.
ja_tokenizer = spm.SentencePieceProcessor(model_file='spm.ja.nopretok.model')

In [7]:
# Round-trip demo: encode an English sentence to piece ids, then decode it back to text.
encoded_sentence = en_tokenizer.encode("All residents aged 20 to 59 years who live in Japan must enroll in public pension system.")
print(encoded_sentence)
decoded_sentence = en_tokenizer.decode(encoded_sentence)
print(decoded_sentence)
print('-'*50)
# Same round trip for a Japanese sentence with the Japanese tokenizer.
encoded_sentence = ja_tokenizer.encode("年金 日本に住んでいる20歳~60歳の全ての人は、公的年金制度に加入しなければなりません。")
print(encoded_sentence)
decoded_sentence = ja_tokenizer.decode(encoded_sentence)
print(decoded_sentence)

[335, 2728, 8692, 491, 7, 8446, 151, 87, 593, 11, 68, 167, 13797, 11, 412, 6895, 98, 5]
All residents aged 20 to 59 years who live in Japan must enroll in public pension system.
--------------------------------------------------
[4, 6866, 714, 12628, 210, 1550, 306, 1077, 5231, 1092, 830, 3, 7503, 6866, 786, 10, 8556, 4600, 5]
年金 日本に住んでいる20歳~60歳の全ての人は、公的年金制度に加入しなければなりません。


In [9]:
# Show the installed torchtext version (the recorded output is 0.6.0).
# NOTE(review): build_vocab calls Vocab(counter, specials=...), which presumably requires this legacy torchtext API — confirm before upgrading.
print(torchtext.__version__)

0.6.0


In [11]:
# Build a vocabulary from tokenized sentences.
def build_vocab(sentences, tokenizer):
    """Count sub-word pieces across ``sentences`` and wrap the counts in a ``Vocab``.

    Args:
        sentences: Iterable of raw text strings.
        tokenizer: Object whose ``encode(text, out_type=str)`` returns the
            string pieces of ``text``.

    Returns:
        A ``Vocab`` built from the piece frequencies, created with the special
        tokens ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>``.
    """
    piece_counts = Counter(
        piece
        for text in sentences
        for piece in tokenizer.encode(text, out_type=str)
    )
    return Vocab(piece_counts, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
# Build one vocabulary per side with build_vocab: Japanese (trainja) and the `trainen` side.
ja_vocab = build_vocab(trainja, ja_tokenizer)
en_vocab = build_vocab(trainen, en_tokenizer)

In [12]:
# Convert raw sentence pairs into tensors of vocabulary indices.
def data_process(ja, en):
    """Tokenize and index parallel sentences.

    Args:
        ja: Iterable of Japanese sentences.
        en: Iterable of the paired `en`-side sentences, aligned with ``ja``.

    Returns:
        A list of ``(ja_tensor, en_tensor)`` tuples of ``torch.long`` index
        tensors, one tuple per sentence pair. Pairing stops at the shorter of
        the two inputs because the sentences are combined with ``zip``.
    """
    def to_ids(text, tokenizer, vocab):
        # Strip trailing newlines, split into pieces, then look each piece up.
        pieces = tokenizer.encode(text.rstrip("\n"), out_type=str)
        return torch.tensor([vocab[piece] for piece in pieces], dtype=torch.long)

    return [
        (to_ids(raw_ja, ja_tokenizer, ja_vocab), to_ids(raw_en, en_tokenizer, en_vocab))
        for raw_ja, raw_en in zip(ja, en)
    ]
# Convert the training sentences (trainja, trainen) into index tensors and keep them in train_data.
train_data = data_process(trainja, trainen)

In [13]:
# Batch size and special-token indices.
BATCH_SIZE = 8  # The DataLoader created later in this cell is built with this value
# The special indices are looked up in ja_vocab only, yet generate_batch applies them to both
# sides; this presumably relies on both vocabularies giving the specials the same indices — confirm.
PAD_IDX = ja_vocab['<pad>']
BOS_IDX = ja_vocab['<bos>']
EOS_IDX = ja_vocab['<eos>']
 
# Collate function: wrap every sequence in <bos>/<eos> and pad to a common length.
def generate_batch(data_batch):
    """Turn a list of ``(ja_tensor, en_tensor)`` pairs into two padded batches.

    Each sequence gets ``BOS_IDX`` prepended and ``EOS_IDX`` appended; the
    sequences of each side are then padded with ``PAD_IDX`` by
    ``pad_sequence`` (default layout, i.e. sequence dimension first).

    Args:
        data_batch: Iterable of ``(ja_item, en_item)`` 1-D index tensors.

    Returns:
        ``(ja_batch, en_batch)`` padded tensors.
    """
    bos = torch.tensor([BOS_IDX])
    eos = torch.tensor([EOS_IDX])

    def wrap(seq):
        return torch.cat([bos, seq, eos], dim=0)

    wrapped = [(wrap(ja_item), wrap(en_item)) for ja_item, en_item in data_batch]
    ja_padded = pad_sequence([ja_seq for ja_seq, _ in wrapped], padding_value=PAD_IDX)
    en_padded = pad_sequence([en_seq for _, en_seq in wrapped], padding_value=PAD_IDX)
    return ja_padded, en_padded
# Build the training DataLoader: shuffled batches, collated (wrapped and padded) by generate_batch.
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,shuffle=True, collate_fn=generate_batch)

# Transformer模型
Transformer 是论文《Attention Is All You Need》中提出的 Seq2Seq 模型，用于解决机器翻译任务。它由编码器和解码器两部分组成，每一部分都由固定数量的层堆叠而成。

In [14]:
from torch.nn import (TransformerEncoder, TransformerDecoder,
                      TransformerEncoderLayer, TransformerDecoderLayer)
 
 
# Seq2Seq Transformer model definition.
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target-vocabulary logits.

    Args:
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        emb_size: Embedding width, also used as ``d_model`` of every layer.
        src_vocab_size: Number of rows of the source token embedding.
        tgt_vocab_size: Number of rows of the target token embedding and
            width of the output projection.
        dim_feedforward: Hidden width of the feed-forward sub-layers.
        dropout: Dropout probability used by the positional encoding and by
            every encoder/decoder layer.
        nhead: Number of attention heads. When omitted (``None``), the
            module-level ``NHEAD`` constant is read at construction time, so
            callers that never pass ``nhead`` behave as before.
    """

    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, src_vocab_size: int, tgt_vocab_size: int,
                 dim_feedforward: int = 512, dropout: float = 0.1,
                 nhead=None):
        super(Seq2SeqTransformer, self).__init__()

        # Backward-compatible fallback to the notebook-level constant.
        if nhead is None:
            nhead = NHEAD

        # Encoder stack. `dropout` is forwarded so the constructor argument
        # actually controls the layers (it used to reach only the positional
        # encoding, leaving the layers on their library default).
        encoder_layer = TransformerEncoderLayer(d_model=emb_size, nhead=nhead,
                                                dim_feedforward=dim_feedforward,
                                                dropout=dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

        # Decoder stack, configured the same way.
        decoder_layer = TransformerDecoderLayer(d_model=emb_size, nhead=nhead,
                                                dim_feedforward=dim_feedforward,
                                                dropout=dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

        # Output projection from model width to target-vocabulary logits.
        self.generator = nn.Linear(emb_size, tgt_vocab_size)

        # Source and target token embeddings.
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)

        # Positional encoding shared by both sides.
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder pass and return target-vocabulary logits.

        Args:
            src: Source token ids (sequence-first, as produced by generate_batch).
            tgt: Target token ids fed to the decoder (same layout).
            src_mask: Attention mask for the encoder self-attention.
            tgt_mask: Attention mask for the decoder self-attention.
            src_padding_mask: Key-padding mask for the source batch.
            tgt_padding_mask: Key-padding mask for the target batch.
            memory_key_padding_mask: Key-padding mask applied to the encoder
                output inside the decoder.

        Returns:
            The decoder output projected by ``self.generator``.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))

        memory = self.transformer_encoder(src_emb, mask=src_mask,
                                          src_key_padding_mask=src_padding_mask)

        # No mask is applied to the encoder-decoder attention itself
        # (memory_mask=None); padding is handled via the key-padding masks.
        outs = self.transformer_decoder(tgt_emb, memory,
                                        tgt_mask=tgt_mask,
                                        memory_mask=None,
                                        tgt_key_padding_mask=tgt_padding_mask,
                                        memory_key_padding_mask=memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed ``src`` and return the encoder output (no padding mask is applied)."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer_encoder(src_emb, src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed ``tgt`` and return the decoder output given the encoder ``memory``.

        The result is the decoder hidden state; callers apply ``self.generator``
        themselves to obtain logits.
        """
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer_decoder(tgt_emb, memory, tgt_mask)

In [15]:
# Sinusoidal positional encoding.
class PositionalEncoding(nn.Module):
    """Add fixed sine/cosine position signals to token embeddings, then apply dropout.

    Args:
        emb_size: Embedding width; even columns receive sine values and odd
            columns cosine values.
        dropout: Dropout probability applied to the summed embeddings.
        maxlen: Number of positions precomputed in the table.
    """

    def __init__(self, emb_size: int, dropout, maxlen: int = 5000):
        super().__init__()
        # One inverse frequency per (sin, cos) column pair: 10000^(-2i / emb_size).
        even_dims = torch.arange(0, emb_size, 2)
        inv_freq = torch.exp(-even_dims * math.log(10000) / emb_size)
        positions = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = positions * inv_freq

        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        # Insert a singleton axis in the middle -> (maxlen, 1, emb_size) so the
        # table broadcasts over the second dimension of the input.
        table = table.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        # A buffer follows the module across devices but is not a trainable parameter.
        self.register_buffer('pos_embedding', table)

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + positions)``.

        Only the first ``token_embedding.size(0)`` rows of the table are used,
        so dimension 0 of the input is treated as the sequence position.
        """
        seq_len = token_embedding.size(0)
        return self.dropout(token_embedding + self.pos_embedding[:seq_len, :])
 
 
# Token embedding with sqrt(emb_size) scaling.
class TokenEmbedding(nn.Module):
    """Look up token embeddings and scale them by ``sqrt(emb_size)``.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Width of each embedding vector.
    """

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Return the embeddings of ``tokens`` (cast to long) times ``sqrt(emb_size)``."""
        scale = math.sqrt(self.emb_size)
        looked_up = self.embedding(tokens.long())
        return looked_up * scale

In [16]:
# Causal (look-ahead) mask for the decoder.
def generate_square_subsequent_mask(sz):
    """Return an ``(sz, sz)`` float mask that blocks attention to later positions.

    Entries on and below the diagonal are ``0.0``; entries strictly above the
    diagonal are ``-inf``. The tensor is created on the module-level ``device``.
    """
    blocked = torch.full((sz, sz), float('-inf'), device=device)
    # Keep -inf only strictly above the diagonal; everything else becomes 0.
    return torch.triu(blocked, diagonal=1)
 
# Build the attention and padding masks for one batch.
def create_mask(src, tgt):
    """Create the four masks passed to the model for a ``(src, tgt)`` batch.

    Args:
        src: Source index tensor; dimension 0 is used as the sequence length.
        tgt: Target index tensor; dimension 0 is used as the sequence length.

    Returns:
        ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)`` where
        ``src_mask`` is an all-False boolean square matrix, ``tgt_mask`` comes
        from ``generate_square_subsequent_mask``, and the padding masks are
        True where the token equals ``PAD_IDX``, with dimensions 0 and 1 swapped.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    causal_tgt_mask = generate_square_subsequent_mask(tgt_len)
    # All-False mask: no source position is hidden from any other.
    open_src_mask = torch.zeros((src_len, src_len), dtype=torch.bool, device=device)

    src_pad = (src == PAD_IDX).transpose(0, 1)
    tgt_pad = (tgt == PAD_IDX).transpose(0, 1)
    return open_src_mask, causal_tgt_mask, src_pad, tgt_pad

In [17]:
# Hyperparameters and construction of the Seq2SeqTransformer instance.
SRC_VOCAB_SIZE = len(ja_vocab)  # source side: Japanese vocabulary
TGT_VOCAB_SIZE = len(en_vocab)  # target side: the `trainen` vocabulary
EMB_SIZE = 512
NHEAD = 8  # read as a global inside Seq2SeqTransformer.__init__
FFN_HID_DIM = 512
# NOTE(review): train_iter was already created with BATCH_SIZE = 8 in an earlier cell, so this
# reassignment does not change the batch size of the DataLoader used for training here.
BATCH_SIZE = 16
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
NUM_EPOCHS = 16
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)

# Xavier-uniform initialise every parameter with more than one dimension;
# 1-D parameters keep their default initialisation.
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(device)

# Loss and optimizer: cross-entropy that ignores padded target positions, optimised with Adam.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(
    transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9
)
 
# Run one training epoch.
def train_epoch(model, train_iter, optimizer):
    """Train ``model`` for one pass over ``train_iter`` and return the mean batch loss.

    Args:
        model: Callable taking ``(src, tgt_input, src_mask, tgt_mask,
            src_padding_mask, tgt_padding_mask, memory_key_padding_mask)``.
        train_iter: Sized iterable of ``(src, tgt)`` batches.
        optimizer: Optimizer updating ``model``'s parameters.

    Returns:
        Sum of the per-batch losses divided by ``len(train_iter)``.
    """
    model.train()
    running_loss = 0
    for src, tgt in train_iter:
        src, tgt = src.to(device), tgt.to(device)

        # Teacher forcing: the decoder sees the target without its last step
        # and is trained to predict the target shifted by one step.
        decoder_in = tgt[:-1, :]
        expected = tgt[1:, :]

        src_mask, tgt_mask, src_pad_mask, tgt_pad_mask = create_mask(src, decoder_in)
        # The source padding mask is passed a second time as the memory key-padding mask.
        logits = model(src, decoder_in, src_mask, tgt_mask,
                       src_pad_mask, tgt_pad_mask, src_pad_mask)

        optimizer.zero_grad()
        batch_loss = loss_fn(logits.reshape(-1, logits.shape[-1]), expected.reshape(-1))
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(train_iter)
 
# Evaluation loop.
def evaluate(model, val_iter):
    """Compute the mean batch loss of ``model`` over ``val_iter`` without updating it.

    Args:
        model: Callable taking ``(src, tgt_input, src_mask, tgt_mask,
            src_padding_mask, tgt_padding_mask, memory_key_padding_mask)``.
        val_iter: Sized iterable of ``(src, tgt)`` validation batches.

    Returns:
        Sum of the per-batch losses divided by ``len(val_iter)``.
    """
    model.eval()
    losses = 0
    # Gradients are not needed for evaluation; disabling them avoids building
    # the autograd graph for every batch.
    with torch.no_grad():
        # Iterate over the `val_iter` argument (the loop previously referenced
        # an undefined global name and raised NameError).
        for src, tgt in val_iter:
            src = src.to(device)
            tgt = tgt.to(device)

            # Decoder input is the target without its last step; the expected
            # output is the target shifted by one step.
            tgt_input = tgt[:-1, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask,
                           src_padding_mask, tgt_padding_mask, src_padding_mask)
            tgt_out = tgt[1:, :]
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()
    return losses / len(val_iter)



In [19]:
# Train for NUM_EPOCHS epochs, printing each epoch's mean training loss and wall-clock time.
# Only train_epoch is called here; no validation pass is run inside this loop.
for epoch in tqdm.tqdm(range(1, NUM_EPOCHS+1)):
    start_time = time.time()
    train_loss = train_epoch(transformer, train_iter, optimizer)
    end_time = time.time()
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, "
          f"Epoch time = {(end_time - start_time):.3f}s"))

  6%|▋         | 1/16 [04:35<1:08:54, 275.64s/it]

Epoch: 1, Train loss: 3.737, Epoch time = 275.638s


 12%|█▎        | 2/16 [09:11<1:04:19, 275.66s/it]

Epoch: 2, Train loss: 3.237, Epoch time = 275.676s


 19%|█▉        | 3/16 [13:46<59:42, 275.60s/it]  

Epoch: 3, Train loss: 2.902, Epoch time = 275.522s


 25%|██▌       | 4/16 [18:22<55:05, 275.44s/it]

Epoch: 4, Train loss: 2.652, Epoch time = 275.202s


 31%|███▏      | 5/16 [22:56<50:27, 275.19s/it]

Epoch: 5, Train loss: 2.464, Epoch time = 274.730s


 38%|███▊      | 6/16 [27:31<45:51, 275.20s/it]

Epoch: 6, Train loss: 2.332, Epoch time = 275.215s


 44%|████▍     | 7/16 [32:06<41:15, 275.06s/it]

Epoch: 7, Train loss: 2.237, Epoch time = 274.774s


 50%|█████     | 8/16 [36:40<36:37, 274.71s/it]

Epoch: 8, Train loss: 2.147, Epoch time = 273.961s


 56%|█████▋    | 9/16 [41:13<31:58, 274.10s/it]

Epoch: 9, Train loss: 2.070, Epoch time = 272.750s


 62%|██████▎   | 10/16 [45:46<27:22, 273.71s/it]

Epoch: 10, Train loss: 2.004, Epoch time = 272.851s


 69%|██████▉   | 11/16 [50:19<22:48, 273.68s/it]

Epoch: 11, Train loss: 1.950, Epoch time = 273.592s


 75%|███████▌  | 12/16 [54:53<18:14, 273.51s/it]

Epoch: 12, Train loss: 1.901, Epoch time = 273.125s


 81%|████████▏ | 13/16 [59:25<13:39, 273.26s/it]

Epoch: 13, Train loss: 1.856, Epoch time = 272.683s


 88%|████████▊ | 14/16 [1:03:58<09:06, 273.04s/it]

Epoch: 14, Train loss: 1.818, Epoch time = 272.549s


 94%|█████████▍| 15/16 [1:08:30<04:32, 272.89s/it]

Epoch: 15, Train loss: 1.781, Epoch time = 272.545s


100%|██████████| 16/16 [1:13:03<00:00, 273.96s/it]

Epoch: 16, Train loss: 1.748, Epoch time = 272.553s





In [20]:
# Greedy decoding.
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Generate a target sequence one token at a time, always taking the top-scoring token.

    Args:
        model: Object exposing ``encode(src, src_mask)``,
            ``decode(ys, memory, tgt_mask)`` and ``generator(hidden)``.
        src: Source index tensor with a single sentence in the second dimension.
        src_mask: Attention mask for the encoder.
        max_len: Maximum length of the returned sequence, start symbol included.
        start_symbol: Index used as the first target token.

    Returns:
        A ``(length, 1)`` tensor of token indices beginning with
        ``start_symbol``. Generation stops after ``EOS_IDX`` is emitted or
        when ``max_len`` tokens have been produced.
    """
    src = src.to(device)
    src_mask = src_mask.to(device)
    # Inference only: skip autograd bookkeeping for the whole decode.
    with torch.no_grad():
        # The encoder output does not change between steps, so compute and
        # place it on the device once.
        memory = model.encode(src, src_mask).to(device)
        ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=device)
        for _ in range(max_len - 1):
            # Boolean causal mask: True marks positions that must not be attended to.
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(device)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Score the vocabulary from the hidden state of the last generated position.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()
            next_token = torch.full((1, 1), next_word, dtype=src.dtype, device=src.device)
            ys = torch.cat([ys, next_token], dim=0)
            if next_word == EOS_IDX:
                break
    return ys
 
# Translate a single sentence.
def translate(model, src, src_vocab, tgt_vocab, src_tokenizer):
    """Translate the raw string ``src`` with greedy decoding.

    Args:
        model: Trained model; switched to eval mode before decoding.
        src: Source sentence as a plain string.
        src_vocab: Source vocabulary providing ``stoi`` (piece -> index).
        tgt_vocab: Target vocabulary providing ``itos`` (index -> piece).
        src_tokenizer: Tokenizer whose ``encode(text, out_type=str)`` yields pieces.

    Returns:
        The decoded target pieces joined by single spaces, with the literal
        ``<bos>`` and ``<eos>`` markers removed (the surrounding spaces stay).
    """
    model.eval()
    pieces = src_tokenizer.encode(src, out_type=str)
    token_ids = [BOS_IDX, *(src_vocab.stoi[piece] for piece in pieces), EOS_IDX]
    n_tokens = len(token_ids)

    # Column vector: one sentence, sequence dimension first.
    src_tensor = torch.LongTensor(token_ids).reshape(n_tokens, 1)
    # All-False mask: every source position may attend to every other.
    attend_all = torch.zeros(n_tokens, n_tokens).type(torch.bool)

    # Allow the output to be at most five tokens longer than the input.
    decoded = greedy_decode(model, src_tensor, attend_all,
                            max_len=n_tokens + 5, start_symbol=BOS_IDX).flatten()
    words = [tgt_vocab.itos[tok] for tok in decoded]
    return " ".join(words).replace("<bos>", "").replace("<eos>", "")

In [23]:
# Translate a sample Japanese sentence with the trained model; the returned string is the cell output.
translate(transformer, "HSコード 8515 はんだ付け用、ろう付け用又は溶接用の機器(電気式(電気加熱ガス式を含む。)", ja_vocab, en_vocab, ja_tokenizer)

' ▁HS 编 码 ▁85 15 ▁ 电 气 式 ( 包 括 电 气 加 热 气 体 ) 。 '

In [24]:
# Persist both vocabularies so the same token <-> index mapping can be reloaded later.
# Each file is opened in a `with` block so its handle is closed even if pickle.dump raises.
for vocab_path, vocab in (('en_vocab.pkl', en_vocab), ('ja_vocab.pkl', ja_vocab)):
    with open(vocab_path, 'wb') as vocab_file:
        pickle.dump(vocab, vocab_file)

In [25]:
# Save only the model weights (state_dict) for later inference.
torch.save(transformer.state_dict(), 'inference_model')

In [26]:
# Save a full checkpoint (epoch count, model weights, optimizer state, loss) so training can be resumed later.
torch.save({
  'epoch': NUM_EPOCHS,  # configured number of epochs
  'model_state_dict': transformer.state_dict(),
  'optimizer_state_dict': optimizer.state_dict(),
  'loss': train_loss,  # value left by the training loop, i.e. the last epoch's mean training loss
  }, 'model_checkpoint.tar')