In [None]:
import transformers
from tqdm import tqdm
import math
import torch
import torch.nn as nn
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

zh_tokenizer = transformers.AutoTokenizer.from_pretrained("hfl/chinese-xlnet-base")
en_tokenizer = transformers.AutoTokenizer.from_pretrained("xlnet-base-cased")

In [None]:
zh_tokenizer.convert_tokens_to_ids("<pad>"), en_tokenizer.convert_tokens_to_ids("<pad>")

In [4]:
class EntityDataset:
    def __init__(self, src, trg):
        self.src = src
        self.trg = trg
        self.MAX_LEN = 128
    
    def __len__(self):
        return len(self.src)
    
    def __getitem__(self, item):
        src = self.src[item]
        trg = self.trg[item]

        mask = [1] * len(src)

        src_padding_len = self.MAX_LEN - len(src)
        trg_padding_len = self.MAX_LEN - len(trg)

        src = src + ([5] * src_padding_len) 
        trg = trg + ([5] * trg_padding_len)

        return {
            "src": torch.tensor(src, dtype=torch.long),
            "trg": torch.tensor(trg, dtype=torch.long),
        }

def get_data_loader(en_path, zh_path):
    src_ids = []
    trg_ids = []

    with open(en_path) as src:
        data = src.readlines()
        for ids in tqdm(data):
            ids_list = ids.split(" ")[:-1]
            ids_list = [int(ids_list[i]) for i in range(len(ids_list))]
            src_ids.append(ids_list)

    with open(zh_path) as src:
        data = src.readlines()
        for ids in tqdm(data):
            ids_list = ids.split(" ")[:-1]
            ids_list = [int(ids_list[i]) for i in range(len(ids_list))]
            trg_ids.append(ids_list)
    return src_ids, trg_ids

In [5]:
class PositionalEncoding(nn.Module):
    def __init__(self, dim_model, max_len):
        super().__init__()
        pos_encoding = torch.zeros(max_len, dim_model)
        positions_list = torch.arange(0, max_len, dtype=torch.float).view(-1, 1) # 0, 1, 2, 3, 4, 5
        division_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-math.log(10000.0)) / dim_model) # 1000^(2i/dim_model)
        
        # PE(pos, 2i) = sin(pos/1000^(2i/dim_model))
        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)
        
        # PE(pos, 2i + 1) = cos(pos/1000^(2i/dim_model))
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term)
        
        # Saving buffer (same as parameter without gradients needed)
        pos_encoding = pos_encoding.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pos_encoding",pos_encoding)
        
    def forward(self, token_embedding: torch.tensor) -> torch.tensor:
        return self.pos_encoding[:token_embedding.size(0), :]

In [6]:
class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = PositionalEncoding(embedding_size, max_len)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = PositionalEncoding(embedding_size, max_len)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        src_mask = src.transpose(0, 1) == self.src_pad_idx

        # (N, src_len)
        return src_mask.to(self.device)

    def forward(self, src, trg):
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        word_src = self.src_word_embedding(src)
        word_trg = self.trg_word_embedding(trg)

        embed_src = self.dropout((word_src + self.src_position_embedding(src)))
        embed_trg = self.dropout((word_trg + self.trg_position_embedding(trg)))

        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        out = self.fc_out(out)
        return out


In [7]:
train_en, train_zh = get_data_loader("/content/drive/MyDrive/NLP/MT/train.en", "/content/drive/MyDrive/NLP/MT/train.zh")
valid_en, valid_zh = get_data_loader("/content/drive/MyDrive/NLP/MT/valid.en", "/content/drive/MyDrive/NLP/MT/valid.zh")
test_en, test_zh = get_data_loader("/content/drive/MyDrive/NLP/MT/test.en", "/content/drive/MyDrive/NLP/MT/test.zh")

100%|██████████| 279916/279916 [00:02<00:00, 104752.96it/s]
100%|██████████| 279916/279916 [00:03<00:00, 87830.04it/s]
100%|██████████| 15512/15512 [00:00<00:00, 59045.81it/s]
100%|██████████| 15512/15512 [00:00<00:00, 51645.07it/s]
100%|██████████| 15626/15626 [00:00<00:00, 54216.89it/s]
100%|██████████| 15626/15626 [00:00<00:00, 63505.97it/s]


In [8]:
Batch_size = 32

train_dataset = EntityDataset(train_en, train_zh)
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=Batch_size)

valid_dataset = EntityDataset(valid_en, valid_zh)
valid_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=Batch_size)

test_dataset = EntityDataset(test_en, test_zh)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=Batch_size)

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 32
num_epochs = 20
learning_rate = 5e-4

src_vocab_size = len(en_tokenizer.vocab)
trg_vocab_size = len(zh_tokenizer.vocab)
embedding_size = 512
num_heads = 4
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.30
max_len = 128
forward_expansion = 1024
src_pad_idx = en_tokenizer.convert_tokens_to_ids("<pad>")


model = Transformer(
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
).to(device)

In [None]:
learning_rate = 5e-4

num_train_steps = int(
    len(train_dataset) / batch_size * num_epochs
)

optimizer = AdamW(model.parameters(), lr=learning_rate)

trg_pad_idx = zh_tokenizer.convert_tokens_to_ids("<pad>")
criterion = nn.CrossEntropyLoss(ignore_index = trg_pad_idx, label_smoothing=0.1)

scheduler = get_cosine_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=135, 
    num_training_steps=num_train_steps)



In [None]:
import numpy as np
best_loss = np.inf

for epoch in range(num_epochs):
    print("Epoch {}/{}:".format(epoch+1, num_epochs))

    model.train()
    train_losses = []
    final_loss = 0
    for batch_idx, batch in enumerate(tqdm(train_data_loader)):
        inp_data = batch["src"].permute(1,0).to(device)
        target = batch["trg"].permute(1,0).to(device)
        output = model(inp_data, target[:-1, :]) 
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)
        
        optimizer.zero_grad()
        loss = criterion(output, target)
        loss.backward()        
        optimizer.step()
        scheduler.step()

        final_loss += loss.item()
    train_loss = final_loss / len(train_data_loader)
    train_losses.append(train_loss)
    

    model.eval()
    valid_losses = []
    final_loss = 0
    for batch_idx, batch in enumerate(tqdm(valid_data_loader)):
        inp_data = batch["src"].permute(1,0).to(device)
        target = batch["trg"].permute(1,0).to(device)
        output = model(inp_data, target[:-1, :]) 
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)
        loss = criterion(output, target)
        final_loss += loss.item()
    valid_loss = final_loss / len(valid_data_loader)
    valid_losses.append(valid_loss)

    print("train loss: {},  valid_loss: {}".format(train_loss, valid_loss))

    if valid_loss < best_loss:
        torch.save(model.state_dict(), "model.bin")
        best_loss = valid_loss

In [49]:
# Generate

model.eval()

test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1)

sgd_mt = open("sgd_mt.txt", "w")

for k, batch in enumerate(tqdm(test_data_loader)):
  #batch = next(iter(test_data_loader))

  inp_data = batch["src"].permute(1,0).to(device)
  outputs = [zh_tokenizer.convert_tokens_to_ids("<cls>")]

  for i in range(128):
      trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

      with torch.no_grad():
          output = model(inp_data, trg_tensor)

      best_guess = output.argmax(2)[-1, :].item()
      outputs.append(best_guess)

      if best_guess == zh_tokenizer.convert_tokens_to_ids("<sep>"):
          break

  translated_sentence = zh_tokenizer.decode(outputs[1:-1])
  sgd_mt.write(translated_sentence)
  sgd_mt.write("\n")

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

src_vocab_size = len(en_tokenizer.vocab)
trg_vocab_size = len(zh_tokenizer.vocab)
embedding_size = 512
num_heads = 4
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.3
max_len = 128
forward_expansion = 1024
src_pad_idx = en_tokenizer.convert_tokens_to_ids("<pad>")


model = Transformer(
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
).to(device)
model.load_state_dict(torch.load('/content/drive/MyDrive/NLP/MT/model.bin'))

<All keys matched successfully>

In [74]:
def inference(infer_sentence, model):
  infer_sentence = [3] + en_tokenizer.convert_tokens_to_ids(en_tokenizer.tokenize(infer_sentence)) + [4] 
  infer_dataset = EntityDataset([infer_sentence], [infer_sentence])
  infer_data_loader = torch.utils.data.DataLoader(infer_dataset, batch_size=1)

  model.eval()
  batch = next(iter(infer_data_loader))
  inp_data = batch["src"].permute(1,0).to(device)
  outputs = [zh_tokenizer.convert_tokens_to_ids("<cls>")]

  for i in range(128):
      trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

      with torch.no_grad():
          output = model(inp_data, trg_tensor)

      best_guess = output.argmax(2)[-1, :].item()
      outputs.append(best_guess)

      if best_guess == zh_tokenizer.convert_tokens_to_ids("<sep>"):
          break

  translated_sentence = zh_tokenizer.decode(outputs[1:-1])
  return translated_sentence

In [71]:
zh_tokenizer.decode(o[1:-1])

'在福克斯新闻发布后,特朗普的推特开始。 '

In [75]:
en1 = """After a Fox News report, Mr. Trump’s tweets began."""
zh1 = "在福克斯新闻报道后，特朗普先生的推文开始了。"
en2 = "Paris - As the economic crisis deepens and widens, the world has been searching for historical analogies to help us understand what has been happening."
zh2 = "巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正在发生的情况。"
en3 = "in 1989, liberal democracy triumphed over the socialist ideology incarnated and promoted by the Soviet Bloc."
zh3 = "1989年，自由民主战胜了由苏联集团具体化并推崇的社会主义意识形态。"
ZH1 = inference(en1, model)
ZH2 = inference(en2, model)
ZH3 = inference(en3, model)

def show_mt(zh, ZH):
  print("原始句子: {}".format(zh))
  print("机器翻译: {}".format(ZH))
  print("\n")

show_mt(zh1, ZH1)
show_mt(zh2, ZH2)
show_mt(zh3, ZH3)

原始句子: 在福克斯新闻报道后，特朗普先生的推文开始了。
机器翻译: 在福克斯新闻发布后,特朗普的推特开始。 


原始句子: 巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正在发生的情况。
机器翻译: 巴黎 - 随着经济危机深化和扩大,世界一直在寻找历史类图帮助我们知道发生了什么。 


原始句子: 1989年，自由民主战胜了由苏联集团具体化并推崇的社会主义意识形态。
机器翻译: 1989年,自由民主胜利在苏联解体所推动的社会主义意识形态上获胜。 


