In [None]:
import sys
sys.path.append("..")
import torch
import numpy as np
from tqdm import tqdm
import os
from Utils import VariantWordDataset, Config
from torch.utils.data import DataLoader
from torch import nn

## 1. 加载数据

In [None]:
# 实例化全局参数 Config 对象
config = Config()


# 构建数据集
train_set = VariantWordDataset("train", config.source_dic_path, config.target_dic_path)
valid_set = VariantWordDataset("test", config.source_dic_path, config.target_dic_path)
print(f"Train size: {len(train_set)}")

# 构建data_loader
n_cpu = os.cpu_count()
train_dataloader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True, collate_fn=train_set.generate_batch, num_workers=n_cpu)
valid_dataloader = DataLoader(valid_set, batch_size=config.batch_size, shuffle=False, collate_fn=valid_set.generate_batch, num_workers=n_cpu)

## 2. 模型训练

In [None]:
from Model.BaselineModel import BaselineModel
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint
# import wandb

# 初始化 wandb
# wandb.init(project="Graduation_project")

# 模型初始化
model = BaselineModel(config)

# # wandb logger
# wandb_logger = WandbLogger(project = "Graduation_project",
#                            name = 'Transformer-CrossEL-lr-0.02',
#                            save_dir = '../Logs',
#                            log_model="all")

# # 模型参数保存
# checkpoint_callback = ModelCheckpoint(
#     monitor="valid_accuracy",
#     dirpath="../Weights",
#     filename="Baseline-Transformer-CrossEntropyLoss-{epoch:02d}-{valid_accuracy:.2f}",
#     save_top_k=3,
#     mode="max",
# )

# # 训练 Trainer 定义
# trainer = pl.Trainer(
#     max_epochs=5, 
#     gpus=0,
#     logger = wandb_logger,
#     callbacks=[checkpoint_callback]
#     )


# # 模型训练
# trainer.fit(
#     model, 
#     train_dataloaders=train_dataloader, 
#     val_dataloaders=valid_dataloader
# )

## 模型预测

In [None]:
import sys
sys.path.append("../../")
sys.path.append("..")
from Model.ConvS2SModel import ConvS2SModel
from Utils.Variant_word import VariantWordDataset
import torch
from Utils.config import Config
from torchtext.data.metrics import bleu_score

# def translate(model, src, data_loader, config):
    
#     source_dic = data_loader.source_dic    
#     target_dic = data_loader.target_dic

#     model.eval()

#     tokens = [source_dic.word2idx[i] for i in list(src)] # 构造一个样本
#     num_tokens = len(tokens)
#     src = (torch.LongTensor(tokens).reshape(num_tokens, 1))  # 将src_len 作为第一个维度
#     tgt_tokens = greedy_decode(model, src, max_len=num_tokens + 5,
#                                 start_symbol=config.BOS_IDX, config=config).flatten()  # 解码的预测结果
    
#     return " ".join([target_dic.idx2word[int(tok)] for tok in tgt_tokens]).replace("[BOS]", "").replace("[EOS]", "")


# def greedy_decode(model, src, max_len, start_symbol, config):

#     src = src.to(config.device)
#     memory = model.encoder(src)  # 对输入的Token序列进行解码翻译
#     ys = torch.ones(1, 1).fill_(start_symbol). \
#        type(torch.long).to(config.device)  # 解码的第一个输入，起始符号

#     for i in range(max_len - 1):
#         memory = memory.to(config.device)
#         tgt_mask = (model.my_transformer.generate_square_subsequent_mask(ys.size(0))
#                    .type(torch.bool)).to(config.device)  # 根据tgt_len产生一个注意力mask矩阵（对称的）
#         out = model.decoder(ys, memory, tgt_mask)  # [tgt_len,tgt_vocab_size]
#         out = out.transpose(0, 1)  # [tgt_vocab_size, tgt_len]
#         prob = model.classification(out[:, -1])  # 只对对预测的下一个词进行分类
#         _, next_word = torch.max(prob, dim=1)  # 选择概率最大者
#         next_word = next_word.item()
#         ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
#         # 将当前时刻解码的预测输出结果，同之前所有的结果堆叠作为输入再去预测下一个词。
#         if next_word == config.EOS_IDX:  # 如果当前时刻的预测输出为结束标志，则跳出循环结束预测。
#             break
#     return ys

# def translate_to_right(src, config):
#     data_loader = VariantWordDataset("train", config.source_dic_path, config.target_dic_path)
#     translation_model = ConvS2SModel(config)
#     translation_model = translation_model.to(config.device)
#     torch.load("../../Weights_2/ConvS2SModel-CrossEntropyLoss-epoch=11-valid_f1=0.97.ckpt", map_location="cpu")
#     r = translate(translation_model, src, data_loader, config)
#     return r


config = Config()
translation_model = ConvS2SModel(config)
translation_model = translation_model.to(config.device)
torch.load("../../Weights_2/ConvS2SModel-CrossEntropyLoss-epoch=11-valid_f1=0.97.ckpt", map_location="cpu")



srcs = ["9306你好,鉴于你良好的信誉,特聘请你~来我店帮忙工作（350/天）咨询,Q:707941883."]
tgts = ["9306你好,鉴于你良好的信誉,特聘请你来我店帮忙工作(350天)咨询,Q:707941883."]
for i, src in enumerate(srcs):
    r = translate_to_right(src, config)
    print(f"德语：{src}")
    print(f"翻译：{r}")
    print(f"英语：{tgts[i]}")
    # print(len([src]))
    # print(len([tgts[i]]))
    # print([tgts[i]])
    print([[i for i in src]])
    print([[[i for i in tgts[i]]]])
    print(bleu_score([[i for i in src]], [[[i for i in tgts[i]]]]))

srcs =  tokens = [source_dic.word2idx[i] for i in list(src)]

translation_model(srcs, tgts)

In [None]:
import sys
sys.path.append("../../")
sys.path.append("..")
from Model.ConvS2SModel import ConvS2SModel
from Utils.Variant_word import VariantWordDataset
from Utils.Dictionary import Dictionary
import torch
from Utils.config import Config
from torchtext.data.metrics import bleu_score

config = Config()

# 字典加载
source_dic = Dictionary.load_from_file("../Data/source_vocal.pkl")
target_dic = Dictionary.load_from_file("../Data/target_vocal.pkl")



# 模型载入
# translation_model = ConvS2SModel(config)
# translation_model = translation_model.to("cpu")
# torch.load("../../Weights_2/ConvS2SModel-CrossEntropyLoss-epoch=11-valid_f1=0.97.ckpt", map_location="cpu")


# 测试句子
srcs = ["9306你好,鉴于你良好的信誉,特聘请你~来我店帮忙工作（350/天）咨询,Q:707941883.,特聘请你~来我店帮忙工作（350/天）咨询,Q:707941883."]
tgts = ["9306你好,鉴于你良好的信誉,特聘请你来我店帮忙工作(350天)咨询,Q:707941883.9306你好,鉴于你良好的信誉,特聘请你来我店帮忙工作(350天)咨询,Q:707941883."]


# src = torch.LongTensor([ source_dic.word2idx[i] for i in srcs[0] ]).unsqueeze(0).to(device="cpu")
# tgt = torch.LongTensor([ source_dic.word2idx[i] for i in tgts[0] ]).unsqueeze(0).to(device="cpu")

# src_length = torch.LongTensor(src.size(0))
# type(src_length)



for i, src in enumerate(srcs):
    print(f"德语：{srcs}")
    print(f"英语：{tgts[i]}")
    # print(len([src]))
    # print(len([tgts[i]]))
    # print([tgts[i]])
    print([[i for i in src]])
    print([[[i for i in tgts[i]]]])
    print(bleu_score([[i for i in src]], [[[i for i in tgts[i]]]]))


In [40]:
# 翻译测试
import sys
sys.path.append("../../")
sys.path.append("..")
sys.path.append(".")
from Model.ConvS2SModel import ConvS2SModel
from Model.TransformerModel import TransformerModel
from Model.RNNSearchModel import RNNSearchModel
from Utils.Variant_word import VariantWordDataset
import torch
from Utils.config import Config
from torchtext.data.metrics import bleu_score
import numpy as np
from tqdm import tqdm
import os
from Utils import VariantWordDataset
from torch.utils.data import DataLoader
from torch import nn,Tensor
from Utils.config import Config
from Model import *
import wandb

# 模型加载
config = Config()
config.source_dic_path = "../Data/source_vocal.pkl"
config.target_dic_path = "../Data/target_vocal.pkl"
config.src2tgt_path = "../Data/src_idx2tgt_idx.pkl"
config.train_set_path =  '../Data/Dataset/train_data.csv'
config.test_set_path =  '../Data/Dataset/test_data.csv'
config.train_set_supply_path =  '../Data/Dataset/train_data_supply_0417.csv'
# config.device =  'cpu'
config.d_model = 256
config.num_head = 8
config.num_encoder_layers = 3
config.num_decoder_layers = 3
config.dim_feedforward = 512


convs2s = ConvS2SModel(config)
convs2s.to(config.device).load_state_dict(torch.load("../Weights_ConvS2SModel-CrossEntropyLoss/ConvS2SModel-CrossEntropyLoss-epoch=33-valid_f1=0.98.ckpt")["state_dict"])
rnnsearch = RNNSearchModel(config)
rnnsearch.to(config.device).load_state_dict(torch.load("../Weights_RNNsearchModel-CrossEntropyLoss/RNNsearchModel-CrossEntropyLoss-epoch=34-valid_f1=0.91.ckpt")["state_dict"])
transformer = TransformerModel(config)
transformer.to(config.device).load_state_dict(torch.load("../Weights_TransformerModel-CrossEntropyLoss/TransformerModel-CrossEntropyLoss-epoch=34-valid_f1=0.94.ckpt")["state_dict"])


# 数据集构建
train_set = VariantWordDataset("train", config, isAligned=False, supply_ratio=0)
valid_set = VariantWordDataset("test", config, isAligned=False)
print(f"Train size: {len(train_set)}")


# dataloader 初始化
# 数据传输cpu数目
n_cpu = os.cpu_count()
train_dataloader = DataLoader(train_set, batch_size=1, shuffle=True, collate_fn=train_set.generate_batch, num_workers=n_cpu)
valid_dataloader = DataLoader(valid_set, batch_size=1, shuffle=False, collate_fn=valid_set.generate_batch, num_workers=n_cpu)
    

loading dictionary from ../Data/source_vocal.pkl
loading dictionary from ../Data/target_vocal.pkl
loading dictionary from ../Data/source_vocal.pkl
loading dictionary from ../Data/target_vocal.pkl
loading dictionary from ../Data/source_vocal.pkl
loading dictionary from ../Data/target_vocal.pkl
loading dictionary from ../Data/source_vocal.pkl
loading dictionary from ../Data/target_vocal.pkl
loading dictionary from ../Data/source_vocal.pkl
loading dictionary from ../Data/target_vocal.pkl
Train size: 6815


In [33]:
from torch.nn.utils.rnn import pad_sequence

def tran_ConvS2S(config, src, tgt, tgt_length, isAligned):

    start =time.clock()
    src = src.to(config.device).transpose(0,1)  # [ batch_size, src_len ]
    tgt = tgt.to(config.device).transpose(0,1)  # [ batch_size, tgt_len ]
    src_length = src.size(0)  # [ batch_size ]
    tgt_input = tgt[:, 1:-1]  # 解码部分的输入, [ batch_size, tgt_len ]
    decoder_out, lprobs  = convs2s(src, src_length, tgt_input)
    

    tgt_input = tgt_input.reshape(-1)
    decoder_out = decoder_out[:,:,:].argmax(axis=2).reshape(-1)
    print(decoder_out.device)
    decoder_out = np.delete(Tensor.cpu(decoder_out).numpy() , np.where(Tensor.cpu(tgt_input).numpy() <= config.GAP_IDX))
    tgt_input = np.delete(Tensor.cpu(tgt_input).numpy() , np.where(Tensor.cpu(tgt_input).numpy() <= config.GAP_IDX))

    end = time.clock()

    candidate = [str(train_set.target_dic.idx2word[i]) for i in decoder_out.tolist()]
    reference = [str(train_set.target_dic.idx2word[i]) for i in tgt_input.tolist()]
    print("".join(candidate))
    print("".join(reference))
    print(bleu_score([candidate], [[reference]]))
    print( end )
    print( start )
    
    return bleu_score([candidate], [[reference]]), end - start


def create_mask(vector, PAD_IDX):

    vector_mask = (vector != PAD_IDX)
    return vector_mask

def traverse(tensor,  PAD_IDX, tgt_lengths):
    # 先去除所有的PAD
    new_tensor = []

    for i, length in zip(tensor, tgt_lengths.cpu()):
        i = i[:length]
            # 倒序
        new_tensor.append( i.flip(0) )

    # PAD
    tensor = pad_sequence(new_tensor, padding_value=PAD_IDX, batch_first=True)  # [de_len,batch_size]

    return tensor

def tran_RNNSearch(config, src, tgt, tgt_length, isAligned):

    start =time.clock()

    src = src.to(config.device).transpose(0, 1) # [batch_size, src_len]
    tgt = tgt.to(config.device).transpose(0, 1)
    a = np.delete(Tensor.cpu(tgt).numpy() , np.where(Tensor.cpu(tgt).numpy() <= config.GAP_IDX))
    print(a)
    reference = [str(train_set.target_dic.idx2word[i]) for i in a.tolist()]
    print("".join(reference))
    forward_tgt = tgt
    backward_tgt = traverse(tgt, config.PAD_IDX, tgt_length)

    src_mask = create_mask(src, config.PAD_IDX)
    forward_tgt_mask = create_mask(forward_tgt, config.PAD_IDX)
    backward_tgt_mask = create_mask(backward_tgt, config.PAD_IDX)

        
    # logits 输出shape为[tgt_len,batch_size,tgt_vocab_size]
    loss, w_loss, output = rnnsearch(
        src = src,                   # Encoder的token序列输入，[src_len,batch_size]
        src_mask = src_mask, 
        f_trg = forward_tgt, 
        f_trg_mask = forward_tgt_mask,
        b_trg=backward_tgt, 
        b_trg_mask=backward_tgt_mask)

    output = output[:,1:-1].reshape(-1)
    forward_tgt = forward_tgt[:,1:-1].reshape(-1)
    print(forward_tgt.shape)
    print(output.shape)

    output = np.delete(Tensor.cpu(output).numpy() , np.where(Tensor.cpu(forward_tgt).numpy() <= config.GAP_IDX))
    forward_tgt = np.delete(Tensor.cpu(forward_tgt).numpy() , np.where(Tensor.cpu(forward_tgt).numpy() <= config.GAP_IDX))

    end = time.clock()

    candidate = [str(train_set.target_dic.idx2word[i]) for i in output.tolist()]
    reference = [str(train_set.target_dic.idx2word[i]) for i in forward_tgt.tolist()]
    print("".join(candidate))
    print("".join(reference))
    # print(bleu_score([candidate], [[reference]]))
    return bleu_score([candidate], [[reference]]), end - start

def generate_square_subsequent_mask(config, sz):
    mask = (torch.triu(torch.ones((sz, sz), device=config.device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def create_mask_1(config, src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(config, tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=config.device).type(torch.bool)

    src_padding_mask = (src == config.PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == config.PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

def tran_transformer(config, src, tgt, tgt_length, _):

    start =time.clock()
    src = src.cuda()
    tgt = tgt.cuda()

    tgt_input = tgt[:-1, :]  # 解码部分的输入, [tgt_len,batch_size]
    src_mask, tgt_mask, src_padding_mask, tgt_padding_mask \
            = create_mask_1(config, src, tgt_input)
    
    # logits 输出shape为[tgt_len,batch_size,tgt_vocab_size]
    logits = transformer(
                            src=src,  # Encoder的token序列输入，[src_len,batch_size]
                            tgt=tgt_input,  # Decoder的token序列输入,[tgt_len,batch_size]
                            src_mask=src_mask,  # Encoder的注意力Mask输入，这部分其实对于Encoder来说是没有用的
                            tgt_mask=tgt_mask, # Decoder的注意力Mask输入，用于掩盖当前position之后的position [tgt_len,tgt_len]
                            src_key_padding_mask=src_padding_mask,  # 用于mask掉Encoder的Token序列中的padding部分
                            tgt_key_padding_mask=tgt_padding_mask,  # 用于mask掉Decoder的Token序列中的padding部分
                            memory_key_padding_mask=src_padding_mask)  # 用于mask掉Encoder的Token序列中的padding部分


    ### 计算loss
    tgt_out = tgt[1:, :]  # 解码部分的真实值  shape: [tgt_len,batch_size]
    # loss = self.loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))


    decoder_out = np.delete(Tensor.cpu( logits.reshape(-1, logits.shape[-1]).argmax(1)).detach().numpy() , np.where(Tensor.cpu( logits.reshape(-1, logits.shape[-1]).argmax(1)).detach().numpy()  <= config.GAP_IDX))
    tgt_out = np.delete(Tensor.cpu(tgt_out).numpy() , np.where(Tensor.cpu(tgt_out).numpy() <= config.GAP_IDX))

    end = time.clock()

    candidate = [str(train_set.target_dic.idx2word[i]) for i in decoder_out.tolist()]
    reference = [str(train_set.target_dic.idx2word[i]) for i in tgt_out.tolist()]
    print("".join(candidate))
    print("".join(reference))
    print(bleu_score([candidate], [[reference]]))
    # print(end - start)
    return bleu_score([candidate], [[reference]]), end - start

In [41]:
from torch import nn,Tensor
import time
time1 =float(0)
time2 = float(0)
time3 = float(0)
times = 0
# 计算ConvS2S模型时间
for index, (src, tgt, tgt_length, _) in enumerate(valid_dataloader):

    if index < 1:
        bleu1, timec = tran_ConvS2S(config, src, tgt, tgt_length, _)
        bleu1, timet = tran_transformer(config, src, tgt, tgt_length, _)
        bleu1, timer = tran_RNNSearch(config, src, tgt, tgt_length, _)

        time1+= timec
        time2 += timet
        time3 += timer
        times += 1
    else:
        break
print(times)
print(time1/times)
print(time2/times)
print(time3/times)

cuda:0
赖晚开出(46虎),您中了吗?加微:18683367080我可以告诉你下次的一肖三码,早加就知道
昨晚开出(46虎),您中了吗?加微:18683367080我可以告诉你下次的一肖三码,早加就知道
0.9784820675849915
354.695067
354.659237
昨晚开出(46虎),您中了?wei:18683367080我可可告诉你下次的一肖三码,早加
晚开出(46虎),您中了吗?加微:18683367080我可以告诉你下次的一肖三码,早加就知道
0.7600666967065693
[3188 3165 1781 2526  262  479  716  109 1086 3283 1881   44 3279 1435
 1054 1419  351  453  748 3359  716 3359 2927 2927  716 1587 1480 3359
 1480 2440 3081  688 2936  711 2587 1495  317 1711 3185  886 1598 1138
 3283  169 1419 1504  318 2824]
昨晚开出(46虎),您中了吗?加微:18683367080我可以告诉你下次的一肖三码,早加就知道
torch.Size([48])
torch.Size([48])
昨晚开出(46[GAP]机,您中了解?加wei6833677080[GAP]我可以告你你次次一肖三码,早加就知道
昨晚开出(46虎),您中了吗?加微:18683367080我可以告诉你下次的一肖三码,早加就知道


  logit.data.masked_fill_(1 - mask, -float('inf'))


1
0.035829999999975826
0.011779999999987467
0.08268199999997705


In [31]:
string = "昨晚开出(46虎),您中了吗?加微:18683367080我可以告诉你下次的一肖三码,早加就知道"
list_1 = list(string)
list_2 = []

start = time.clock()
for i in list_1:
    list_2.append(train_set.target_dic.word2idx[i])
end = time.clock()
end - start

0.00023599999997259147