In [1]:
import numpy as np
import pandas as pd
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
import torch
from torch.utils.data import Dataset, DataLoader
from bert_seq2seq.utils import load_bert, load_model_params
import time
from tqdm import tqdm
import json

In [2]:
vocab_path = "G:/bert_seq2seq\examples/state_dict/roberta_wwm_vocab.txt"  # roberta模型字典的位置
word2idx = load_chinese_base_vocab(vocab_path)

dict_bc = {
    '1a': '嗯/ok/嗯哼',
    '2a': '好的',
    '2b': '喔/嗷/昂/奥/这样呀',
    '2c': '明白/了解/哦/好吧',
    '3a': '真棒/666',
    '3b': '挺好/不错',
    '4a': '确实/好吧/对/是',
    '4b': '可以',
    '5a': '啊这',
    '5b': '怎会如此？',
    '5c': '嗯？/诶？/啊哈？',
    '5d': '是么？/真的么？',
    '5e': '什么？',
    '5f': '厉害',
    '5g': '哈哈哈',
    '6a': '然后呢？/之后呢？',
    '6b': '还有呢？/除此以外呢？',
    '6c': '具体说说',
    '6d': '比如说？/举个例子？',
    '6e': '随便说说',
    '6f': '为什么？'
}

target = list(dict_bc.keys())

In [4]:
## 自定义dataset
class NLUDataset(Dataset):
    """
    针对特定数据集，定义一个相关的取数据的方式
    """
    def __init__(self, sents_src, sents_tgt) :
        ## 一般init函数是加载所有数据
        super(NLUDataset, self).__init__()
        # 读原始数据
        # self.sents_src, self.sents_tgt = read_corpus(poem_corpus_dir)
        self.sents_src = sents_src
        self.sents_tgt = sents_tgt

        self.idx2word = {k: v for v, k in word2idx.items()}
        self.tokenizer = Tokenizer(word2idx)

    def __getitem__(self, i):
        ## 得到单个数据
        # print(i)
        src = self.sents_src[i]
        tgt = self.sents_tgt[i]
        token_ids, token_type_ids = self.tokenizer.encode(src)
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
            "target_id": tgt
        }
        return output

    def __len__(self):
        return len(self.sents_src)

def padding(indice, max_length, pad_idx=0):
    """
    pad 函数
    """
    pad_indice = [[pad_idx] * max(0, max_length - len(item)) + item for item in indice]
    return torch.tensor(pad_indice)

def collate_fn(batch, max_len=256):
    """
    动态padding， batch为一部分sample
    """
    token_ids = [data["token_ids"] for data in batch]
    max_length = max([len(t) for t in token_ids])
    token_type_ids = [data["token_type_ids"] for data in batch]
    target_ids = [data["target_id"] for data in batch]
    target_ids = torch.tensor(target_ids, dtype=torch.float)

    token_ids_padded = padding(token_ids, max_length)
    token_type_ids_padded = padding(token_type_ids, max_length)
    # target_ids_padded = token_ids_padded[:, 1:].contiguous()
    if max_length > max_len:
        token_ids_padded = token_ids_padded[:, -max_len:]
        token_type_ids_padded = token_type_ids_padded[:, -max_len:]

    return token_ids_padded, token_type_ids_padded, target_ids

def get_tgt_tensor(_tgt):
    index_list = [target.index(t) for t in _tgt]
    _zeros = np.zeros(len(target))
    _zeros[index_list] = 1
    return _zeros

def read_corpus(data_path):
    """
    读原始数据
    """
    _src = []
    _tgt = []

    with open(data_path, encoding='utf-8') as f:
        lines = f.readlines()
    for _line in lines:
        _l = _line[:-1].split("\t")
        _tgt.append(get_tgt_tensor(_l[1:]))
        _src.append(_l[0])
    return _src, _tgt

In [None]:
src, tgt = read_corpus("G:/bert_seq2seq\examples\data/data_ali_id_out_new.txt")

# check Data
for idx in range(5):
    print(src[idx], tgt[idx])

In [5]:
class Trainer:
    def __init__(self, data_path, model_name, model_path, batch_size=8 ,lr=1e-5, tarin_ratio=.8):
        # 加载数据
        self.sents_src, self.sents_tgt = read_corpus(data_path)
        self.tokenizer = Tokenizer(word2idx)
        # 判断是否有可用GPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # 定义模型
        self.bert_model = load_bert(word2idx, model_name=model_name, model_class="cls", target_size=len(target))
        ## 加载预训练的模型参数～
        load_model_params(self.bert_model, model_path)
        # 将模型发送到计算设备(GPU或CPU)
        self.bert_model.to(self.device)
        # 声明需要优化的参数
        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)
        # 声明自定义的数据加载器
        dataset = NLUDataset(self.sents_src, self.sents_tgt)
        train_num = int(tarin_ratio*len(dataset))
        train, validation = torch.utils.data.random_split(dataset,
                                                          [train_num, len(dataset) - train_num])
        torch.save(train, 'G:/bert_seq2seq\examples\data/train_dataset.pt')
        torch.save(validation, 'G:/bert_seq2seq\examples\data/validation_dataset.pt')
        self.dataloader_train =  DataLoader(train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
        self.dataloader_validate = DataLoader(validation, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    def train(self, train_epoches, model_save_path):
        # 一个epoch的训练
        for epoch in range(train_epoches):
            # 训练一个epoch
            self.bert_model.train()
            self.iteration(epoch, model_save_path=model_save_path, train=True)

    def save(self, save_path):
        """
        保存模型
        """
        torch.save(self.bert_model.state_dict(), save_path)
        print("{} saved!".format(save_path))

    def iteration(self, _epoch, model_save_path, train=True):
        total_loss = 0
        validate_loss = 0
        count = 0
        count_vali = 0
        start_time = time.time() ## 得到当前时间
        step = 0
        for token_ids, token_type_ids, target_ids in tqdm(self.dataloader_train, position=0, leave=True):
            step += 1
            if step % 100 == 0:
                self.bert_model.eval()
            #     test_data = ["编剧梁馨月讨稿酬六六何念助阵 公司称协商解决", "西班牙BBVA第三季度净利降至15.7亿美元", "基金巨亏30亿 欲打开云天系跌停自救"]
                for token_ids_validate, token_type_ids_validate, target_ids_validate in self.dataloader_validate:
                    token_ids_validate = token_ids_validate.to(self.device)
                    # token_type_ids = token_type_ids.to(self.device)
                    target_ids_validate = target_ids_validate.to(self.device)
                    _, loss_v = self.bert_model(token_ids_validate, labels=target_ids_validate,)
                    validate_loss += loss_v.item()
                    count_vali += 1
                self.bert_model.train()

            token_ids = token_ids.to(self.device)
            # token_type_ids = token_type_ids.to(self.device)
            target_ids = target_ids.to(self.device)
            # 因为传入了target标签，因此会计算loss并且返回
            _, loss = self.bert_model(token_ids, labels=target_ids,)
            # 反向传播
            if train:
                # 清空之前的梯度
                self.optimizer.zero_grad()
                # 反向传播, 获取新的梯度
                loss.backward()
                # 用获取的梯度更新模型参数
                self.optimizer.step()

            # 为计算当前epoch的平均loss
            total_loss += loss.item()
            count += 1
        end_time = time.time()
        spend_time = end_time - start_time
        # 打印训练信息
        print("epoch is " + str(_epoch)+". loss is " + str(total_loss/count)
              + "validation loss is " + str(validate_loss/count_vali) + ". spend time is "+ str(spend_time))
        # 保存模型
        self.save(model_save_path)
        return  total_loss/count, validate_loss/count_vali

In [6]:
trainer = Trainer(data_path="G:/bert_seq2seq\examples\data/data_ali_id_out_new.txt", model_name="roberta",
                  model_path="G:/bert_seq2seq\examples/state_dict/roberta_wwm_pytorch_model.bin")
train_epoches = 10
trainer.train(train_epoches, 'G:/bert_seq2seq\examples/bc_model_2770.bin')

device: cuda
G:/bert_seq2seq\examples/state_dict/roberta_wwm_pytorch_model.bin loaded!


100%|██████████| 277/277 [00:43<00:00,  6.44it/s]


epoch is 0. loss is 0.31868593383997357validation loss is 0.2997747175395489. spend time is 43.03589940071106


  0%|          | 0/277 [00:00<?, ?it/s]

G:/bert_seq2seq\examples/bc_model_2770.bin saved!


100%|██████████| 277/277 [00:42<00:00,  6.52it/s]


epoch is 1. loss is 0.28079838663447204validation loss is 0.2897897785263402. spend time is 42.505953311920166


  0%|          | 0/277 [00:00<?, ?it/s]

G:/bert_seq2seq\examples/bc_model_2770.bin saved!


100%|██████████| 277/277 [00:44<00:00,  6.19it/s]


epoch is 2. loss is 0.2746343242562635validation loss is 0.2844059513083526. spend time is 44.75880455970764


  0%|          | 0/277 [00:00<?, ?it/s]

G:/bert_seq2seq\examples/bc_model_2770.bin saved!


100%|██████████| 277/277 [00:44<00:00,  6.23it/s]


epoch is 3. loss is 0.2677312177010822validation loss is 0.28142428025603294. spend time is 44.48710322380066


  0%|          | 1/277 [00:00<00:42,  6.43it/s]

G:/bert_seq2seq\examples/bc_model_2770.bin saved!


100%|██████████| 277/277 [00:43<00:00,  6.31it/s]


epoch is 4. loss is 0.25944440400342217validation loss is 0.27794495256883756. spend time is 43.920148849487305


  0%|          | 1/277 [00:00<00:48,  5.64it/s]

G:/bert_seq2seq\examples/bc_model_2770.bin saved!


100%|██████████| 277/277 [00:42<00:00,  6.46it/s]


epoch is 5. loss is 0.24842019083267514validation loss is 0.276798551423209. spend time is 42.87047004699707


  0%|          | 1/277 [00:00<00:28,  9.64it/s]

G:/bert_seq2seq\examples/bc_model_2770.bin saved!


100%|██████████| 277/277 [00:43<00:00,  6.36it/s]


epoch is 6. loss is 0.23392900434534472validation loss is 0.2770306668111256. spend time is 43.57216501235962


  0%|          | 1/277 [00:00<00:34,  7.96it/s]

G:/bert_seq2seq\examples/bc_model_2770.bin saved!


100%|██████████| 277/277 [00:43<00:00,  6.34it/s]


epoch is 7. loss is 0.21652083168821645validation loss is 0.2801760519189494. spend time is 43.65998411178589


  0%|          | 1/277 [00:00<00:49,  5.54it/s]

G:/bert_seq2seq\examples/bc_model_2770.bin saved!


100%|██████████| 277/277 [00:43<00:00,  6.32it/s]


epoch is 8. loss is 0.20100871172299883validation loss is 0.28975634063993183. spend time is 43.8647882938385


  0%|          | 0/277 [00:00<?, ?it/s]

G:/bert_seq2seq\examples/bc_model_2770.bin saved!


100%|██████████| 277/277 [00:43<00:00,  6.31it/s]


epoch is 9. loss is 0.18755484267477524validation loss is 0.2954736953335149. spend time is 43.883718490600586
G:/bert_seq2seq\examples/bc_model_2770.bin saved!


In [5]:
tokenizer = Tokenizer(word2idx)
# 判断是否有可用GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device: " + str(device))
# 定义模型
bert_model = load_bert(word2idx, model_name="roberta", model_class="cls", target_size=len(target))

## 加载预训练的模型参数～
load_model_params(bert_model, "G:/bert_seq2seq\examples/bc_model_2770.bin")
# 将模型发送到计算设备(GPU或CPU)
bert_model.to(device)

device: cuda
G:/bert_seq2seq\examples/bc_model_2770.bin loaded!


BertClsClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
 

In [15]:
def test_model(txt, model):
    token_ids, token_type_ids = tokenizer.encode(txt)
    # token_ids = padding([token_ids], max_length=256, ).to(device)
    token_ids = torch.tensor(token_ids, device=device).view(1, -1)
    prediction = model(token_ids).tolist()[0]
    prediction = pd.Series(index=target, data=prediction).sort_values(ascending=False)
    bcs = []
    for idx, prob in prediction.iteritems():
        bcs.append((idx, dict_bc[idx], prob))
    return bcs

In [16]:
test_text = ["坐地铁，然后有时候开车，主要看公司那边那个停车位方不方便",
             "基本上不用了",
             "然后它有那种就是那个，安上去的贴上去的拿手机控制，然后就比如说我屋里就买了一个，在那卧室，然后就是。比如我我我躺床上要睡觉了，然后我必须关灯再上床，这样已经黑了，我怎么也看不见东西，然后再给遥控，我就直接睡觉时在手机躺床一关就行。",
             "对",
             "对自己能感应比如天亮了，也可以设置它几点几点的自动开，他他自己也会就是根据天气而定，比如天光达到一定亮，他就直接打到。就是那个，就是用用阳光校订器判定",
             "小时候见过的",
             "不会",
             "游戏类的，然后汽车类的美食、旅游的差不多这几个大类，看的比较多。",
             "然后就安排工作嘛，处理一下，如果有外出的，就把那个行程安排一下，然后就去这个忙那个相关的事情，然后下午因为有些时候比如说你出去的比较远,地点有几个的话就可以那个，如果早一点完了就可以早一点回家，然后如果公司里面还有事情，还要先回公司，然后再回来",
             "嗯，就是电视和灯光。"
             ]
for text in test_text:
    print(text + '\n', test_model(text, bert_model)[:5])

坐地铁，然后有时候开车，主要看公司那边那个停车位方不方便
 [('3b', '挺好/不错', 0.7948032021522522), ('2b', '喔/嗷/昂/奥/这样呀', 0.7623468041419983), ('5g', '哈哈哈', 0.7367010116577148), ('6a', '然后呢？/之后呢？', 0.720032811164856), ('4a', '确实/好吧/对/是', 0.6540706157684326)]
基本上不用了
 [('4a', '确实/好吧/对/是', 0.666114091873169), ('2b', '喔/嗷/昂/奥/这样呀', 0.6237576007843018), ('6a', '然后呢？/之后呢？', 0.516461968421936), ('3b', '挺好/不错', 0.4081690013408661), ('5g', '哈哈哈', 0.3531256914138794)]
然后它有那种就是那个，安上去的贴上去的拿手机控制，然后就比如说我屋里就买了一个，在那卧室，然后就是。比如我我我躺床上要睡觉了，然后我必须关灯再上床，这样已经黑了，我怎么也看不见东西，然后再给遥控，我就直接睡觉时在手机躺床一关就行。
 [('2b', '喔/嗷/昂/奥/这样呀', 0.493001252412796), ('4a', '确实/好吧/对/是', 0.39568015933036804), ('5b', '怎会如此？', 0.27955082058906555), ('5f', '厉害', 0.2777303457260132), ('3b', '挺好/不错', 0.24921633303165436)]
对
 [('2b', '喔/嗷/昂/奥/这样呀', 0.9273006319999695), ('3b', '挺好/不错', 0.5994013547897339), ('6c', '具体说说', 0.38726145029067993), ('5f', '厉害', 0.33286094665527344), ('6a', '然后呢？/之后呢？', 0.24595743417739868)]
对自己能感应比如天亮了，也可以设置它几点几点的自动开，他他自己也会就是根据天气而定，比如天光达到一定亮，他就直接打到。

In [None]:
label_data = pd.DataFrame(data=tgt, columns=list(dict_bc.values()))

In [None]:
total = label_data.apply(lambda x:x.sum(), axis=0)
percentage = total / label_data.shape[0]
print(percentage)

In [None]:
liubo_dialog = pd.read_excel("D:\DoobiePJ/2020Ali/ali全部语料\标注\新标注/to_do_data_917_liubo.xlsx")
anqi_dialog = pd.read_excel("D:\DoobiePJ/2020Ali/ali全部语料\标注\新标注/to_do_data_anqi.xlsx")
hzk_dialog = pd.read_excel("D:\DoobiePJ/2020Ali/ali全部语料\标注\新标注/to_do_data_hanzhankang.xlsx")
wpg_dialog = pd.read_excel("D:\DoobiePJ/2020Ali/ali全部语料\标注\新标注/to_do_data_wangpengguang.xlsx")
liubo_dialog['conversation'] = '917_liubo'
anqi_dialog['conversation'] = 'anqi'
hzk_dialog['conversation'] = 'hanzhankang'
wpg_dialog['conversation'] = 'wangpengguang'
dialog_data = pd.concat([liubo_dialog, anqi_dialog, hzk_dialog, wpg_dialog])

In [None]:
test_data = pd.read_excel("G:/bert_seq2seq\examples\data/back_channel_test.xlsx")

In [None]:
bc_idx_dict = {v : k for k, v in dict_bc.items()}
txt = []
for idx, row in test_data.iterrows():
    url = row['url']
    start = url.find('to_do_data_') + len('to_do_data_')
    end = url.find('？') - len('.jpg')
    info = url[start:end].split('-')
    print(info)
    select_row = dialog_data[(dialog_data['conversation']==info[0])&(dialog_data['Unnamed: 0']==int(info[1]))]
    stc = select_row['对话文本'].iloc[0]
    # 处理label
    labels = json.loads(row['label'])
    idx = []
    for label in labels:
        for l in label['option']:
            if l != '以上均不是':
                idx.append(bc_idx_dict[l])
    if len(idx) < 1:
        idx = ['1a', '2a']
    line = stc + '\t' + '\t'.join(idx) + '\n'
    txt.append(line)
    print(line)

In [None]:
with open("G:/bert_seq2seq\examples\data/back_channel_test.txt", "w", encoding="utf-8") as fp:
    fp.writelines(txt)
    fp.close()

In [None]:
src_test, tgt_test = read_corpus("G:/bert_seq2seq\examples\data/back_channel_test.txt")

In [None]:
corr = 0
pppredictions = []
for line in txt:
    data = line.split('\t')
    stc = data[0]
    if len(stc)>256:
        stc = stc[-256:]
    tgts = data[1:]
    predictions = test_model(stc, bert_model)
    pppredictions.append(predictions)
    for pred_ in predictions[:5]:
        if pred_[0] in tgts:
            corr += 1

In [None]:
print(corr/len(stc))

In [6]:
# train_data = torch.load('G:/bert_seq2seq\examples\data/train_dataset.pt')
validation_data = torch.load('G:/bert_seq2seq\examples\data/validation_dataset.pt')

the_dataloader = DataLoader(validation_data, batch_size=8, shuffle=True, collate_fn=collate_fn)

In [7]:
label_list = list(dict_bc.keys())
def get_tgt(tgt_tensor):
    result = []
    for i, t in enumerate(tgt_tensor):
        if t == 1:
            result.append(label_list[i])
    return result

def get_pred(pred_tensor, top_n):
    pred_s = pd.Series(data=pred_tensor.tolist(), index=label_list)
    pred_s = pred_s.sort_values(ascending=False)
    return pred_s.index[:top_n]

def check_prediction(pre_tensor, tgt_tensor, top_n=5):
    targets = get_tgt(tgt_tensor)
    top_pred = get_pred(pre_tensor, top_n)
    for _p in top_pred:
        if _p in targets:
            return True
    return False

In [12]:
correct = 0
for token_ids, token_type_ids, target_ids in the_dataloader:
    token_ids = token_ids.to('cuda')
    # token_type_ids = token_type_ids.to(self.device)
    target_ids = target_ids.to('cuda')
    # 因为传入了target标签，因此会计算loss并且返回
    predictions, loss = bert_model(token_ids, labels=target_ids,)
    for _i, p_tensor in enumerate(predictions):
        if check_prediction(p_tensor, target_ids[_i], 1):
            correct += 1

In [None]:
print(get_tgt(target_ids[0]))
print(get_pred(predictions[0], 5))

In [None]:
455/554

In [None]:
388/554

In [13]:
print("n=5", str(253/554))
print("n=3", str(177/554))
print("n=1", str(84/554))

n=5 0.4566787003610108
n=3 0.3194945848375451
n=1 0.15162454873646208
