In [9]:
import torch
from transformers import BertConfig,BertTokenizer,BertModel,BertForMultipleChoice
from torch import nn
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import matplotlib.pyplot as plt
from IPython import display
from d2l import torch as d2l
import numpy as np
import pandas as pd

In [2]:
# 加载数据
excel_file = 'D:/基于深度学习的海量文本处理/第1阶段/10w.xlsx'
data_frame = pd.read_excel(excel_file)

In [3]:
# 停用词预处理
stop_words = ['您好','你好','很高兴为您服务','请问有什么可以帮您','client','user',':',' ']
def ProcessStopWords(text):
    for word in stop_words:
        text = text.replace(word,'')
    return text

data_frame['转写文本'] = data_frame['转写文本'].map(ProcessStopWords)

In [4]:
# 标签预处理
def ProcessLabels(text):
    word = '>>'
    text = text.replace(word,'>')
    return text

data_frame['服务请求'] = data_frame['服务请求'].map(ProcessLabels)

In [5]:
# 预览预处理结果
data_frame['转写文本'].iloc[:10]

0    哎我想问一下,我这个一三九的号码那个扣费方式是怎么样的扣费方式的话它是每个月呢就是每个月的一...
1    喂我那个宽带不能用宽带故障了是吗哦现在不能用前前几天就不能用了稍等我帮您看一下嗯稍等一下3天...
2    哎我我办理这个那个我想问一下我这个号码有没有开通5g套餐呀我看一下您这里有开通5g业务可以享...
3    请不要挂机您拨叫的用户正在通话中请唔好挂机您拨叫凯用户正在通话中嗯由于您多次没有声音我将结束...
4    喂我想咨询一下我的那个联通卡怎么回事啊,我都没用过嗯您可以提供一下吗这个号码就是我我号码我都...
5    嗯我想问一下我这个卡是啊升了5g上次是帮我升了5g然后现在是月租是54.5元上网费又是30一...
6                                                不要我有卡
7    我的号码为啥暂暂停服务了稍等一下我帮您看了一下的话您之前的话有反映过这个问题的是吧然后的话我...
8    哎你查一下我这个话费还有余额吗您这边还有8.95元怎么打不了怎么没有信号呢网络没有了呢呃信号...
9    哎先生哦嗯我想问一下现在如果这个号码要补号码的话那个身份证复印件有没有有效的补卡补卡要原件的...
Name: 转写文本, dtype: object

In [118]:
prompts = np.array(data_frame['转写文本'])
choices = np.array(data_frame['服务请求'])

In [127]:
# 去重choices，并保存原choices对应去重后的位置
unique_choices = np.unique(choices)
labels = np.array([np.argwhere(unique_choices==v)[0] for v in choices])
unique_choices.shape, labels.shape

((526,), (100000, 1))

In [183]:
# 基于 BertForMultipleChoice 模型
class BertForMutipleChoiceModel(nn.Module):
    def __init__(self, UseGPU, model_path, batch_size, max_length):
        super().__init__()
        self.batch_size = batch_size
        self.model_path = model_path
        self.max_length = max_length # 保存问题-答案最长编码
        self.model_config = BertConfig.from_pretrained(self.model_path)
        self.tokenizer = BertTokenizer.from_pretrained(self.model_path)
        self.bertchoice = BertForMultipleChoice.from_pretrained(self.model_path, config=self.model_config)
        self.try_gpu(UseGPU)
        
    def try_gpu(self, bUse):
        if torch.cuda.device_count() and bUse == True >= 1:
            self.device = 'cuda:0'
        else:
            self.device = 'cpu'
        self.bertchoice.to(self.device) # 迁移模型到device
    
    def format_choices(self, choices_text):
        '''choices_text是已去重的np array(num_choices,)，且对应的答案label在外部维护。
        对答案进行全局编码，方便后续使用，后续训练只需传入labels号'''
        self.num_choices = choices_text.shape[0] # 答案的数量
#         self.choices = np.repeat(np.reshape(choices_text, (1, -1)), self.batch_size, axis=0).tolist()
        self.choices = choices_text.tolist()
        
        
    def forward(self, prompt, labels):
        '''prompt是问题文本 np array（batch_size*1）, label是对应这个问题的答案标号(batch_size,)'''
        '''labels 为 tensor(batch_size, 1)'''
        prompts =  np.repeat(prompt, self.num_choices, axis=1).tolist() # 重复问题准备拼接
#         print('prompts', prompts)
        # 由于 tokenizer 每次只能一个样本，需要通过循环处理
        bFirst = True
        save_encoding = None
        for each_prompt in prompts:
#             print('each_prompt', each_prompt)
            encoding = self.tokenizer(each_prompt, self.choices, return_tensors='pt', padding='max_length', 
                                      truncation=True, max_length=self.max_length)
            if bFirst:
                bFirst = False
                save_encoding = encoding
                # 首次升维
                for key in save_encoding:
                    save_encoding[key] = save_encoding[key].unsqueeze(0)
#                 print('save_encoding1', save_encoding)
            else:
                for key in save_encoding.keys():
                    # 将 encodings 连接并输入模型
#                     print('wait_to_concat', encoding)
                    save_encoding[key] = torch.concat((save_encoding[key], encoding[key].unsqueeze(0)), dim=0)
                
#             print('encoding', encoding)
#         print('save_encoding', save_encoding)

        # 迁移到 device
        labels = labels.to(self.device)
        for key in save_encoding.keys(): # for 循环迭代器
            save_encoding[key] = save_encoding[key].to(self.device)
    
        outputs = self.bertchoice(**{k: v for k, v in save_encoding.items()}, labels=labels) # 第 0 维为 batch_size

        return outputs

In [192]:
a = torch.tensor([2])
a = a.to('cuda')
a
dic = {'s':a}
for v in dic.values():
    v = v.to('cpu')
for v in dic.values():
    print(v)
# print(dic)

tensor([2], device='cuda:0')


In [186]:
# 测试模型
m = BertForMutipleChoiceModel(True, './bert-base-chinese/', 2, 100)
prompt = np.array([["你是大聪明还是大傻逼"],["你是学霸还是学渣"]])
# print(prompt.shape)
choices = np.array(["大聪明.","大傻逼."])
labels = torch.tensor([1, 0])
m.format_choices(choices)
print('choices', np.array(m.choices))
print(m.forward(prompt,labels=labels))

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at ./bert-base-chinese/ and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


choices ['大聪明.' '大傻逼.']


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)

In [None]:
# 创建 Dataloader 准备训练

In [141]:
# 官方示例代码
model_path = './bert-base-chinese/'
# m = BertForMultipleChoice.from_pretrained(model_path)


tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForMultipleChoice.from_pretrained(model_path)

prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1

encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)
print(encoding)
outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=labels)  # batch size is 1

# the linear classifier still needs to be trained
loss = outputs.loss
logits = outputs.logits

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at ./bert-base-chinese/ and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_ids': tensor([[  101,   100,   100,   117, 10315,  9342,  8180, 11667,  8217, 12244,
          8315,  9738, 13288,   117, 11541,  8370,  8975,  8243,   143, 10637,
           117,  8310, 11685, 11836,  8303,   163,  8727, 11809,  8303,   119,
           102,   100,  8310,  9714, 11598,  8663,   143,  8330,  8197,  8256,
           143,   153,  8833,  9568,   119,   102],
        [  101,   100,   100,   117, 10315,  9342,  8180, 11667,  8217, 12244,
          8315,  9738, 13288,   117, 11541,  8370,  8975,  8243,   143, 10637,
           117,  8310, 11685, 11836,  8303,   163,  8727, 11809,  8303,   119,
           102,   100,  8310,  9714, 11598,   165,  8963,  8268,  9245,  8635,
          8217,  8174, 12126,  8168,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0