In [29]:
import pandas as pd
import torch

torch.cuda.current_device()
from torch import optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, BertTokenizer
from seqeval.metrics import f1_score

# Paths
TRAIN_PATH = '../data/ccks_2019_train_split.csv'
TEST_PATH = '../data/ccks_2019_test_split.csv'
# All three checkpoint locations currently point at the same directory.
MODEL_PATH1 = MODEL_PATH2 = MODEL_PATH_MAC = '../model/'
MODEL_LOCAL_PATH = '../RoBERTa_zh_L12_PyTorch'

# Hyperparameters
MAX_LEN, BATCH_SIZE, EPOCH = 256, 32, 50

# Presets
# Device: first CUDA device when one is available, CPU otherwise
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"

# tag2index: every label receives its position in this list as its id
tag2index = {
    label: position
    for position, label in enumerate([
        'O',
        'B-解剖部位', 'I-解剖部位',
        'B-手术', 'I-手术',
        'B-疾病和诊断', 'I-疾病和诊断',
        'B-影像检查', 'I-影像检查',
        'B-药物', 'I-药物',
        'B-实验室检验', 'I-实验室检验',
    ])
}
# Reverse lookup: id -> label
index2tag = dict(zip(tag2index.values(), tag2index.keys()))


In [30]:
# 预处理
def data_preprocessing(dataset, is_train):
    """Tokenize every text in ``dataset`` and, for training data, encode its BIO tags.

    The input DataFrame is only read; no helper columns are added to it.

    Args:
        dataset: DataFrame with a ``text`` column. When ``is_train`` is true it
            must also provide a ``BIO_anno`` column holding space-separated
            tag names that are keys of ``tag2index``.
        is_train: build the tag tensor when true; skip it otherwise.

    Returns:
        ``(token_texts, tags)``. ``token_texts`` is a list with one
        ``encode_plus`` result per row, padded/truncated to ``MAX_LEN``.
        ``tags`` is a ``LongTensor`` holding ``MAX_LEN`` tag ids per row, or
        ``None`` when ``is_train`` is false.

    Raises:
        KeyError: if an annotation contains a tag missing from ``tag2index``.
    """
    tokenizer = BertTokenizer.from_pretrained(MODEL_LOCAL_PATH)

    token_texts = []
    for text in tqdm(dataset['text'].tolist()):
        # The text is handed over as a list of single characters, presumably so
        # that every character yields exactly one id and stays aligned with
        # its tag (TODO confirm for the tokenizer version in use).
        tokenized = tokenizer.encode_plus(text=list(text),
                                          max_length=MAX_LEN,
                                          return_token_type_ids=True,
                                          return_attention_mask=True,
                                          return_tensors='pt',
                                          padding='max_length',
                                          truncation=True)
        token_texts.append(tokenized)

    # Only the training set is turned into a tag tensor.
    tags = None
    if is_train:
        pad_id = tag2index['O']
        tag_rows = []
        for annotation in tqdm(dataset['BIO_anno'].tolist()):
            # The leading and trailing 0 occupy the first and last slot of the
            # sequence (presumably the tokenizer's special tokens).
            index_list = [0] + [tag2index[t] for t in annotation.split(sep=' ')] + [0]
            if len(index_list) > MAX_LEN:
                # Truncate, keeping a closing 0 in the last slot.
                index_list = index_list[:MAX_LEN - 1] + [0]
            else:
                # Pad with the 'O' id up to MAX_LEN (no-op when already full).
                index_list += [pad_id] * (MAX_LEN - len(index_list))
            tag_rows.append(index_list)
        tags = torch.LongTensor(tag_rows)

    return token_texts, tags


In [31]:
import torch
from torch import nn
from torchcrf import CRF
from transformers import BertModel
from torch.utils.data import Dataset


class Bert_BiLSTM_CRF(nn.Module):
    """Sequence tagger: BERT encoder -> bidirectional LSTM -> linear layer -> CRF.

    Parameter names are unchanged, so existing ``state_dict`` files still load.
    Checkpoints trained before the LSTM layout fix in ``forward`` should be
    retrained, because the LSTM weights were fitted on a transposed input.
    """

    def __init__(self, tag2index):
        """Build all layers; ``len(tag2index)`` is the number of CRF tags."""
        super(Bert_BiLSTM_CRF, self).__init__()
        self.tagset_size = len(tag2index)

        # BERT encoder
        self.bert = BertModel.from_pretrained(MODEL_LOCAL_PATH)
        # BiLSTM: expects [batch, seq_len, 768] because batch_first=True
        self.lstm = nn.LSTM(input_size=768, hidden_size=128, num_layers=1, batch_first=True, bidirectional=True)
        # Dropout layer (NOTE: defined here but not applied anywhere in forward)
        self.dropout = nn.Dropout(p=0.1)
        # Dense layer: 2 * 128 BiLSTM features -> one score per tag
        self.dense = nn.Linear(in_features=256, out_features=self.tagset_size)
        # CRF layer (default layout: sequence dimension first)
        self.crf = CRF(num_tags=self.tagset_size)

        # Final (h_n, c_n) of the most recent forward pass
        self.hidden = None

    def neg_log_likelihood(self, emissions, tags=None, mask=None, reduction=None):
        """Return the negated CRF log-likelihood.

        All arguments are forwarded unchanged to ``self.crf``.
        """
        return -1 * self.crf(emissions=emissions, tags=tags, mask=mask, reduction=reduction)

    def forward(self, token_texts, tags):
        """
        token_texts:{"input_ids": tensor,  [batch, 1, seq_len]->[batch, seq_len]
                    "token_type_ids": tensor,  [batch, 1, seq_len]->[batch, seq_len]
                     "attention_mask": tensor  [batch, 1, seq_len]->[batch, seq_len]->[seq_len, batch]
                     }
        tags:  [batch, seq_len]->[seq_len, batch], or None for inference
        bert_out:  [batch, seq_len, hidden_size(768)]
        self.hidden:  (h_n, c_n), each [num_layers * num_directions(2), batch, hidden_size(128)]
        out:  [batch, seq_len, hidden_size * 2(256)]
        lstm_feats:  [batch, seq_len, tagset_size]->[seq_len, batch, tagset_size]
        loss:  tensor
        predictions:  [batch, num]

        Returns (loss, predictions) when tags is given, predictions otherwise.
        """
        input_ids = token_texts['input_ids'].squeeze(1)
        token_type_ids = token_texts['token_type_ids'].squeeze(1)
        masks = token_texts['attention_mask'].squeeze(1)
        bert_out = self.bert(input_ids=input_ids, attention_mask=masks, token_type_ids=token_type_ids)[0]

        # Keep [batch, seq_len, 768]: self.lstm is batch_first=True, so permuting
        # before the LSTM would make it recur over the batch dimension and mix
        # unrelated samples. No initial state is passed, so nn.LSTM starts from
        # zeros and inference is deterministic.
        out, self.hidden = self.lstm(bert_out)

        # The CRF uses the sequence-first layout, so transpose only here.
        lstm_feats = self.dense(out).permute(1, 0, 2)
        masks = masks.permute(1, 0).bool()

        # Viterbi decoding is needed on both paths.
        predictions = self.crf.decode(emissions=lstm_feats, mask=masks)  # [batch, variable length]
        if tags is None:
            return predictions
        loss = self.neg_log_likelihood(lstm_feats, tags.permute(1, 0), masks, 'mean')
        return loss, predictions

In [32]:
class NerDataset(Dataset):
    """Dataset that pairs each tokenized text with its row of tag ids."""

    def __init__(self, token_texts, tags):
        """Keep references to the tokenized texts and to the tags (may be None)."""
        super().__init__()
        self.token_texts, self.tags = token_texts, tags

    def __len__(self):
        """Number of samples, i.e. the number of tokenized texts."""
        return len(self.token_texts)

    def __getitem__(self, index):
        """Return ``{"token_texts": ..., "tags": ...}``; ``tags`` is None when no tags were given."""
        sample = {"token_texts": self.token_texts[index]}
        if self.tags is None:
            sample["tags"] = None
        else:
            sample["tags"] = self.tags[index]
        return sample


class NerDatasetTest(Dataset):
    """Dataset for unlabeled texts; every sample carries the constant 0 as ``tags``."""

    def __init__(self, token_texts):
        """Keep a reference to the tokenized texts."""
        super().__init__()
        self.token_texts = token_texts

    def __len__(self):
        """Number of samples, i.e. the number of tokenized texts."""
        return len(self.token_texts)

    def __getitem__(self, index):
        """Return the tokenized text at ``index`` together with the placeholder tag 0."""
        tokenized = self.token_texts[index]
        return dict(token_texts=tokenized, tags=0)

In [33]:
from torch import nn
from torchcrf import CRF
from transformers import BertModel


class Bert_CRF(nn.Module):
    """Sequence tagger: BERT encoder -> linear layer -> CRF."""

    def __init__(self, tag2index):
        """Build all layers; ``len(tag2index)`` is the number of CRF tags."""
        super(Bert_CRF, self).__init__()
        self.tagset_size = len(tag2index)

        # BERT encoder
        self.bert = BertModel.from_pretrained(MODEL_LOCAL_PATH)
        # Dense layer: 768 BERT features -> one score per tag
        self.dense = nn.Linear(in_features=768, out_features=self.tagset_size)
        # CRF layer (default layout: sequence dimension first)
        self.crf = CRF(num_tags=self.tagset_size)

        # Hidden state placeholder (never assigned by this class)
        self.hidden = None

    def neg_log_likelihood(self, emissions, tags=None, mask=None, reduction=None):
        """Return the negated CRF log-likelihood.

        All arguments are forwarded unchanged to ``self.crf``.
        """
        return -1 * self.crf(emissions=emissions, tags=tags, mask=mask, reduction=reduction)

    def forward(self, token_texts, tags):
        """
        token_texts:{"input_ids": tensor,  [batch, 1, seq_len]->[batch, seq_len]
                    "token_type_ids": tensor,  [batch, 1, seq_len]->[batch, seq_len]
                     "attention_mask": tensor  [batch, 1, seq_len]->[batch, seq_len]->[seq_len, batch]
                     }
        tags:  [batch, seq_len]->[seq_len, batch], or None for inference
        bert_out:  [batch, seq_len, hidden_size(768)]->[seq_len, batch, hidden_size]
        feats:  [seq_len, batch, tagset_size]
        loss:  tensor
        predictions:  [batch, num]

        Returns (loss, predictions) when tags is given, predictions otherwise.
        """
        # Look the tensors up by key instead of unpacking .values(), so the
        # result does not depend on the insertion order of the mapping.
        input_ids = token_texts['input_ids'].squeeze(1)
        token_type_ids = token_texts['token_type_ids'].squeeze(1)
        masks = token_texts['attention_mask'].squeeze(1)
        bert_out = self.bert(input_ids=input_ids, attention_mask=masks, token_type_ids=token_type_ids)[0]

        # The CRF uses the sequence-first layout.
        feats = self.dense(bert_out.permute(1, 0, 2))
        masks = masks.permute(1, 0).bool()

        # Viterbi decoding is needed on both paths.
        predictions = self.crf.decode(emissions=feats, mask=masks)
        if tags is None:
            return predictions
        loss = self.neg_log_likelihood(feats, tags.permute(1, 0), masks, 'mean')
        return loss, predictions



In [34]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import precision_score, recall_score

# 计算f1值
def get_f1_score(tags, predictions):
    """Score one batch of predictions against its gold tags with seqeval.

    The first and last position of every prediction are dropped before
    scoring, and the gold row is cut to the same span (it starts at index 1).
    Neither ``tags`` nor ``predictions`` is modified.

    Args:
        tags: tensor of gold tag ids, one row per sample.
        predictions: list of per-sample lists of predicted tag ids, in the
            same sample order as ``tags``.

    Returns:
        dict with the keys ``'recall'``, ``'predcision'`` (spelling kept
        because callers read this key) and ``'micro_f1'``.
    """
    gold_rows = tags.detach().cpu().tolist()

    final_tags = []
    final_predictions = []
    for gold, predicted in zip(gold_rows, predictions):
        # Strip the first and last slot from the prediction.
        inner = predicted[1:-1]
        # Gold ids for the same span: positions 1 .. len(inner).
        final_tags.append([index2tag[x] for x in gold[1:1 + len(inner)]])
        final_predictions.append([index2tag[x] for x in inner])

    f1 = f1_score(final_tags, final_predictions, average='micro')
    predcision = precision_score(final_tags, final_predictions)
    recall = recall_score(final_tags, final_predictions)

    return {
        'recall': recall,
        'predcision': predcision,
        'micro_f1': f1,
    }

In [35]:
# 训练
def train(train_dataloader, model, optimizer, epoch):
    """Run one optimisation pass over ``train_dataloader``.

    Args:
        train_dataloader: yields dicts with a ``'token_texts'`` and a ``'tags'``
            entry, both of which are moved to ``DEVICE`` with ``.to``.
        model: called as ``model(token_texts, tags)``; must return
            ``(loss, predictions)``.
        optimizer: optimizer that updates the model's parameters.
        epoch: epoch number, used only in the progress line.
    """
    # Switch to training mode explicitly, so layers such as dropout behave
    # correctly even if the caller evaluated this model beforehand.
    model.train()
    for i, batch_data in enumerate(train_dataloader):
        token_texts = batch_data['token_texts'].to(DEVICE)
        tags = batch_data['tags'].to(DEVICE)
        loss, predictions = model(token_texts, tags)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report metrics for the current batch every 500 steps.
        if i % 500 == 0:
            f1_obj = get_f1_score(tags, predictions)
            micro_f1 = f1_obj['micro_f1']
            predcision = f1_obj['predcision']
            recall = f1_obj['recall']
            print(f'Epoch:{epoch} | i:{i} | loss:{loss.item()} | Micro_F1:{micro_f1} | predcision: {predcision} | recall: {recall}')



In [16]:
# Load the training CSV and show it as the cell output for a quick look at its columns.
train_dataset = pd.read_csv(TRAIN_PATH, encoding='utf8')
train_dataset

Unnamed: 0.1,Unnamed: 0,text,BIO_anno
0,0,，患者2008年9月3日因“腹胀，发现腹部包块”在我院腹科行手术探查，术中见盆腹腔肿物，与肠...,O O O O O O O O O O O O O O B-解剖部位 O O O O B-解...
1,1,于2015-7-6行剖腹探查+膀胱旁肿物切除+骶前肿物切除+肠表面肿物切除术，术程顺利，，术...,O O O O O O O O O O B-手术 I-手术 I-手术 I-手术 I-手术 I...
2,2,，患者于2011年9月29日在我院因“子宫内膜癌II期”在全麻上行“广泛全子宫切除+两侧附件...,O O O O O O O O O O O O O O O O O O O B-疾病和诊断 ...
3,3,术程顺利，，术后病理回报：腹水未见癌；（全子宫+两附件）送检子宫大小为10*6*4CM，宫腔...,O O O O O O O O O O O O O B-解剖部位 O O O O O O B...
4,4,于2011年10月11日、11月16日行TP（泰素+伯尔定）方案化疗2程，化疗后出现轻度恶心...,O O O O O O O O O O O O O O O O O O O O O O O ...
...,...,...,...
2695,2738,，患者1月前体检发现直肠肿物，，外院肠镜提示：直肠距肛门5-9CM肿物，，活检病理：腺癌，完...,O O O O O O O O O O B-解剖部位 I-解剖部位 O O O O O O ...
2696,2739,"腹泻 1度，皮肤反应1度，恶心0度，乏力1度，泌尿系0度，骨髓抑制0度,2016-6-2继续...",B-解剖部位 O O O O O O O O O O O O O O O O O O O O...
2697,2740,，患者2012-05-25前因“便血3月余”入院，经肠镜及病理检查确诊乙状结肠腺癌。外院影像...,O O O O O O O O O O O O O O O O O O O O O O O ...
2698,2741,2013年3月13日、2013年4月02日、2013年4月23日、2013年5月16日、20...,O O O O O O O O O O O O O O O O O O O O O O O ...


In [9]:
# Reload the training CSV so this cell does not depend on an earlier one.
train_dataset = pd.read_csv(TRAIN_PATH, encoding='utf8')
# Preprocessing: tokenize the texts and encode the BIO tags
token_texts, tags = data_preprocessing(train_dataset, is_train=True)
# Dataset loading

# Spot-check: encoded tag row of sample 20
tags[20]

100%|██████████| 28933/28933 [00:13<00:00, 2216.88it/s]
100%|██████████| 28933/28933 [00:00<00:00, 39340.77it/s]


tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        5, 6, 6, 6, 6, 0, 0, 7, 8, 0, 0, 0, 5, 6, 6, 0, 1, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
        1, 0, 1, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [24]:
# Spot-check: tokenizer output for sample 20
token_texts[20]


{'input_ids': tensor([[ 101, 3389,  860, 8038,  100, 8038,  124,  127,  119,  126,  360,  510,
          100, 8038,  128,  129, 3613,  120, 1146,  510,  100, 8038,  122,  129,
         3613,  120, 1146,  510,  100,  100,  122,  124,  121,  120,  129,  121,
          100,  155,  155,  100,  149, 8024,  860, 1798,  974, 4607, 8024, 4868,
         2562, 3926, 3504, 8024, 6427, 6241, 3837, 1164, 8024, 6134, 2658, 5632,
         4197, 8024, 3635, 1057, 4567, 2147, 8024, 5632, 1220,  860,  855, 8024,
         3389,  860, 1394,  868,  511, 2552, 5511, 5592, 7346, 2595, 8024, 5592,
         2398, 1788, 8024, 5498, 5569, 5490,  678, 3313, 1350, 8024, 1059, 5592,
         3313, 6239, 1350, 2460, 2382, 1259, 1779, 8024, 3187, 1327, 4578, 1353,
         6663, 4578, 1350, 5491, 5165, 2476,  511, 5592, 1371, 4919, 1220, 2595,
         3843, 7509, 7346, 2595, 8039, 5499, 7885, 7509, 3633, 2382, 8024, 3313,
         7319, 1350, 7770, 6444, 5499, 7885, 7509, 1350, 3698, 6814, 3717, 1898,
          511,

In [25]:
# Spot-check: shape of the attention mask of sample 20
token_texts[20]['attention_mask'].shape


torch.Size([1, 256])

In [36]:
def execute():
    """Train the tagger, saving a checkpoint and evaluating it after every epoch.

    Reads the training CSV at ``TRAIN_PATH``, trains for ``EPOCH`` epochs,
    writes ``<MODEL_PATH2><epoch>.pkl`` after each epoch, runs ``test`` on that
    checkpoint, and finally writes ``<MODEL_PATH_MAC>final.pkl``.
    """
    import os  # local import: only needed to create the checkpoint directories

    # Load and preprocess the training set
    train_frame = pd.read_csv(TRAIN_PATH, encoding='utf8')
    token_texts, tags = data_preprocessing(train_frame, is_train=True)
    train_dataloader = DataLoader(dataset=NerDataset(token_texts, tags),
                                  batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

    # Build the model (Bert_CRF(tag2index=tag2index) is the alternative without the BiLSTM)
    model = Bert_BiLSTM_CRF(tag2index=tag2index).to(DEVICE)

    # Optimizer: the CRF parameters use a learning rate 100x the base rate
    crf_param_ids = {id(p) for p in model.crf.parameters()}
    base_params = [p for p in model.parameters() if id(p) not in crf_param_ids]
    optimizer_params = [
        {'params': base_params},
        {'params': list(model.crf.parameters()), 'lr': 2e-5 * 100},
    ]
    optimizer = optim.AdamW(optimizer_params, lr=2e-5)

    # The CUDA queries raise on a CPU-only machine, so only report when a GPU exists.
    if torch.cuda.is_available():
        print(f"GPU_NAME:{torch.cuda.get_device_name()} | Memory_Allocated:{torch.cuda.memory_allocated()}")

    # Create the checkpoint directories up front, so that the first save
    # cannot fail only after a whole epoch has been trained.
    for checkpoint_prefix in (MODEL_PATH2, MODEL_PATH_MAC):
        checkpoint_dir = os.path.dirname(checkpoint_prefix)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)

    # Training loop
    for i in range(EPOCH):
        print(f"{i} Epoch")
        train(train_dataloader, model, optimizer, i)
        # Save this epoch's checkpoint and evaluate it
        checkpoint_path = MODEL_PATH2 + str(i) + '.pkl'
        torch.save(model.state_dict(), checkpoint_path)
        # Label the report with the epoch index, not with the total epoch count.
        test(checkpoint_path, i)

    # Save the final model
    torch.save(model.state_dict(), MODEL_PATH_MAC + 'final.pkl')



In [37]:
from seqeval.metrics import classification_report

#测试集预测实体标签
def test(model_path, model_id):
    """Evaluate a saved checkpoint on the test set and print/return its metrics.

    Side effect: writes the predicted entity characters of every text to
    ``'../result.csv'``.

    Args:
        model_path: path of a ``state_dict`` saved from ``Bert_BiLSTM_CRF``.
        model_id: label printed in front of every metric line.

    Returns:
        dict with the keys ``'f1'``, ``'pred'`` (precision), ``'recall'`` and
        ``'report'`` (seqeval classification report string).
    """
    # Load the test set and its gold tag sequences
    test_dataset = pd.read_csv(TEST_PATH, encoding='utf8')
    test_tags_true = [anno.split(sep=' ') for anno in test_dataset['BIO_anno'].tolist()]

    # Preprocess and wrap in a loader (no shuffling: predictions must stay in row order)
    token_texts, _ = data_preprocessing(test_dataset, is_train=False)
    test_dataloader = DataLoader(dataset=NerDatasetTest(token_texts),
                                 batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

    # Build the model (Bert_CRF(tag2index) is the alternative without the BiLSTM).
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    model = Bert_BiLSTM_CRF(tag2index).to(DEVICE)
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))

    # Predict
    model.eval()
    predictions_list = []
    with torch.no_grad():
        for batch_data in test_dataloader:
            batch_tokens = batch_data['token_texts'].to(DEVICE)
            predictions_list.extend(model(batch_tokens, None))

    # Convert the predicted ids to text form
    id2tag = {v: k for k, v in tag2index.items()}
    entity_tag_list = []
    result = []
    for text, predictions in zip(test_dataset['text'], predictions_list):
        text_entity_tag = []
        result_tag = []
        # Drop the first and last predicted position before pairing with the characters.
        for c, t in zip(text, predictions[1:-1]):
            if t != 0:
                text_entity_tag.append(c + id2tag[t])
                result_tag.append(id2tag[t])
            else:
                result_tag.append('O')
        result.append(result_tag)
        entity_tag_list.append(" ".join(text_entity_tag))  # one space-joined string per text

    result_df = pd.DataFrame(data=entity_tag_list, columns=['result'])
    result_df.to_csv('../result.csv')

    # Cut each gold sequence to the length that was actually predicted.
    # NOTE: gold tags beyond that length are therefore not scored.
    for row, predicted in enumerate(result):
        test_tags_true[row] = test_tags_true[row][:len(predicted)]

    predcision = precision_score(test_tags_true, result)
    recall = recall_score(test_tags_true, result)
    f1 = f1_score(test_tags_true, result)
    class_report = classification_report(test_tags_true, result, digits=6)

    print(f"{model_id} test f1_score = " + str(f1))
    print(f"{model_id} test predcision_score = " + str(predcision))
    print(f"{model_id} test recall_score = " + str(recall))
    print(f"{model_id} test report = " + class_report)

    return {
        "f1": f1,
        "pred": predcision,
        'recall': recall,
        'report': class_report,
    }

In [None]:
execute()