In [1]:
from model import *
# from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# coding: UTF-8
import torch
import numpy as np
import pickle as pkl
from tqdm import tqdm
from datetime import timedelta
import time, json, random, os

class Config(object):

    """配置参数"""
    def __init__(self):
        self.database_path = r"/home/liangzida/workspace/ZhongYiPapers/manual_labeling_html/Database/"
        self.titles_keywords_path = self.database_path + '/titles_keywords.json'
        self.classified_titles_keywords_path = self.database_path + '/classified_titles_keywords.json'
        self.vocab_path = self.database_path + '/vocab.pkl'                            # 词表
        save_paths = os.listdir(self.database_path + '/Transformer/saved_dict/')
        exp_index = 1
        while True:
            if str(exp_index) not in save_paths:
                break
            else:
                exp_index += 1
        os.makedirs(self.database_path + '/Transformer/saved_dict/' + str(exp_index))
        self.save_path = self.database_path + '/Transformer/saved_dict/' + str(exp_index) + '/model'          # 模型训练结果
        self.log_path = self.database_path + '/Transformer/log/model'
        self.class_list = ['xiyi', 'zhongyi']                                          # 类别名单
        self.vocab_list = [x[:-1] for x in open(
            self.database_path + '/vocab.txt', encoding='utf-8').readlines()]          # 类别名单
        self.vocab_size = len(self.vocab_list)                                         # 类别数
        self.num_classes = len(self.class_list)                                        # 类别数
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')     # 设备
        self.train_ratio = 0.9

        self.dropout = 0.5                                              # 随机失活
        self.require_improvement = 3000000                                 # 若超过1000epoch效果还没提升，则提前结束训练
        self.n_vocab = None                                             # 词表大小，在运行时赋值
        self.num_epochs = 200                                           # epoch数
        self.batch_size = 128                                           # mini-batch大小
        self.shuffle = True
        self.padding = False
        self.pad_size = 250                                             # 每句话处理成的长度(短填长切)
        self.learning_rate = 5e-4                                       # 学习率
        self.embed = 300                                                # 字向量维度
        self.dim_model = 300
        self.hidden = 1024
        self.last_hidden = 512
        self.num_head = 5
        self.num_encoder = 2

MAX_VOCAB_SIZE = 10000  # 词表长度限制
CLS, SEP, MASK, UNK, PAD = '<CLS>', '<SEP>', '<MASK>', '<UNK>', '<PAD>'

In [None]:

class DatasetIterater(object):
    def __init__(self, batches, batch_size, shuffle, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # 记录batch数量是否为整数
        if (self.n_batches==0) or (len(batches) % self.n_batches != 0):
            self.residue = True
        self.index = 0
        self.shuffle = shuffle
        self.device = device
        if self.shuffle:
            random.shuffle(self.batches)

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)

        # pad前的长度(超过pad_size的设为pad_size)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        return (x, seq_len), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

        elif self.index >= self.n_batches:
            self.index = 0
            if self.shuffle:
                random.shuffle(self.batches)
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches

class DatasetIterater_no_padding(object):
    def __init__(self, data, batch_size, shuffle, device):
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.device = device
        # 按照序列长度对数据进行分类
        self.data_sized = {}
        for item in self.data:
            if item[2] not in self.data_sized:
                self.data_sized[item[2]] = [item]
            else:
                self.data_sized[item[2]].append(item)
        # 将不同序列长度的数据导入进迭代器中
        self.data_iters = []
        self.index = 0
        self.batch_num = 0
        for seq_len in self.data_sized:
            self.data_iters.append(DatasetIterater(self.data_sized[seq_len], batch_size, shuffle, device))
            # 计算batch数量
            self.batch_num += len(self.data_iters[self.index])
            self.index += 1
        self.index = 0
        # 数据index队列
        self.iter_queue = list(range(len(self.data_iters)))

    def __next__(self):
        if self.shuffle:
            # 如果遍历完毕则索引归零，队列重置
            if len(self.iter_queue)==0:
                self.index = 0
                self.iter_queue = list(range(len(self.data_iters)))
                raise StopIteration
            # 随机选取某个序列长度的数据集
            _index = random.randint(0, len(self.iter_queue)-1)
            result = next(self.data_iters[self.iter_queue[_index]])
            # 如果该数据集遍历完毕，则从队列里删除该数据集
            if self.data_iters[self.iter_queue[_index]].index==len(self.data_iters[self.iter_queue[_index]]):
                try:
                    _ = next(self.data_iters[self.iter_queue[_index]])
                except:
                    pass
                del self.iter_queue[_index]
            self.index += 1
            return result
        else:
            # 如果遍历完毕则索引归零，队列重置
            if len(self.iter_queue)==0:
                self.index = 0
                self.iter_queue = list(range(len(self.data_iters)))
                raise StopIteration
            _index = 0
            result = next(self.data_iters[self.iter_queue[_index]])
            # 如果该数据集遍历完毕，则从队列里删除该数据集
            if self.data_iters[self.iter_queue[_index]].index==len(self.data_iters[self.iter_queue[_index]]):
                try:
                    _ = next(self.data_iters[self.iter_queue[_index]])
                except:
                    pass
                del self.iter_queue[_index]
            self.index += 1
            return result

    def __iter__(self):
        return self

    def __len__(self):
        return self.batch_num

def build_iterator(dataset, config):
    if config.padding:
        iter = DatasetIterater(dataset, config.batch_size, config.shuffle, config.device)
    else:
        iter = DatasetIterater_no_padding(dataset, config.batch_size, config.shuffle, config.device)
    return iter


def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))



In [None]:

def build_dataset(config, dataset_type='unlabeled'):
    # 读取数据集
    # config.save_path = r"D:\vscode_workspace\database\titles_keywords.json.json"
    if dataset_type == 'unlabeled':
        with open(config.titles_keywords_path, "r", encoding='utf-8') as file:
            titles_keywords = json.load(file)
    else:
        with open(config.classified_titles_keywords_path, "r", encoding='utf-8') as file:
            titles_keywords = json.load(file)
    # 以字分割
    tokenizer = lambda x: [y for y in x]  # char-level
    # 读取词表
    vocab = pkl.load(open(config.vocab_path, 'rb'))
    print(f"Vocab size: {len(vocab)}")

    def load_dataset(dataset_type='unlabeled', padding=False, pad_size=320):
        contents = []
        for title in tqdm(titles_keywords.keys()):
            token = [CLS]
            token.extend(tokenizer(title))
            if dataset_type == 'unlabeled':
                for keyword in titles_keywords[title]:
                    token.append(SEP)
                    token.extend(tokenizer(keyword))
            else:
                for keyword in titles_keywords[title][0]:
                    token.append(SEP)
                    token.extend(tokenizer(keyword))
            seq_len = len(token)
            if padding:
                if len(token) < pad_size:
                    token.extend([PAD] * (pad_size - len(token)))
                else:
                    token = token[:pad_size]
                    seq_len = pad_size
            # word to id
            words_line = []
            for word in token:
                words_line.append(vocab.get(word, vocab.get(UNK)))
            contents.append((words_line, -1 if dataset_type == 'unlabeled' else titles_keywords[title][1], seq_len))
        # 打乱数据集顺序，使得train和dev数据集分布相同
        random.shuffle(contents)
        raw_contents = contents
        ratio = config.train_ratio
        return raw_contents[:int(len(raw_contents)*(ratio))], raw_contents[int(len(raw_contents)*(ratio)):], raw_contents[int(len(raw_contents)*(ratio)):]
        # if data_type=='train':
        #     contents = raw_contents[:int(len(raw_contents)*(ratio))]
        # elif data_type=='dev':
        #     contents = raw_contents[int(len(raw_contents)*(ratio)):]
        # else:
        #     contents = raw_contents[int(len(raw_contents)*(ratio)):]
        # return contents
    train, dev, test = load_dataset(dataset_type=dataset_type, padding=config.padding, pad_size=config.pad_size)
    # dev = load_dataset(dataset_type=dataset_type, padding=config.padding, pad_size=config.pad_size, data_type='dev')
    # test = load_dataset(dataset_type=dataset_type, padding=config.padding, pad_size=config.pad_size, data_type='test')
    # test = load_dataset(padding=config.padding, pad_size=config.pad_size, data_type='test')
    return vocab, train, dev, test

In [2]:
start_time = time.time()
print("Loading data...")
config = Config()

vocab, train_dataset, dev_dataset, test_dataset = build_dataset(config)
_, train_dataset_labeled, dev_dataset_labeled, test_dataset_labeled = build_dataset(config, dataset_type='labeled')

train_iter = build_iterator(train_dataset, config)
# dev_iter = build_iterator(dev_dataset, config)
# test_iter = build_iterator(test_dataset, config)

train_iter_labeled = build_iterator(train_dataset_labeled, config)
# dev_iter_labeled = build_iterator(dev_dataset_labeled, config)
# test_iter_labeled = build_iterator(test_dataset_labeled, config)

config.n_vocab = len(vocab)

Loading data...
Vocab size: 5585


100%|██████████| 430889/430889 [00:06<00:00, 62100.59it/s]


Vocab size: 5585


100%|██████████| 2851/2851 [00:00<00:00, 75168.06it/s]


In [17]:
dev_iter = build_iterator(dev_dataset, config)
dev_iter_labeled = build_iterator(dev_dataset_labeled, config)

In [16]:
len(train_dataset), len(dev_dataset), len(test_dataset)

(387800, 43089, 43089)

In [3]:
model = torch.load(r"/home/liangzida/workspace/ZhongYiPapers/manual_labeling_html/Database/Transformer/saved_dict/18/model_Dacc_74_77_245600.pth")

In [4]:
model = model.to('cuda:0')

In [5]:
id2vocab = dict([val,key] for key,val in vocab.items())

In [6]:
input = next(train_iter_labeled)
input

((tensor([[5580,    2,    7,  ...,   55,   85,   14],
          [5580,    9, 1730,  ...,  867,   58, 1668],
          [5580,  125,   12,  ...,  106,   41,   67],
          ...,
          [5580,    2,    7,  ...,    7,   31,    8],
          [5580,  580,  812,  ..., 5581,   45,   60],
          [5580,   84,    9,  ...,   41,  152,  859]], device='cuda:0'),
  tensor([60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
          60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60],
         device='cuda:0')),
 tensor([1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
         1, 1, 1, 0, 1, 1, 1, 1, 0], device='cuda:0'))

In [7]:
model.eval()
output = model(input[0][0], recover_or_classify='classification')
predic = torch.max(output.data, 1)[1]
predic, input[1]

(tensor([1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
         1, 1, 1, 0, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
         1, 1, 1, 0, 1, 1, 1, 1, 0], device='cuda:0'))

In [8]:
# raw_content = [""]*input[0][0].size(0)
raw_content = {}
for j in range(input[0][0].size(0)):
    tmp = ""
    for i in range(input[0][0].size(1)):
        tmp+=id2vocab[input[0][0][j][i].item()]
    raw_content[tmp] = (input[1][j].item(), predic[j].item())
(raw_content)

{'<CLS>中药对Aβ25-35损伤的SH-SY5Y细胞的保护作用<SEP>阿尔茨海默病<SEP>中药<SEP>β淀粉样蛋白<SEP>SH-SY5Y细胞<SEP>保护作用': (1,
  1),
 '<CLS>血友病甲合并慢性粒细胞白血病1例报告<SEP>血友病甲<SEP>慢性粒细胞白血病<SEP>Ph染色体<SEP>bcr/abl融合基因<SEP>伊马替尼<SEP>羟基脲': (0,
  0),
 '<CLS>灸法治疗慢性盆腔炎研究概况<SEP>慢性盆腔炎<SEP>Inflammatory<SEP>治疗<SEP>灸法<SEP>妇科常见病<SEP>反复腹痛<SEP>月经失调<SEP>生活质量': (1,
  1),
 '<CLS>基于单味药-配伍的合欢花与酸枣仁水提取物对焦虑性抑郁障碍动物模型的影响<SEP>焦虑性抑郁<SEP>行为学<SEP>单胺神经递质<SEP>神经营养物质': (1,
  1),
 '<CLS>胃溃疡及根治幽门螺旋杆菌应用兰索拉唑或雷贝拉唑治疗成本-效果研究<SEP>胃溃疡<SEP>幽门螺旋杆菌<SEP>兰索拉唑<SEP>雷贝拉唑<SEP>成本<SEP>效果': (0,
  0),
 '<CLS>基于网络药理学及分子对接技术研究泽漆萜类成分抗非小细胞肺癌作用机制<SEP>网络药理学<SEP>分子对接<SEP>非小细胞肺癌<SEP>泽漆<SEP>萜类成分': (0,
  0),
 '<CLS>利水化瘀方在乳头内陷矫正术后乳头组织肿胀中的应用研究<SEP>乳头内陷<SEP>去表皮双菱形皮瓣推进法<SEP>真皮瓣<SEP>中医药疗法<SEP>消肿<SEP>化瘀': (1,
  1),
 '<CLS>1003例慢性乙型肝炎(ALT≥2×ULN)患者中医常见症状及证候分布特点研究<SEP>慢性乙型肝炎<SEP>证候<SEP>常见症状<SEP>分布特点': (1,
  1),
 '<CLS>黄芪甲苷治疗溃疡性结肠炎和肝脏损伤共病的分子机制网络药理学研究<SEP>黄芪甲苷<SEP>溃疡性结肠炎<SEP>肝脏损伤<SEP>网络药理学<SEP>作用机制': (1,
  1),
 '<CLS>川芎嗪注射液对梗阻性黄疸肾缺血再灌注损伤大鼠肾功能及血清TNF-α的影响<SEP>胆汁淤积<SEP>再灌注损伤<SEP>肾<SEP>肿瘤坏死因子<SEP>大鼠': (1,


In [9]:
input = next(train_iter)
model.eval()
output = model(input[0][0], recover_or_classify='classification')
predic = torch.max(output.data, 1)[1]
raw_content = {}
for j in range(input[0][0].size(0)):
    tmp = ""
    for i in range(input[0][0].size(1)):
        tmp+=id2vocab[input[0][0][j][i].item()]
    raw_content[tmp] = predic[j].item()
(raw_content)

{'<CLS>补肾法对支气管哮喘慢性持续期肾虚证患者CD4+CD25+Foxp3+Treg细胞及生活质量的影响<SEP>支气管哮喘<SEP>肾虚证<SEP>补肾法<SEP>喘可治<SEP>CD4+CD25+Foxp3+Treg细胞<SEP>生活质量': 1,
 '<CLS>益髓解毒法对血管性痴呆大鼠海马神经元Caspase-3、Caspase-9 mRNA及蛋白表达的影响<SEP>益髓解毒<SEP>血管性痴呆<SEP>Caspase-3<SEP>Caspase-9<SEP>细胞凋亡<SEP>海马<SEP>神经元': 1,
 '<CLS>糖肾平通过TGF-β1-Smad2/3-ILK信号通路干预高糖+LPS诱导足细胞上皮间质转分化的分子机制研究 <SEP>糖肾平<SEP>足细胞<SEP>TGF-β1-Smad2/3-ILK<SEP>信号转导通路<SEP>转分化': 0,
 '<CLS>痛泻要方对肝郁脾虚型UC大鼠结肠组织Claudin-1、ZO-1及血清COX-2、iNOS表达的影响<SEP>痛泻要方<SEP>溃疡性结肠炎<SEP>密封蛋白1<SEP>紧密连接蛋白1<SEP>环氧合酶-2<SEP>诱导型一氧化氮合酶': 1,
 '<CLS>桃红四物汤对糖尿病血管病变大鼠血清VEGF、sLOX-1、GLP-1水平的影响<SEP>桃红四物汤<SEP>糖尿病血管病变<SEP>大鼠<SEP>内皮生长因子<SEP>可溶性凝集素样氧化型低密度脂蛋白受体-1<SEP>胰高血糖素样肽1': 1,
 '<CLS>基于CiteSpace对Web of Science近10年老年领域Meta分析的计量学及可视化分析<SEP>老年<SEP>Meta分析<SEP>Web<SEP>of<SEP>Science<SEP>CiteSpace<SEP>研究现状<SEP>热点': 1,
 '<CLS>4-羟基芝麻素通过抑制p38MAPK/NLRP3信号通路减轻BV2小胶质细胞神经炎症损伤<SEP>4-羟基芝麻素<SEP>BV2小胶质细胞<SEP>神经炎症损伤<SEP>p38MAPK/NLRP3<SEP>缺血性脑卒中<SEP>脂多糖': 1,
 '<CLS>基于UPLC-Q-TOF-MS和UPLC-DAD的不同品种溪黄草主要化学成分分析<SEP>溪黄草<SEP>UPLC-Q-TOF-MS<SEP>UPLC

In [10]:
input = next(train_iter)
model.eval()
output = model(input[0][0], recover_or_classify='classification')
predic = torch.max(output.data, 1)[1]
raw_content = {}
for j in range(input[0][0].size(0)):
    tmp = ""
    for i in range(input[0][0].size(1)):
        tmp+=id2vocab[input[0][0][j][i].item()]
    raw_content[tmp] = predic[j].item()
zhongyi = []
xiyi = []
for content in (raw_content.keys()):
    if raw_content[content]==1:
        zhongyi.append(content)
    else:
        xiyi.append(content)
    # print(raw_content[content], ' ', content)
print("中医：")
for item in zhongyi:
    print(item)
print("非中医：")
for item in xiyi:
    print(item)

中医：
<CLS>中药白秓合剂对HaCaT细胞CXCR2蛋白表达及TNF-α诱导的HaCaT细胞IL-6表达的影响<SEP>白疕合剂<SEP>人永生化表皮角质形成细胞<SEP>肿瘤坏死因子-α<SEP>白细胞介素-6<SEP>CXC趋化因子受体2
<CLS>外源茉莉酸甲酯和赤霉素对茜草生长、相关酶活性及主要活性成分含量的影响研究<SEP>茜草<SEP>茉莉酸甲酯<SEP>赤霉素<SEP>抗氧化酶<SEP>渗透调节物质<SEP>茜草素<SEP>羟基茜草素<SEP>1<SEP>8-二羟基蒽醌<SEP>甲基异茜草素<SEP>大叶茜草素
<CLS>补肾活血汤含药血清干预体外培养大鼠骨髓间充质干细胞成软骨分化及补肾活血汤联合骨髓间充质干细胞治疗大鼠膝骨关节炎的实验研究<SEP>骨关节炎<SEP>骨髓<SEP>间质干细胞<SEP>软骨<SEP>关节<SEP>补肾活血汤<SEP>大鼠<SEP>动物实验
<CLS>针刀松解法对第三腰椎横突综合征模型大鼠脊髓背角POMC mRNA、PPE mRNA阳性细胞表达的影响 <SEP>针刀松解法<SEP>第三腰椎横突综合征<SEP>内源性阿片肽<SEP>脊髓背角<SEP>电针<SEP>前阿黑皮素<SEP>前脑非肽原
<CLS>参葵通脉颗粒对慢性心衰心肌重塑及TGF-β1、p-Smad3表达影响的实验研究<SEP>慢性心衰<SEP>参葵通脉颗粒<SEP>心肌<SEP>SD大鼠<SEP>实验研究<SEP>血流动力学<SEP>心肌转化生长因子-β1、p-Smad3蛋白表达
<CLS>益生菌Faecalibacterium prausnitzii联合溃克灵对大鼠结肠炎的预防作用<SEP>炎症性肠病<SEP>中药<SEP>溃克灵<SEP>Faecalibacterium<SEP>prausnitzii<SEP>细胞因子
<CLS>颈腰痛煎剂Ⅱ号联合牵引治疗腰椎间盘突出症随机平行对照研究<SEP>腰椎间盘突出症<SEP>腰痛<SEP>痹症<SEP>颈腰痛煎剂Ⅱ号<SEP>对抗持续牵引<SEP>直腿抬高试验<SEP>加强试验<SEP>股神经牵拉试验阳性<SEP>中医药治疗<SEP>随机平行对照研究
<CLS>益气养阴活血法对脓毒血症模型大鼠血清TNF-α、IL-6与IL-10的影响<SEP>脓毒症休克<SEP>脓毒血症模型<SEP>益气养阴活血法<SEP>

In [11]:
input = next(train_iter)
model.eval()
zhongyi = 0
sum = 0
for input in train_iter:
    output = model(input[0][0], recover_or_classify='classification')
    predic = torch.max(output.data, 1)[1]
    for j in range(input[0][0].size(0)):
        if predic[j].item()==1:
            zhongyi+=1
        sum+=1
test_iter = build_iterator(test_dataset, config)
for input in train_iter:
    output = model(input[0][0], recover_or_classify='classification')
    predic = torch.max(output.data, 1)[1]
    for j in range(input[0][0].size(0)):
        if predic[j].item()==1:
            zhongyi+=1
        sum+=1
    # raw_content = {}
    # for j in range(input[0][0].size(0)):
    #     tmp = ""
    #     for i in range(input[0][0].size(1)):
    #         tmp+=id2vocab[input[0][0][j][i].item()]
    #     raw_content[tmp] = predic[j].item()
    # zhongyi = []
    # xiyi = []
    # for content in (raw_content.keys()):
    #     if raw_content[content]==1:
    #         zhongyi.append(content)
    #     else:
    #         xiyi.append(content)
    #     # print(raw_content[content], ' ', content)
    # print("中医：")
    # for item in zhongyi:
    #     print(item)
    # print("非中医：")
    # for item in xiyi:
    #     print(item)

In [12]:
zhongyi, sum, zhongyi/sum

(552904, 773233, 0.7150548411668928)