In [1]:
import pandas as pd 
import jieba 
import os
import time 
import math

In [2]:
class Medical_Research:
    """Keyword / boolean search over medical report texts with BM25 ranking.

    On construction the corpus CSV is loaded, the jieba user dictionaries are
    registered, and an inverted index (token -> set of document ids) is built.

    Queries are either a single phrase, or a boolean expression that combines
    phrases with the space-separated operators ``and``, ``or`` and ``not``
    (case-insensitive) plus parentheses, e.g. ``(A or B) and C``.  ``not`` is
    a binary set difference and is written ``A not B``.
    """

    # Corpus location / encoding: a two-column CSV (症状 = symptom, 诊断 = diagnosis).
    DATA_PATH = 'data/medical_text_data.csv'
    DATA_ENCODING = 'gbk'
    # User dictionaries registered with jieba before the index is built.
    USER_DICT_PATHS = (
        'dictionary/dict_medical.txt',
        'dictionary/add_dict.txt',
        # medical lexicons
        'dictionary/chinese_medical_words-master/emr.txt',
        'dictionary/chinese_medical_words-master/medicine.txt',
        'dictionary/chinese_medical_words-master/properties.txt',
    )
    # Query operator word -> Python set operator used in the compiled expression.
    _OPERATORS = {'and': '&', 'or': '|', 'not': '-'}
    # Characters that terminate a token while scanning a boolean query.
    _DELIMITERS = ('(', ')', ' ')

    def __init__(self, scale=1):
        """Load the corpus, register dictionaries and build the index.

        Args:
            scale: the document list is repeated this many times
                (``self.docs *= scale``) before indexing.
        """
        self.docs = []
        self.load_data()
        self.docs *= scale
        self.cache = {}   # inverted index: token -> set of doc ids
        self.vocab = set()
        self.df = {}      # document frequency: token -> number of docs containing it
        self.avgdl = 0    # average document length in characters
        self.sum_N = len(self.docs)  # total number of documents
        self.dictionary_prepare()
        self.build_cache()

    def load_data(self):
        """Read the corpus CSV into ``self.docs`` and print the elapsed time.

        Both columns are stacked into one list (all symptom texts first, then
        all diagnosis texts) and spaces are removed from every text.
        """
        time_start = time.perf_counter()

        # dtype=str keeps every cell a string, so a purely numeric column
        # cannot break the ``.replace`` call below.
        frame = pd.read_csv(self.DATA_PATH, encoding=self.DATA_ENCODING,
                            names=['症状', '诊断'], dtype=str)
        frame = frame.fillna('')  # missing cells become empty documents
        texts = pd.concat([frame['症状'], frame['诊断']], axis=0).reset_index(drop=True)
        self.docs = [text.replace(' ', '') for text in texts.tolist()]

        time_end = time.perf_counter()
        print('【load_data】time cost:',time_end-time_start,'s')

    def dictionary_prepare(self):
        """Register every path in ``USER_DICT_PATHS`` with jieba; print the elapsed time."""
        time_start = time.perf_counter()

        for dict_path in self.USER_DICT_PATHS:
            jieba.load_userdict(dict_path)

        time_end = time.perf_counter()
        print('【dictionary_prepare】time cost:',time_end-time_start,'s')

    def cut(self, phrase, cut_all=False):
        """Tokenize ``phrase`` with jieba; ``cut_all`` selects jieba's full mode."""
        return jieba.cut(phrase, cut_all=bool(cut_all))

    def build_cache(self):
        """(Re)build the inverted index, document frequencies and ``avgdl``.

        The index state is reset first, so calling this method again after
        ``self.docs`` changed does not double-count document frequencies.
        Documents are tokenized with ``cut_all=True``.
        """
        time_start = time.perf_counter()

        self.cache = {}
        self.df = {}
        self.vocab = set()
        self.sum_N = len(self.docs)

        total_length = 0
        for doc_id, doc in enumerate(self.docs):
            total_length += len(doc)
            # set(): a token counts once per document.
            for word in set(self.cut(doc, cut_all=True)):
                postings = self.cache.get(word)
                if postings is None:
                    self.cache[word] = {doc_id}
                    self.df[word] = 1
                    self.vocab.add(word)
                else:
                    postings.add(doc_id)
                    self.df[word] += 1
        # An empty corpus has no meaningful average length; avoid dividing by zero.
        self.avgdl = total_length / self.sum_N if self.sum_N else 0

        time_end = time.perf_counter()
        print('【build_cache】time cost:',time_end-time_start,'s')

    def _query_keywords(self, query_lst):
        """Tokenize every phrase of ``query_lst`` into one flat keyword list."""
        return [keyword for phrase in query_lst for keyword in self.cut(phrase)]

    def _bm25(self, doc, keywords, k1=2, b=0.75):
        """BM25 score of ``doc`` for already-tokenized ``keywords``."""
        doc_length = len(doc)
        length_ratio = doc_length / self.avgdl if self.avgdl else 0
        total = 0
        for keyword in keywords:
            doc_freq = self.df.get(keyword)
            # Query tokens are cut in jieba's default mode while the index is
            # built with cut_all=True, so a token can occur in the text as a
            # substring without ever having been indexed; it has no document
            # frequency and is skipped instead of raising KeyError.
            if not doc_freq or keyword not in doc:
                continue
            f = doc.count(keyword)  # substring occurrences in the raw text
            tf = f*(k1+1)/(f+k1*(1-b+b*length_ratio))
            # Negative when the token appears in more than half of the documents.
            idf = math.log10((self.sum_N-doc_freq+0.5)/(doc_freq+0.5))
            total += tf * idf
        return total

    def score(self, doc, query_lst, k1=2, b=0.75):  # BM25
        """Return the BM25 score of ``doc`` for the phrases in ``query_lst``.

        Args:
            doc: document text.
            query_lst: list of query phrases; each one is tokenized with ``cut``.
            k1, b: BM25 term-saturation and length-normalization parameters.
        """
        return self._bm25(doc, self._query_keywords(query_lst), k1, b)

    def rank(self, query_lst, result_set):
        """Score every doc id in ``result_set`` and sort by descending score.

        Returns:
            list of ``[doc_id, score]`` pairs; ties are ordered by ascending
            doc id so the result is deterministic.
        """
        keywords = self._query_keywords(query_lst)  # tokenize once, not per document
        scored = [[doc_id, self._bm25(self.docs[doc_id], keywords)]
                  for doc_id in result_set]
        scored.sort(key=lambda item: (-item[1], item[0]))
        return scored

    def get_phrase_match(self, phrase):
        """Return the ids of documents whose index contains every token of ``phrase``.

        Whitespace-only tokens are ignored.  If any remaining token is not
        indexed the result is empty.  The returned set is always a fresh
        object, so callers cannot mutate the index through it.
        """
        matched = None
        for keyword in self.cut(phrase):
            if not keyword.strip():
                continue
            postings = self.cache.get(keyword)
            if postings is None:
                return set()
            matched = set(postings) if matched is None else matched & postings
        return matched if matched is not None else set()

    def conv_query(self, query):
        """Compile a boolean query into a Python set expression.

        Tokens are delimited by ``(``, ``)`` and the space character.
        ``and`` / ``or`` / ``not`` (any case) become ``&`` / ``|`` / ``-``;
        every other token becomes a ``self.get_phrase_match(<literal>)`` call.

        Returns:
            ``(expression, keywords)``: the expression string and the list of
            phrase tokens in query order.
        """
        parts = []
        keywords = []
        token = ''
        for char in query + ' ':  # trailing space flushes the last token
            if char not in self._DELIMITERS:
                token += char
                continue
            if token:
                operator = self._OPERATORS.get(token.lower())
                if operator is not None:
                    parts.append(operator)
                else:
                    # repr() yields a correctly escaped string literal, so a
                    # quote or backslash inside the phrase cannot break out of
                    # the literal when the expression is evaluated.
                    parts.append("self.get_phrase_match({!r})".format(token))
                    keywords.append(token)
                token = ''
            parts.append(char)
        return "".join(parts), keywords

    def _is_boolean_query(self, query):
        """True if ``query`` contains ``and``/``or``/``not`` as a standalone token."""
        words = query.replace('(', ' ').replace(')', ' ').split(' ')
        return any(word.lower() in self._OPERATORS for word in words)

    def search(self, query):
        """Run ``query`` and return ``[doc_id, score]`` pairs, best first.

        A query containing a standalone ``and``/``or``/``not`` token is treated
        as a boolean expression, e.g. ``'(华为 or 苹果) and 手机'``; anything
        else is matched as a single phrase.  A malformed boolean expression
        surfaces as the ``SyntaxError``/``TypeError`` raised while evaluating it.
        """
        if self._is_boolean_query(query):
            expression, query_keywords = self.conv_query(query)
            # The expression holds only set operators, parentheses and
            # get_phrase_match calls on string literals (see conv_query);
            # builtins are removed as an extra safeguard.
            result_set = eval(expression, {'__builtins__': {}}, {'self': self})
        else:
            result_set = self.get_phrase_match(query)
            query_keywords = [query]

        return self.rank(query_keywords, result_set)

    def _highlight_terms(self, query):
        """Distinct query tokens to highlight, in first-seen order.

        Parentheses and whitespace are dropped; for boolean queries the
        operator words are dropped as well.
        """
        is_boolean = self._is_boolean_query(query)
        terms = []
        for token in self.cut(query):
            if not token.strip() or token in ('(', ')'):
                continue
            if is_boolean and token.lower() in self._OPERATORS:
                continue
            if token not in terms:
                terms.append(token)
        return terms

    def rener_research_result(self, query, result_print=False):
        """Run ``query``; print the hit count and search time, optionally the hits.

        Args:
            query: phrase or boolean query string.
            result_print: when true, print every hit with its id, score and
                text, wrapping matched query tokens in 【】.

        Returns:
            None; everything is reported through ``print``.
        """
        time_start = time.perf_counter()
        results = self.search(query)
        print('The length of results is ',len(results))
        time_end = time.perf_counter()
        print('【search】time cost:',time_end-time_start,'s\n\n')

        if not result_print:
            return

        terms = self._highlight_terms(query)  # computed once for all hits
        for doc_id, doc_score in results:
            sentence = self.docs[doc_id]
            for term in terms:
                sentence = sentence.replace(term, '【{}】'.format(term)).strip()
            print("doc_id:{}, score:{},\ndoc:{}\n".format(doc_id, doc_score, sentence))

In [3]:
# Construct the search engine under IPython's %prun profiler; the constructor
# loads the corpus, registers the jieba dictionaries and builds the index.
%prun mr = Medical_Research()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\DELL\AppData\Local\Temp\jieba.cache


【load_data】time cost: 0.34012722969055176 s


Loading model cost 0.709 seconds.
Prefix dict has been built successfully.


【dictionary_prepare】time cost: 4.425678968429565 s
【build_cache】time cost: 20.392681121826172 s
 

         32397608 function calls (32397442 primitive calls) in 25.164 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   584117    7.956    0.000    8.894    0.000 __init__.py:180(get_DAG)
        1    3.501    3.501   20.393   20.393 2666504946.py:44(build_cache)
  3406411    3.202    0.000   12.503    0.000 __init__.py:198(__cut_all)
  3917886    2.638    0.000   18.819    0.000 __init__.py:289(cut)
  2668857    1.249    0.000    1.249    0.000 {method 'match' of 're.Pattern' objects}
  7307980    0.753    0.000    0.753    0.000 {method 'append' of 'list' objects}
        1    0.707    0.707    0.707    0.707 {built-in method marshal.load}
  2767189    0.624    0.000    0.624    0.000 {method 'add' of 'set' objects}
   639361    0.560    0.000    0.560    0.000 {method 'split' of 're.Pattern' objects}
4982893/4982869    0.478    0.000    0.478    0.000 {built-in method builtins.len}
   816230    0.469    0.000    0.645  

rener_research_result函数为查询函数

将展示根据关键词/布尔查询语句检索到的条目数、检索时间等信息，可选择是否打印检索结果

query: 查询语句，为字符串类型。可通过 and、or、not 及括号组合布尔查询条件，运算符与关键词之间需用空格分隔，例如 `(左肺 or 右肺) and 间质性`；其中 not 表示差集，写作 `A not B`。

result_print: 是否打印结果

In [4]:
# Boolean query, printing every hit.
# NOTE: rener_research_result has no return statement, so `a` is always None.
q = '(左肺 or 右肺) and 间质性'
a = mr.rener_research_result(q,result_print=True)

The length of results is  161
【search】time cost: 0.012965679168701172 s


doc_id:43140, score:4.180014577067371,
doc:双肺【间质性】渗出改变，注意【间质性】肺炎。
【右肺】中、下叶钙化灶。
双侧胸膜局限增厚。

doc_id:50439, score:4.095018217420474,
doc:双肺【间质性】改变，注意【间质性】肺炎。
【右肺】中叶少许慢性炎症。
左肾占位，建议进一步检查。

doc_id:47773, score:3.87805908934584,
doc:双肺【间质性】改变并【间质性】炎症。
双肺散在少许气肿。
【右肺】可见散在多发小结节，建议随诊复查。
右侧胸膜增厚钙化。

doc_id:26553, score:3.734103236864826,
doc:【右肺】上叶占位，性质待定，建议进一步增强检查。
双肺胸膜下【间质性】病变，注意【间质性】肺炎。
双肺散在小叶中心型及间隔旁型肺气肿。

doc_id:28482, score:3.550150452149468,
doc:【右肺】下叶炎症。
双肺多叶、段【间质性】病变。

doc_id:40433, score:3.550150452149468,
doc:双肺散在【间质性】炎症。
【右肺】上叶泡性气肿。

doc_id:30542, score:3.460990280761105,
doc:双肺多发【间质性】炎症，请结合临床。
【左肺】下叶外侧底段小结节，随诊复查。

doc_id:40464, score:3.4262197580313494,
doc:双肺多发【间质性】炎症，以【右肺】为著。建议抗炎后复查。

doc_id:49570, score:3.4063078212327875,
doc:双肺多发【间质性】病变，【左肺】上叶为著。
左侧胸腔积液。
纵隔间隙淋巴结增大。

doc_id:34156, score:3.3388050622734324,
doc:双肺多发炎症、部分实变。
【右肺】中叶内侧段【间质性】肺气肿。

doc_id:42571, score:3.327033548831792,
doc:【右肺】胸膜下斑片影，【间质性】改变？
【左肺】下叶

In [5]:
# Single-keyword search; hits are not printed, only the hit count and search time.
q = '条状钙化影'
mr.rener_research_result(q,result_print=False)

The length of results is  6
【search】time cost: 0.0009911060333251953 s


