In [10]:
import os
import pandas as pd
import nltk
import ahocorasick
import re

# Load the disease dictionary
def load_disease_dict_from_csv(disease_dict_path):
    """Load disease names from the ``disease_name`` column of a CSV file.

    Each name is stripped, lower-cased and wrapped in one leading and one
    trailing space, so that a later substring search only hits whole,
    space-delimited terms.

    Args:
        disease_dict_path: Path of a CSV file that has a ``disease_name`` column.

    Returns:
        list[str]: Space-padded, lower-cased names in first-seen order, with
        missing values, blank names and duplicates removed.

    Raises:
        KeyError: If the CSV has no ``disease_name`` column.
    """
    df = pd.read_csv(disease_dict_path)
    # astype(str) keeps the string methods working even when pandas parsed the
    # column as numbers.
    names = df['disease_name'].dropna().astype(str).str.strip().str.lower()
    # A blank name would become the entry "  ", which matches any double space.
    names = names[names != '']
    # Deduplicate after normalization so "Gastritis" and "gastritis " collapse;
    # dict.fromkeys preserves first-seen order.
    return [f' {name} ' for name in dict.fromkeys(names)]

# Load the microbe dictionary from a plain-text file
def load_microbe_dict(file_path):
    """Load microbe names from a text file, one entry per line.

    Each line is stripped, lower-cased and wrapped in one leading and one
    trailing space, so that a later substring search only hits whole,
    space-delimited terms.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        list[str]: Space-padded, lower-cased names in file order, with blank
        lines and duplicates removed.
    """
    microbes = {}
    # "utf-8-sig" reads plain UTF-8 as well, but also drops a leading BOM that
    # would otherwise become part of the first entry.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            name = line.strip().lower()
            if not name:
                # A blank line would become the entry "  ", which matches any
                # double space in the searched text.
                continue
            # Dict keys keep first-seen order and drop repeats.
            microbes[f' {name} '] = None
    return list(microbes)

# Build the Aho-Corasick automaton
def build_automaton(entity_list):
    """Create a finalized ``ahocorasick.Automaton`` holding every entity.

    Args:
        entity_list: Iterable of entity strings, used exactly as given as the
            search keys.

    Returns:
        The automaton after ``make_automaton()`` has been called. The value
        stored for each key is ``(position, entity)``, where ``position`` is
        the entity's rank when the entities are ordered longest first.
    """
    longest_first = list(entity_list)
    longest_first.sort(key=len, reverse=True)  # longest entities first

    automaton = ahocorasick.Automaton()
    for position, term in enumerate(longest_first):
        automaton.add_word(term, (position, term))
    automaton.make_automaton()
    return automaton

# Replace punctuation with single spaces
def replace_punctuation_with_space(text):
    """Return ``text`` with each punctuation character swapped for one space.

    Any character that is neither a word character nor whitespace counts as
    punctuation. The substitution is one-for-one, so the result has the same
    length as the input.
    """
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub(' ', text)

# Clean an entity: drop punctuation
def clean_entity(entity):
    """Return ``entity`` without punctuation and without surrounding whitespace.

    Any character that is neither a word character nor whitespace is deleted
    (not replaced), then the result is stripped.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', entity)
    return without_punctuation.strip()

# Entity recognition: match on a normalized copy, report text from the original sentence
def extract_entities_with_aho_corasick(automaton, sentence):
    """Find dictionary entities in ``sentence`` and return them as written there.

    The search runs on a lower-cased copy of the sentence in which every
    non-word character (punctuation and any whitespace) is replaced by one
    space, and which is padded with one space on each side. The padding lets
    space-delimited keys such as ``' gastritis '`` match at the very start or
    end of the sentence; treating newlines and tabs as spaces lets them match
    next to a line break.

    Args:
        automaton: Object whose ``iter(text)`` yields
            ``(end_index, (insert_order, entity))`` for every key found.
        sentence: The original sentence.

    Returns:
        list[str]: Unique matched spans taken from the original sentence, with
        punctuation removed and whitespace stripped, ordered longest first and
        alphabetically within the same length.
    """
    lowered = sentence.lower()
    if len(lowered) != len(sentence):
        # Some characters (e.g. 'İ') lower-case to more than one code point,
        # which would shift every later index. Keep those characters as they
        # are so positions still line up with the original sentence.
        lowered = ''.join(
            char.lower() if len(char.lower()) == 1 else char for char in sentence
        )

    # One-for-one replacement, so indices stay aligned with `sentence`.
    normalized = re.sub(r'[^\w]', ' ', lowered)
    padded = f' {normalized} '

    found = set()
    for end_index, (_insert_order, entity) in automaton.iter(padded):
        start_index = end_index - len(entity) + 1
        # Positions in `padded` are one ahead of positions in `sentence`
        # because of the leading pad; clamp a match that begins on the pad.
        original_span = sentence[max(start_index - 1, 0):end_index]
        # Drop punctuation and the boundary characters around the match.
        cleaned = re.sub(r'[^\w\s]', '', original_span).strip()
        if cleaned:
            found.add(cleaned)

    # Longest first; the text itself breaks ties so the order is reproducible.
    return sorted(found, key=lambda text: (-len(text), text))

# Process a single file
def process_file(file_path, disease_automaton, microbe_automaton):
    """Collect disease/microbe pairs that occur in the same sentence of a file.

    The file is read as UTF-8, split into sentences with
    ``nltk.sent_tokenize``, and both automatons are run over every sentence.
    A sentence contributes one record per (disease, microbe) combination when
    it contains at least one of each.

    Args:
        file_path: Path of the text file to scan.
        disease_automaton: Automaton used to find diseases.
        microbe_automaton: Automaton used to find microbes.

    Returns:
        list[dict]: Records with the keys ``DISEASE``, ``MICROBE``,
        ``EVIDENCE`` (the stripped original sentence) and ``QUESTIONS``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    all_results = []
    for sentence in nltk.sent_tokenize(text):  # sentence splitting
        diseases = extract_entities_with_aho_corasick(disease_automaton, sentence)
        microbes = extract_entities_with_aho_corasick(microbe_automaton, sentence)
        if not (diseases and microbes):
            continue
        all_results.extend(
            {
                'DISEASE': disease,
                'MICROBE': microbe,
                'EVIDENCE': sentence.strip(),  # the original sentence
                'QUESTIONS': f"What is the relation between {disease} and {microbe}?"
            }
            for disease in diseases
            for microbe in microbes
        )
    return all_results

In [11]:
# Project base directory. Set the TEXT_MINING_DIR environment variable to run
# the notebook on another machine; the default keeps the original locations.
base_directory = os.environ.get("TEXT_MINING_DIR", "/home/GSX/text_mining")

# Input and output locations
# Directory of preprocessed text files; the empty last component keeps the
# trailing path separator.
input_directory = os.path.join(base_directory, "pub-abstracts", "")
# CSV file that receives the results
output_csv_path = os.path.join(input_directory, "D-M-SSC.csv")

# Dictionary files
microbe_dict_path = os.path.join(base_directory, "microbe_names.txt")  # microbe name dictionary
disease_dict_path = os.path.join(base_directory, "disease_dict.csv")  # disease name dictionary

# Load the microbe dictionary
microbe_dict = load_microbe_dict(microbe_dict_path)
# Load the disease dictionary
disease_dict = load_disease_dict_from_csv(disease_dict_path)
print(f"疾病词典大小: {len(disease_dict)}")
print(f"微生物词典大小: {len(microbe_dict)}")

疾病词典大小: 2287
微生物词典大小: 8304


In [12]:
# Build one Aho-Corasick automaton per dictionary: diseases first, then microbes
disease_automaton, microbe_automaton = map(
    build_automaton, (disease_dict, microbe_dict)
)

In [13]:
# Recognition results collected from every processed file
all_results = []

target_files = ["all-microbe-abstracts.txt"]
result_columns = ['DISEASE', 'MICROBE', 'EVIDENCE', 'QUESTIONS']

# Walk the input directory in sorted order so the row order is reproducible
for filename in sorted(os.listdir(input_directory)):
    if filename in target_files:
        file_path = os.path.join(input_directory, filename)

        # Process the file
        file_results = process_file(file_path, disease_automaton, microbe_automaton)
        all_results.extend(file_results)

# Naming the columns up front keeps them present even when nothing was found;
# otherwise the frame has no columns and any column access raises KeyError.
df = pd.DataFrame(all_results, columns=result_columns)

if df.empty:
    print("未识别到任何疾病和微生物实体对。")
else:
    # Trim surrounding whitespace; the original casing is kept
    for column in result_columns:
        df[column] = df[column].str.strip()

    # Case-insensitive deduplication: compare on lower-cased entity names but
    # keep the first row of each group exactly as it was found.
    dedup_key = df.assign(
        DISEASE=df['DISEASE'].str.lower(),
        MICROBE=df['MICROBE'].str.lower(),
    )
    df = df.loc[~dedup_key.duplicated(subset=['DISEASE', 'MICROBE', 'EVIDENCE'])]

    # Save the results to the CSV file
    df.to_csv(output_csv_path, index=False, columns=result_columns)
    print(f"识别结果已保存到: {output_csv_path}")

识别结果已保存到: /home/GSX/text_mining/pub-abstracts/D-M-SSC.csv
