In [2]:
import os
import re

# SOURCE_FILE = "中国现代名中医医案精华（一）.txt"
SOURCE_FILE = '中国现代名中医医案精华.txt'

def preprocess(lines):
    '''
    @description: 预处理，删除无用行
    @param {list} lines: 每一行的内容
    @return {string}: 预处理后的内容
    '''

    # 移除仅包含数字的一行
    final_line = []
    for line in lines:
        line = line.strip()
        line += '\n'
        reg = re.compile(r'^\d+\n')
        province_reg = re.compile(r'.*(浙江|天津|山东|山西|江西|江苏|广东|贵州|福建|安徽).*')
        if re.match(reg, line):
           continue
        if re.match(province_reg, line):
            continue
        if line == '\n':
            continue
        final_line.append(line) 
    
    result = "".join(final_line)
    result = result.replace(" ", "") # 删除空格
    result = result.replace("　", "") # 删除全角空格
    result = result.replace("\t", "") # 删除制表符
    
    return result


def extract_from_author(content, author_name, ROOT_DIR, TARGET_DIR):
    '''
    @param content : 文本内容
    @param author_name : 作者名
    @param ROOT_DIR : 根目录
    @param TARGET_DIR : 目标目录
    @return {int} : 医案数量
    '''
    # 按照`一例` `两例`划分医案大类
    # 按照`例一` `例二`划分医案小类
    count = 0
    start_list = []
    record_list = []
    reg = re.compile(r"(?:.*)[一二三四五六七八九]例\n")
    for m in re.finditer(reg, content):
        if m.start() > 0:
            start_list.append(m.start())
    for m in range(len(start_list)):
        if m == len(start_list) - 1:
            record_list.append(content[start_list[m]:])
        else:
            record_list.append(content[start_list[m]:start_list[m+1]])

    count = 0
    for i in range(len(record_list)):
        first_line = record_list[i].split("\n")[0]
        # 删除`一例` `两例`，以获取疾病名称
        end_reg = re.compile(r"(?:,*)([一二三四五六七八九十]|十[一二三四五六七八九])例\n")

        # 获取疾病名称
        disease_name = re.sub(end_reg, "", first_line)
        if disease_name.find('治愈') != -1:
            disease_name = disease_name[disease_name.find('治愈')+2:]
        elif disease_name.find('治疗') != -1:
            disease_name = disease_name[disease_name.find('治疗')+2:]
        elif disease_name.find('治') != -1:
            disease_name = disease_name[disease_name.find('治')+1:]

        if disease_name.find('例') != -1:
            disease_name = disease_name[:disease_name.find('例')-1]

        if record_list[i].find('一例') != -1:
            with open(os.path.join(ROOT_DIR, TARGET_DIR, author_name + '-' + disease_name + ".txt"), "w", encoding="utf-8") as f:
                f.write(record_list[i])
                count += 1
            continue

        # 按照`例一` `例二`划分医案小类
        start_list_record = []
        record_list_record = []
        reg_record = re.compile(r"\n例[一二三四五六七八九].*\n")
        for m in re.finditer(reg_record, record_list[i]):
            if m.start() > 0:
                start_list_record.append(m.start())
        for m in range(len(start_list_record)):
            if m == len(start_list_record) - 1:
                record_list_record.append(record_list[i][start_list_record[m]:])
            else:
                record_list_record.append(record_list[i][start_list_record[m]:start_list_record[m+1]])
        
        for j in range(len(record_list_record)):
            file_name = author_name + '-' + disease_name
            if len(record_list_record) > 1:
                file_name += "-" + str(j+1) + ".txt"

            with open(os.path.join(ROOT_DIR, TARGET_DIR, file_name), "w", encoding="utf-8") as f:
                f.write(record_list_record[j])
                count += 1
    return count


def extract(content):
    '''
    @description: 抽取医案
    @param {string} content: 预处理后的内容
    @return {None}:
    '''
    ROOT_DIR = 'Record Collections MZY'
    TARGET_DIR = 'Raw Medical Record'
    if not os.path.exists(ROOT_DIR):
        os.mkdir(ROOT_DIR)
    if not os.path.exists(os.path.join(ROOT_DIR, TARGET_DIR)):
        os.mkdir(os.path.join(ROOT_DIR, TARGET_DIR))

    # 清空目标文件夹
    for file in os.listdir(os.path.join(ROOT_DIR, TARGET_DIR)):
        os.remove(os.path.join(ROOT_DIR, TARGET_DIR, file))

    author_reg = re.compile(r"\n.*医案\n")
    author_start_list = []
    author_record_list = []
    for m in re.finditer(author_reg, content):
        if m.start() > 0:
            author_start_list.append(m.start())
    for m in range(len(author_start_list)):
        if m == len(author_start_list) - 1:
            author_record_list.append(content[author_start_list[m]:])
        else:
            author_record_list.append(content[author_start_list[m]:author_start_list[m+1]])
    
    AUTHOR_DIR = 'Author Record'
    if not os.path.exists(os.path.join(ROOT_DIR, AUTHOR_DIR)):
        os.mkdir(os.path.join(ROOT_DIR, AUTHOR_DIR))
    for file in os.listdir(os.path.join(ROOT_DIR, AUTHOR_DIR)):
        os.remove(os.path.join(ROOT_DIR, AUTHOR_DIR, file))
    count = 0
    for i in range(len(author_record_list)):
        author_name = author_record_list[i].split("\n")[1]
        author_name = author_name.replace("医案", "")
        with open(os.path.join(ROOT_DIR, AUTHOR_DIR, author_name + ".txt"), "w", encoding="utf-8") as f:
            f.write(author_record_list[i])

        count += extract_from_author(author_record_list[i], author_name, ROOT_DIR, TARGET_DIR)

    print('医案抽取完成，共抽取{}个医案， 存放在{}目录下'.format(count, os.path.join(ROOT_DIR, TARGET_DIR)))


with open(SOURCE_FILE, "r", encoding="utf-8") as f:
    lines = f.readlines()
    content = preprocess(lines) # 预处理
    extract(content) # 抽取医案

医案抽取完成，共抽取559个医案， 存放在Record Collections MZY\Raw Medical Record目录下


In [3]:
# 生成带有标签的，用于导入的医案

def tag_generate(type, content, import_content):
    ''' 
    @param {string} type: 标签类型
    @param {string} content: 医案内容
    @param {string} import_content: 导入内容
    @return {string} import_content: 导入内容
    '''
    reg = re.compile(r"(辩证|辨证)[：:.。].*\n")
    if type == 'cure':
        reg = re.compile(r"(治法|治则)[：:.].*\n")
    for m in re.finditer(reg, content):
        dia_content = m.group()
        dia_content = dia_content[3:]
        dia_content = dia_content.strip()
        dia_content = dia_content.replace("。", "")
        dia_list = dia_content.split(",")
        for dia in dia_list:
            tmp_list = dia.split('，')
            for tmp in tmp_list:
                import_content += type + '-' + tmp + '\n'
    return import_content


def create_import_record():
    ROOT_DIR = 'Record Collections MZY'
    TARGET_DIR = 'Import Medical Record'
    if not os.path.exists(ROOT_DIR):
        os.mkdir(ROOT_DIR)
    if not os.path.exists(os.path.join(ROOT_DIR, TARGET_DIR)):
        os.mkdir(os.path.join(ROOT_DIR, TARGET_DIR))

    # 清空目标文件夹
    for file in os.listdir(os.path.join(ROOT_DIR, TARGET_DIR)):
        os.remove(os.path.join(ROOT_DIR, TARGET_DIR, file))

    for file in os.listdir(os.path.join(ROOT_DIR, 'Raw Medical Record')):
        with open(os.path.join(ROOT_DIR, 'Raw Medical Record', file), "r", encoding="utf-8") as f:
            origin_content = f.readlines()
            origin_content = ''.join(origin_content)

            import_content = ''
            import_content += '#book\n'
            import_content += '中国现代名中医医案精华\n'
            import_content += '#start\n'

            # 抽取'辩证'标签
            import_content = tag_generate('dia', origin_content, import_content)
            import_content = tag_generate('cure', origin_content, import_content)
            import_content += '#end\n'
            import_content += origin_content

            with open(os.path.join(ROOT_DIR, TARGET_DIR, file), "w", encoding="utf-8") as f:
                f.write(import_content)

create_import_record()

In [3]:
import pandas as pd
import os

# 生成csv文件
# 第一列为医案内容
# 后面的所有列为标签
# 一行为一个医案
# 一列为一个标签
def generate_csv(root_directory, source_dir, csv_name):
    # 使用pd.DataFrame存储数据
    data = pd.DataFrame(columns=['text', 'label'])

    # 读取文件夹下的所有文件
    files = os.listdir(os.path.join(root_directory, source_dir))
    for file in files:
        if os.path.isdir(os.path.join(root_directory, source_dir, file)):
            sub_files = os.listdir(os.path.join(root_directory, source_dir, file))
            for sub_file in sub_files:
                # 打开文件
                fp = open(os.path.join(root_directory, source_dir, file, sub_file), mode='r', encoding='utf-8')
                content = fp.read()
                fp.close()

                # 读取文件中的标签, 从#start到#end之间的内容
                start = content.find('#start')+6
                end = content.find('#end')
                tags = content[start:end].split(' ')
                text = content[end+4:]
                
                # 将数据存储到data中
                data = pd.concat([data, pd.DataFrame({'text': [text], 'label': tags})], ignore_index=True)
        else:
            # 忽略.DS_Store文件
            if file == '.DS_Store':
                continue
            # 打开文件
            fp = open(os.path.join(root_directory, source_dir, file), mode='r', encoding='utf-8')
            content = fp.read()
            fp.close()

            # 读取文件中的标签, 从#start到#end之间的内容
            start = content.find('#start')+7 # 忽略第一个换行符
            end = content.find('#end')
            tags = content[start:end]
            text = content[end+4:]

            # 将数据存储到data中
            data = pd.concat([data, pd.DataFrame({'text': [text], 'label': tags})], ignore_index=True)
        
    # 保存为csv文件
    data.to_csv(os.path.join(root_directory, csv_name), index=False)
    print('已成功生成csv文件，路径为{}'.format(os.path.join(root_directory, csv_name)))

generate_csv('Record Collections MZY', 'Refined Medical Record', 'medical_record_mzy.csv')
            

已成功生成csv文件，路径为Record Collections MZY\medical_record_mzy.csv


In [12]:
# 添加疾病标签
import os

def create_disease_tag():
    ROOT_DIR = "Record Collections MZY"
    SOURCE_FILE = "Refined Medical Record"
    path = os.path.join(ROOT_DIR, SOURCE_FILE)

    count = 0
    for file in os.listdir(path):
        with open(os.path.join(path, file), "r", encoding="utf-8") as f:
            if file == '.DS_Store':
                continue
            
            disease_tag = file.split('-')[1]
            disease_tag = disease_tag.split('.')[0]
            # 判断是否已经添加了疾病标签
            content = f.read()
            if content.find('disease') != -1:
                continue
            # 添加疾病标签
            content = content.replace('#start\n', '#start\n#disease-' + disease_tag + '\n')
            with open(os.path.join(path, file), "w", encoding="utf-8") as f:
                f.write(content)
            count += 1
    print('已成功添加{}个疾病标签'.format(count))

create_disease_tag()

已成功添加549个疾病标签


In [3]:
import os
def remove_ZWNBSP(SOURCE_FILE):
    ROOT_DIR = "Record Collections MZY"
    
    path = os.path.join(ROOT_DIR, SOURCE_FILE)

    for file in os.listdir(path):
        with open(os.path.join(path,file), 'r', encoding='utf-8') as f:
            if file == '.DS_Store':
                continue
            content = f.read()
            content = content.replace('\uFEFF', '')
            # 删除该文件
            os.remove(os.path.join(path,file))

            file = file.replace('\uFEFF', '')
            with open(os.path.join(path,file), 'w', encoding='utf-8') as f:
                f.write(content)
            


SOURCE_FILE = "Refined Medical Record"
remove_ZWNBSP(SOURCE_FILE)
SOURCE_FILE = "Import Medical Record"
remove_ZWNBSP(SOURCE_FILE)

PermissionError: [WinError 32] 另一个程序正在使用此文件，进程无法访问。: 'Record Collections MZY\\Refined Medical Record\\万友生-失眠-1.txt'