In [23]:
import os
import re

# Not referenced by the code visible in this cell: extract() defines its own
# local output directory names.
TARGET_DIR = "MHP Record Collections"
# Path of the raw text file that is opened and split into records below.
SOURCE_FILE = "茅汉平-名老中医临证验案集.txt"

def preprocess(lines):
    '''
    @description: Clean the raw text by stripping spaces, bullet-decorated
                  lines, bullet/circle symbols and a few unwanted lines.
    @param {list} lines: the lines of the source file (line endings included)
    @return {string}: the cleaned text as a single string
    '''
    # Work on one big string; every ASCII space is dropped up front.
    text = "".join(lines).replace(" ", "")

    # Bullet-decorated fragments, removed together with their newline:
    #   1. a bullet ... bullet run ending the line,
    #   2. a bullet immediately followed by digits, through end of line,
    #   3. digits followed by a bullet ending the line.
    for bullet_pattern in (r"•.*•\n", r"•\d+.*\n", r"\d+•\n"):
        text = re.sub(bullet_pattern, "", text)

    # Any decoration symbols that survived the passes above.
    for symbol in ("•", "○"):
        text = text.replace(symbol, "")

    # Blank out lines mentioning these keywords. The pattern does not consume
    # the newline, so an empty line is left behind in each case.
    for keyword_pattern in (r".*汉平.*", r".*老中医.*", r'.*第二篇.*'):
        text = re.sub(keyword_pattern, "", text)

    return text


def extract(content):
    '''
    @description: Split the pre-processed text into individual medical records
                  and write each record to its own UTF-8 text file under
                  'Record Collections MFP/Raw Medical Record' (created when
                  missing). A summary line is printed at the end.
    @param {string} content: the pre-processed text

    Sections are separated by header lines such as `（一）...` / `(十二)...`.
    Inside a section every line ending in `<digits>岁` starts a new record.
    The first line of a section is used as the disease name for file naming.
    '''
    ROOT_DIR = 'Record Collections MFP'
    TARGET_DIR = 'Raw Medical Record'
    output_dir = os.path.join(ROOT_DIR, TARGET_DIR)
    os.makedirs(output_dir, exist_ok=True)

    # Section header: a parenthesised Chinese numeral on its own line. The
    # character class includes 十 so that round tens (（十）, （二十）, （三十）)
    # are recognised as separators too.
    section_reg = re.compile(r'\n[（(][一二三四五六七八九十]+[)）].*\n')
    # Record start: a line that ends with an age, e.g. `...35岁`.
    record_reg = re.compile(r'\n.*\d+岁\n')
    # Characters that cannot appear in a file name on common platforms.
    illegal_reg = re.compile(r'[\\/:*?"<>|]')

    count = 0
    for i, section in enumerate(section_reg.split(content)):
        # A match at offset 0 is skipped: the section would have no first
        # line to take the disease name from at that position.
        start_list = [m.start() for m in record_reg.finditer(section)
                      if m.start() > 0]
        if not start_list:
            continue
        # Each record runs up to the start of the next one (or section end).
        end_list = start_list[1:] + [len(section)]
        record_list = [section[s:e] for s, e in zip(start_list, end_list)]

        # Fall back to an index-based name when the section has no title line.
        disease_name = section.split('\n')[0] or str(i) + '-未知疾病'
        # Keep the name usable as a single path component.
        disease_name = illegal_reg.sub('_', disease_name)

        for j, record in enumerate(record_list, start=1):
            if len(record_list) == 1:
                file_name = disease_name + '.txt'
            else:
                file_name = disease_name + '-' + str(j) + '.txt'

            with open(os.path.join(output_dir, file_name), 'w', encoding='utf-8') as f:
                f.write(record)
            count += 1

    print('医案抽取完成，共抽取{}个医案， 存放在{}目录下'.format(count, os.path.join(ROOT_DIR, TARGET_DIR)))
        

# Read the whole source file first; the handle is released before processing.
with open(SOURCE_FILE, "r", encoding="utf-8") as f:
    lines = f.readlines()

content = preprocess(lines)  # clean the raw text
extract(content)  # split into records and write them out


医案抽取完成，共抽取234个医案， 存放在Record Collections MFP\Raw Medical Record目录下
