In [13]:
import os
import re

SOURCE_FILE = "中国现代名中医医案精华（一）.txt"

def preprocess(lines):
    '''
    @description: Pre-process the raw text: drop useless lines and blanks.
        Lines made up solely of digits (page numbers) are discarded, the
        remaining lines are concatenated, and every ASCII space, full-width
        space (U+3000) and tab is removed. Newlines are kept.
        The input list is not modified.
    @param {list} lines: the lines of the source text, each normally
        ending with a newline (as returned by ``readlines()``)
    @return {string}: the cleaned text as a single string
    '''
    # A page-number line is digits only; the trailing newline is optional so
    # that a final line without a line terminator is recognised as well.
    page_number = re.compile(r'\d+\n?')

    # Build a new list rather than calling lines.remove() inside a loop over
    # `lines`: removing during iteration skips the element that follows each
    # removed one, so consecutive page-number lines would survive.
    kept = [line for line in lines if not page_number.fullmatch(line)]

    # Delete spaces, full-width spaces and tabs in a single pass.
    strip_blanks = str.maketrans("", "", " \u3000\t")
    return "".join(kept).translate(strip_blanks)


# A category header: a line that ends with a case count such as "三例".
_CATEGORY_HEADER = re.compile(r"(?:.*)[一二三四五六七八九]例\n")
# The first line of one numbered case inside a category, e.g. "例一……".
_CASE_HEADER = re.compile(r"例[一二三四五六七八九].*\n")
# The case count ("一例" … "九例") at the very end of a category header.
_COUNT_SUFFIX = re.compile(r"[一二三四五六七八九]例$")
# Treatment verbs that may precede the disease name, checked in this order.
_TREATMENT_VERBS = ('治愈', '治疗', '治')


def _split_at_matches(text, pattern):
    '''Cut `text` into segments, each starting at a match of `pattern`.

    Matches at offset 0 are ignored. Each segment runs up to the next
    retained match; the last one runs to the end of `text`. Anything before
    the first retained match is dropped. Returns an empty list when there is
    no retained match.
    '''
    starts = [m.start() for m in pattern.finditer(text) if m.start() > 0]
    ends = starts[1:] + [len(text)]
    return [text[begin:end] for begin, end in zip(starts, ends)]


def _disease_name(header):
    '''Derive the disease name from a category header line (no newline).

    The trailing case count is removed, then everything up to and including
    the first treatment verb found ('治愈', else '治疗', else '治') is dropped.
    '''
    # Anchor on the end of the line so that a '例' occurring inside the
    # disease name itself does not truncate the name.
    name = _COUNT_SUFFIX.sub("", header)
    for verb in _TREATMENT_VERBS:
        _, found, remainder = name.partition(verb)
        if found:
            # If the verb is the last thing in the header, keep the whole
            # name instead of producing an empty file name.
            return remainder or name
    return name


def extract(content):
    '''
    @description: Extract the medical records and write each one to its own
        file under 'Record Collections MZY/Raw Medical Record' (relative to
        the current working directory). The directory is created if needed
        and every file already in it is deleted first. The text is split into
        categories at header lines ending in a case count ("一例" … "九例"),
        and each category is split into cases at lines containing "例一",
        "例二", … . A category with several cases produces
        "<disease>-<n>.txt"; a category with exactly one case produces
        "<disease>.txt". A summary line is printed at the end.
    @param {string} content: the pre-processed text
    @return {None}:
    '''
    ROOT_DIR = 'Record Collections MZY'
    TARGET_DIR = 'Raw Medical Record'
    target_path = os.path.join(ROOT_DIR, TARGET_DIR)
    os.makedirs(target_path, exist_ok=True)

    # Empty the target folder so that stale output never mixes with new output.
    for stale in os.listdir(target_path):
        os.remove(os.path.join(target_path, stale))

    count = 0
    for category in _split_at_matches(content, _CATEGORY_HEADER):
        header = category.partition("\n")[0]
        disease_name = _disease_name(header)

        # NOTE(review): a category in which no "例一"-style marker is found
        # yields no cases and therefore writes nothing — confirm that this is
        # intended for single-case ("一例") categories.
        cases = _split_at_matches(category, _CASE_HEADER)

        for index, case_text in enumerate(cases, start=1):
            file_name = disease_name
            if len(cases) > 1:
                file_name += "-" + str(index)
            # Every output file gets the extension, including the
            # single-case ones.
            file_name += ".txt"

            # NOTE(review): two categories with the same disease name write
            # to the same path, so the later one overwrites the earlier one
            # while `count` still counts both.
            with open(os.path.join(target_path, file_name), "w", encoding="utf-8") as f:
                f.write(case_text)
                count += 1

    print('医案抽取完成，共抽取{}个医案， 存放在{}目录下'.format(count, target_path))


# Script entry: load the source text, clean it, then split it into record files.
with open(SOURCE_FILE, mode="r", encoding="utf-8") as source:
    lines = source.readlines()

content = preprocess(lines)  # pre-processing
extract(content)  # extract the medical records

医案抽取完成，共抽取194个医案， 存放在Record Collections MZY\Raw Medical Record目录下
