# In [8]:
import os
import re

SOURCE_FILE = "新冠肺炎医案百例.txt"

def preprocess(lines):
    '''
    @description: Preprocess the raw text: drop digit-only lines, then strip
                  whitespace characters and the repeated book title.
    @param {list} lines: the content of every line (as produced by
                         ``readlines()``); the list is not modified
    @return {string}: the preprocessed content joined into one string
    '''

    # A line that holds nothing but digits. The final line of a file may lack
    # a trailing newline, so end-of-string is accepted as well.
    digits_only = re.compile(r'\d+(?:\n|\Z)')

    # Build a filtered copy instead of calling lines.remove() inside a loop
    # over the same list: removing during iteration skips the element that
    # follows each removed one (so consecutive digit-only lines survived) and
    # also altered the caller's list.
    kept = [line for line in lines if not digits_only.match(line)]

    result = "".join(kept)
    # Strip, in order: ASCII space, full-width space, tab, and finally the
    # book title. Whitespace goes first so that a title interrupted by spaces
    # is still recognised and removed.
    for junk in (" ", "　", "\t", "新冠肺炎医案百例"):
        result = result.replace(junk, "")

    return result


def extract(content):
    '''
    @description: Split the preprocessed text into medical records and write
                  each one to its own UTF-8 file named 病案<N>.txt, where N
                  counts the written files starting at 1. Any text before the
                  first record header is written as the first file. When no
                  header is found, nothing is written.
    @param {string} content: the preprocessed content
    @return {None}:
    '''
    ROOT_DIR = 'Record Collections XG'
    TARGET_DIR = 'Raw Medical Record'
    target_path = os.path.join(ROOT_DIR, TARGET_DIR)

    # Create both directory levels in one call; already existing is fine.
    os.makedirs(target_path, exist_ok=True)

    # Empty the target folder so files from an earlier run do not linger.
    for file_name in os.listdir(target_path):
        os.remove(os.path.join(target_path, file_name))

    # A record begins at a header line such as `病案1` standing on its own line.
    header = re.compile(r'\n病案\d+\n')
    starts = [match.start() for match in header.finditer(content)]

    records = []
    if starts:
        # Leading text that precedes the first header.
        records.append(content[:starts[0]])
        # Each record runs from its header to the next header; the last one
        # runs to the end of the content.
        boundaries = starts + [len(content)]
        records.extend(
            content[begin:end]
            for begin, end in zip(boundaries, boundaries[1:])
        )

    count = 0  # stays 0 when there is nothing to write
    for count, record in enumerate(records, start=1):
        record_path = os.path.join(target_path, '病案{}.txt'.format(count))
        with open(record_path, "w", encoding="utf-8") as f:
            f.write(record)

    print('医案抽取完成，共抽取{}个医案， 存放在{}目录下'.format(count, target_path))


# Read the whole source book, then clean it and split it into record files.
# The file only needs to stay open for the read itself.
with open(SOURCE_FILE, "r", encoding="utf-8") as f:
    lines = f.readlines()

content = preprocess(lines)  # preprocessing
extract(content)  # medical record extraction

# Output: 医案抽取完成，共抽取101个医案， 存放在Record Collections XG\Raw Medical Record目录下
