In [1]:
import os
import re

def tag_record_generate(origin_content, key, import_directory_name, filename, book):
    '''
    @description: 生成带有标签的医案(用于批量导入带数据库中)
    @param origin_content: 原始医案内容
    @param key: 疾病名称
    @param import_directory_name: 导入的医案文件夹
    @param filename: 导入的医案文件名
    '''
    
    tag_record_name = os.path.join(import_directory_name, author_name + '-' + filename)
    # 医案来源
    import_content = "#book\n"
    import_content += book + '\n'

    # 医案标签
    import_content += "#start\n"

    dia_idx = origin_content.find('辨证：')
    if dia_idx != -1:
        end_idx = origin_content.find('\n', dia_idx)
        dia_str = origin_content[dia_idx+3:end_idx]
        dia_tag = dia_str.replace('。','')
        dia_tag_list = dia_tag.split('，')
        for tag in dia_tag_list:
            import_content = import_content + 'dia-' + tag + '\n'

    cure_idx = origin_content.find('治法：')
    if cure_idx != -1:
        end_idx = origin_content.find('\n', cure_idx)
        cure_str = origin_content[cure_idx+3:end_idx]
        cure_tag = cure_str.replace('。','')
        cure_tag_list = cure_tag.split('，')
        for tag in cure_tag_list:
            import_content = import_content + 'cure-' + tag + '\n'

    
    import_content = import_content + 'disease-' + key + '\n'
    import_content = import_content + '#end\n'
    import_content = import_content + origin_content
    fp = open(tag_record_name, mode='w', encoding='utf-8')
    fp.write(import_content)
    fp.close()

root_directory = 'Record Collections'
directory_name = 'Raw Medical Record'
import_directory_name = 'Import Medical Record'
file_name = 'record.txt'

fp = open(file_name, 'r', encoding='utf-8')
whole_record_start = False
whole_record = ''
whole_record_map = {}
whole_record_name = ''

for line in fp.readlines():
    # 判断是否以'案'结尾
    length = len(line)
    if length > 1 and line[length-2] == '案':

        if '◎' in line:
            continue
        if whole_record_start == False:
            whole_record_start = True
            whole_record_name = line.replace('\n', '')
        else:
            # 存储上一个whole_record
            whole_record_map[whole_record_name] = whole_record
            whole_record_name = line.replace('\n', '')
            whole_record = ''
            
    # 删除不必要信息
    if '一名真正的名中医' in line:
        continue
    elif '熊继柏临证医案实录' in line:
        continue
    else:
        whole_record += line

# 匹配'案一'、'案二'等信息
record_pattern = r'案[\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]+'
author_name = '熊继柏'

record_cnt = 0
for key in whole_record_map.keys():
    files = re.split(record_pattern, whole_record_map[key])
    if len(files) == 1: # 一个案例
        record_name = os.path.join(directory_name, key+'.txt')
        fp = open(record_name, mode='w', encoding='utf-8')
        fp.write(whole_record_map[key])
        fp.close()

        # tag_record_generate(whole_record_map[key], key, import_directory_name, key)
        record_cnt += 1

    elif len(files) > 1: # 多个案例
        # 创建文件夹
        dir_path = os.path.join(directory_name, key)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        for i in range(1, len(files)):
            record_name = os.path.join(dir_path, key + '-' + str(i) + '.txt')
            fp = open(record_name, mode='w', encoding='utf-8')
            fp.write(files[i])
            fp.close()

            # tag_record_generate(files[i], key, import_directory_name, key+'-'+str(i))
        record_cnt += len(files)

# final文件夹下存放修改好的原始医案
final_medical_dir = 'Refined Medical Record'
# 读取final文件夹下的内容
final_medical_files = os.listdir(os.path.join(root_directory,final_medical_dir))
for file in final_medical_files:
    # 判断是否有子文件夹
    if os.path.isdir(os.path.join(root_directory,final_medical_dir, file)):
        # 读取子文件夹下的内容
        sub_files = os.listdir(os.path.join(root_directory,final_medical_dir, file))
        for sub_file in sub_files:
            # 打开文件
            fp = open(os.path.join(root_directory,final_medical_dir, file, sub_file), mode='r', encoding='utf-8')
            content = fp.read()
            fp.close()
            # 生成带有标签的医案
            key = file.replace('.txt', '')
            tag_record_generate(content, key, os.path.join(root_directory, import_directory_name), sub_file, '熊继柏临证医案实录1')
    else:
        # 打开文件
        fp = open(os.path.join(root_directory,final_medical_dir, file), mode='r', encoding='utf-8')
        content = fp.read()
        fp.close()
        # 生成带有标签的医案
        key = file.replace('.txt', '')
        tag_record_generate(content, key, os.path.join(root_directory, import_directory_name), file, '熊继柏临证医案实录1')

print('生成医案文件成功,共生成{}份医案,存储在{}目录下!'.format(record_cnt, os.path.join(root_directory, import_directory_name)))


生成医案文件成功,共生成362份医案,存储在Record Collections\Import Medical Record目录下!


In [16]:
import pandas as pd

# 生成csv文件
# 第一列为医案内容
# 后面的所有列为标签
# 一行为一个医案
# 一列为一个标签
def generate_csv(root_directory, source_dir, csv_name):
    # 使用pd.DataFrame存储数据
    data = pd.DataFrame(columns=['text', 'label'])

    # 读取文件夹下的所有文件
    files = os.listdir(os.path.join(root_directory, source_dir))
    for file in files:
        if os.path.isdir(os.path.join(root_directory, source_dir, file)):
            sub_files = os.listdir(os.path.join(root_directory, source_dir, file))
            for sub_file in sub_files:
                # 打开文件
                fp = open(os.path.join(root_directory, source_dir, file, sub_file), mode='r', encoding='utf-8')
                content = fp.read()
                fp.close()

                # 读取文件中的标签, 从#start到#end之间的内容
                start = content.find('#start')+6
                end = content.find('#end')
                tags = content[start:end].split(' ')
                text = content[end+4:]
                
                # 将数据存储到data中
                data = pd.concat([data, pd.DataFrame({'text': [text], 'label': [tags]})], ignore_index=True)
        else:
            # 打开文件
            fp = open(os.path.join(root_directory, source_dir, file), mode='r', encoding='utf-8')
            content = fp.read()
            fp.close()

            # 读取文件中的标签, 从#start到#end之间的内容
            start = content.find('#start')+7 # 忽略第一个换行符
            end = content.find('#end')
            tags = content[start:end].split(' ')
            text = content[end+4:]

            # 将数据存储到data中
            data = pd.concat([data, pd.DataFrame({'text': [text], 'label': tags})], ignore_index=True)
        
    # 保存为csv文件
    data.to_csv(os.path.join(root_directory, csv_name), index=False)
    print('已成功生成csv文件，路径为{}'.format(os.path.join(root_directory, csv_name)))

generate_csv(root_directory, import_directory_name, 'medical_record.csv')
            



已成功生成csv文件，路径为Record Collections\medical_record.csv
