## 1. 创建 病历数据结构

In [2]:
from dataclasses import dataclass, field
from datetime import date
from typing import List, Optional
import os
from tqdm import tqdm
import pandas as pd
import pickle


# 定义 病历信息 的数据结构
@dataclass
class Visit:
    visit_id: str
    admittime: date
    dischtime: date
    deathtime: Optional[date]  # 可能为空
    procedures: List[str] = field(default_factory=list)  # 默认为空列表
    prescriptions: List[str] = field(default_factory=list)
    diagnoses: List[str] = field(default_factory=list)
    labevent: List[str] = field(default_factory=list)  # 具体结构 未定义

@dataclass
class Patient:
    expire_flag: bool
    gender: str 
    visits: List[Visit] = field(default_factory=list)  # 默认为空列表

### 1.1 测试一下数据结构

In [3]:
# 创建一个患者
patient = Patient(expire_flag=False, gender = 'F')

# 添加一次就诊记录
visit1 = Visit(
    visit_id=1,
    admittime=date(2023, 10, 1),
    dischtime=date(2023, 10, 5),
    deathtime=None,
    procedures=["surgery", "x-ray"],
    prescriptions=["aspirin", "antibiotics"],
    diagnoses=["flu", "pneumonia"],
    labevent=["blood_test", "urine_test"]
)

# 将就诊记录添加到患者的 visits 列表中
patient.visits.append(visit1)

# 输出患者信息
print(patient)

Patient(expire_flag=False, gender='F', visits=[Visit(visit_id=1, admittime=datetime.date(2023, 10, 1), dischtime=datetime.date(2023, 10, 5), deathtime=None, procedures=['surgery', 'x-ray'], prescriptions=['aspirin', 'antibiotics'], diagnoses=['flu', 'pneumonia'], labevent=['blood_test', 'urine_test'])])


### 1.2 csv 文件读取函数

In [4]:
def readcsv(input_dir, filename):
    file_path = os.path.join(input_dir, filename)
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"文件 {file_path} 不存在。")
    return pd.read_csv(file_path, dtype=str)  # 显示 规定为 str, 防止 将 0851 解析为 851

### 1.3 保存 & 读取 pkl 文件

In [5]:
def savePkl(outputdir, filename, content):
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)
    filepath = os.path.join(outputdir, f"{filename}")
    
    with open(filepath, 'wb') as f:
        pickle.dump(content, f)
    
    print(f"对象已成功保存到 {filepath}")

In [6]:
def loadPkl(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"文件 {file_path} 不存在！")

    with open(file_path, 'rb') as f:
        content = pickle.load(f)
    
    print(f"已成功加载文件：{file_path}")
    return content

## 2. 读取数据集 （该部分作废）

In [27]:
from pyhealth.datasets import MIMIC3Dataset

dataset = MIMIC3Dataset(
    # 数据集 所在目录
    root="../dataset/MIMICIII_data/", 
    
    tables=["DIAGNOSES_ICD", "PROCEDURES_ICD", "PRESCRIPTIONS"], # "LABEVENTS"  # (基本信息存在：PATIENTS 和 ADMISSIONS 中) 进一步加载的表格
    # code_mapping={ # (可选)
    #     "ICD9CM": "CCSCM",
    #     "ICD9PROC": "CCSPROC",
    #     "NDC": ("ATC", {"target_kwargs": {"level": 3}})
    # },
    dev=False,  # 开发模式，仅使用数据的一小部分
    refresh_cache=True  # 刷新缓存
)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/
finish basic patient information parsing : 35.06629490852356s
finish parsing DIAGNOSES_ICD : 29.78221607208252s
finish parsing PROCEDURES_ICD : 25.489179849624634s
finish parsing PRESCRIPTIONS : 139.89949870109558s


Mapping codes: 100%|██████████| 46520/46520 [00:01<00:00, 24115.37it/s]


### 2.1 设置映射表

In [28]:
from pyhealth.medcode import InnerMap

# # 加载 映射词典
# print(dataset.code_vocs.keys())
# diagnosis_map_table = InnerMap.load(dataset.code_vocs['conditions']) # pyhealth.medcode.codes.icd9cm.ICD9CM
# procedures_map_table = InnerMap.load(dataset.code_vocs['procedures']) # pyhealth.medcode.codes.icd9proc.ICD9PROC
prescriptions_map_table = InnerMap.load(dataset.code_vocs['drugs']) # pyhealth.medcode.codes.ndc.NDC

### 2.2 记录保留字段
patients_field = ['SUBJECT_ID', 'EXPIRE_FLAG']

admission_field = ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME']  # 其中 Deathtime 可能为 空值

procedures_field = ['SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE']

prescriptions_field = ['SUBJECT_ID', 'HADM_ID', 'NDC']

diagnoses_field = []

labevents_field = []

## 3. 数据读取和存储

In [6]:
ehrBase = {}  # 初始化 字典，key 为 病人唯一 id, value 为

### 3.1 病人表

读取病人表

In [8]:
table = readcsv('../dataset/MIMICIII_data/', 'PATIENTS.csv')
fliter = ['SUBJECT_ID', 'EXPIRE_FLAG', 'GENDER']
table = table[fliter]
table.head()

Unnamed: 0,SUBJECT_ID,EXPIRE_FLAG,GENDER
0,249,0,F
1,250,1,F
2,251,0,M
3,252,0,M
4,253,0,F


提取病人信息

In [9]:
# 遍历 DataFrame 的每一行
for index, row in table.iterrows():
    patient_id = row['SUBJECT_ID']
    expire_flag = row['EXPIRE_FLAG']
    gender = row['GENDER']
    
    patient = Patient(expire_flag=expire_flag, gender=gender)
    
    ehrBase[str(patient_id)] = patient

print('单个病人ehr数据示例:', ehrBase['249'] ,'长度：', len(ehrBase))

单个病人ehr数据示例: Patient(expire_flag='0', gender='F', visits=[]) 长度： 46520


阶段性文件存储

In [None]:
savePkl(outputdir='../dataset_processed/MIMICIII_data', filename='ehrBase_v1.pkl', content=ehrBase)  # 版本一：存储病人基本信息

ehrBase = loadPkl("../dataset_processed/MIMICIII_data/ehrBase_v1.pkl")
print('字典元素示例', ehrBase['249'], '长度：', len(ehrBase))

### 3.2 住院登记表

In [12]:
table = readcsv('../dataset/MIMICIII_data/', 'ADMISSIONS.csv')
fliter = ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME']
table = table[fliter]

# 将时间字段转换为 datetime 类型
table['ADMITTIME'] = pd.to_datetime(table['ADMITTIME'])
table['DISCHTIME'] = pd.to_datetime(table['DISCHTIME'])
table['DEATHTIME'] = pd.to_datetime(table['DEATHTIME'])

# 去重
table = table.drop_duplicates()

table.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME
0,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,NaT
1,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,NaT
2,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,NaT
3,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,NaT
4,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,NaT


In [13]:
# 遍历 ADMISSIONS 表格
visit_num = 0  # 记录 visit 数量
for index, row in table.iterrows():
    subject_id = row['SUBJECT_ID']
    hadm_id = row['HADM_ID']
    admittime = row['ADMITTIME']
    dischtime = row['DISCHTIME']
    deathtime = row['DEATHTIME']
    
    # 创建 Visit 对象
    visit = Visit(
        visit_id=hadm_id,
        admittime=admittime,
        dischtime=dischtime,
        deathtime=deathtime,
    )
    
    ehrBase[str(subject_id)].visits.append(visit)
    visit_num += 1

print(visit_num)

58976


In [14]:
print('列表元素示例', ehrBase['249'], '长度：', len(ehrBase))

列表元素示例 Patient(expire_flag='0', gender='F', visits=[Visit(visit_id='116935', admittime=Timestamp('2149-12-17 20:41:00'), dischtime=Timestamp('2149-12-31 14:55:00'), deathtime=NaT, procedures=[], prescriptions=[], diagnoses=[], labevent=[]), Visit(visit_id='149546', admittime=Timestamp('2155-02-03 20:16:00'), dischtime=Timestamp('2155-02-14 11:15:00'), deathtime=NaT, procedures=[], prescriptions=[], diagnoses=[], labevent=[]), Visit(visit_id='158975', admittime=Timestamp('2156-04-27 15:33:00'), dischtime=Timestamp('2156-05-14 15:30:00'), deathtime=NaT, procedures=[], prescriptions=[], diagnoses=[], labevent=[])]) 长度： 46520


In [None]:
savePkl(outputdir='../dataset_processed/MIMICIII_data', filename='ehrBase_v2.pkl', content=ehrBase)  # 版本二：增加病人 住院登记信息

ehrBase = loadPkl("../dataset_processed/MIMICIII_data/ehrBase_v2.pkl")
print('字典元素示例', ehrBase['249'], '长度：', len(ehrBase))

### 3.3 手术

构造映射字典

In [17]:
# 读取 csv 映射表来构造 映射字典 （因为 lookup 函数在查 3601 的时候 内部总是 改写为 36.01）
mapper = readcsv('../dataset/MIMICIII_data/', 'D_ICD_PROCEDURES.csv')[['ICD9_CODE', 'LONG_TITLE']]

mid = dict()
for index, row in mapper.iterrows():
    mid[str(row['ICD9_CODE'])] = str(row['LONG_TITLE'])

mapper = mid
len(mapper)

3882

读取数据，进行映射

In [18]:
from collections import Counter

# 初始化计数器
missing_codes = Counter() # 记录一下 没有映射成功的字段

table = readcsv('../dataset/MIMICIII_data/', 'PROCEDURES_ICD.csv')
fliter = ['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']
table = table[fliter]

# 去重
table = table.drop_duplicates()

# 对该列 每个元素 进行 medCode 到 自然语言 转换
for index, row in table.iterrows():
    code = str(row['ICD9_CODE'])
    if code in mapper:
        table.at[index, 'ICD9_CODE'] = mapper[code]
    else:
        # 如果代码不在映射字典中，保持原样并记录
        missing_codes[code] += 1

table.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,62641,154460,Insertion of intercostal catheter for drainage
1,2592,130856,Continuous invasive mechanical ventilation for...
2,2592,130856,"Venous catheterization, not elsewhere classified"
3,55357,119355,Continuous invasive mechanical ventilation for...
4,55357,119355,Spinal tap


未映射成功的部分：

Counter({'3601': 1046,
         '022': 455,
         '3605': 218,
         '324': 85,
         '458': 57,
         '684': 31,
         '857': 29,
         '537': 24,
         '325': 22,
         '323': 17,
         '3602': 16,
         '374': 15,
         '485': 13,
         '686': 9,
         '683': 5,
         '398': 1})

进行 ehrBase 数据填充

In [20]:
ehrBase = loadPkl("../dataset_processed/MIMICIII_data/ehrBase_v2.pkl")

# 遍历 table 中的每一行数据
for index, row in table.iterrows():
    subject_id = row['SUBJECT_ID']
    hadm_id = row['HADM_ID']
    icd9_code = row['ICD9_CODE']
    
    if str(subject_id) in ehrBase:
        patient = ehrBase[subject_id]
        
        #print('id 匹配')
        for visit in patient.visits:
            #print('visit 匹配', type(visit.visit_id))
            if str(visit.visit_id) == hadm_id:
                #print('visit_id 匹配')
                visit.procedures.append(icd9_code)
                ehrBase[subject_id] = patient
                break
    
ehrBase['249']

已成功加载文件：../dataset_processed/MIMICIII_data/ehrBase_v2.pkl


Patient(expire_flag='0', gender='F', visits=[Visit(visit_id='116935', admittime=Timestamp('2149-12-17 20:41:00'), dischtime=Timestamp('2149-12-31 14:55:00'), deathtime=NaT, procedures=['Continuous invasive mechanical ventilation for 96 consecutive hours or more', 'Left heart cardiac catheterization', 'Coronary arteriography using two catheters', 'Other and unspecified coronary arteriography', 'Venous catheterization, not elsewhere classified', 'Insertion of endotracheal tube', 'Transfusion of packed cells', 'Enteral infusion of concentrated nutritional substances', 'Transfusion of other serum'], prescriptions=[], diagnoses=[], labevent=[]), Visit(visit_id='149546', admittime=Timestamp('2155-02-03 20:16:00'), dischtime=Timestamp('2155-02-14 11:15:00'), deathtime=NaT, procedures=['Endovascular removal of obstruction from head and neck vessel(s)', 'Procedure on single vessel', 'Arteriography of cerebral arteries', 'Injection or infusion of thrombolytic agent'], prescriptions=[], diagnoses

In [None]:
savePkl(outputdir='../dataset_processed/MIMICIII_data', filename='ehrBase_v3.pkl', content=ehrBase)  # 版本三：增加病人 手术信息

ehrBase = loadPkl("../dataset_processed/MIMICIII_data/ehrBase_v3.pkl")
print('字典元素示例', ehrBase['249'], '长度：', len(ehrBase))

### 3.4 诊断

构造映射字典

In [58]:
# 读取 csv 映射表来构造 映射字典 （因为 lookup 函数在查 3601 的时候 内部总是 改写为 36.01）
mapper = readcsv('../dataset/MIMICIII_data/', 'D_ICD_DIAGNOSES.csv')[['ICD9_CODE', 'LONG_TITLE']]

mid = dict()
for index, row in mapper.iterrows():
    mid[str(row['ICD9_CODE'])] = str(row['LONG_TITLE'])

mapper = mid
len(mapper)

14567

读取数据 进行映射

In [59]:
from collections import Counter

# 初始化计数器
missing_codes = Counter() # 记录一下 没有映射成功的字段

table = readcsv('../dataset/MIMICIII_data/', 'DIAGNOSES_ICD.csv')
fliter = ['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']
table = table[fliter]

# 去重
table = table.drop_duplicates()

# 对该列 每个元素  进行 medCode 到 自然语言 转换
for index, row in table.iterrows():
    code = str(row['ICD9_CODE'])
    if code in mapper:
        table.at[index, 'ICD9_CODE'] = mapper[code]
    else:
        # 如果代码不在映射字典中，保持原样并记录
        missing_codes[code] += 1

table.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,109,172335,"Hypertensive chronic kidney disease, malignant..."
1,109,172335,"Pneumonia, organism unspecified"
2,109,172335,Chronic glomerulonephritis in diseases classif...
3,109,172335,"Chronic kidney disease, Stage V"
4,109,172335,Other primary cardiomyopathies


未映射成功的部分：

Counter({'5185': 1807,
         '2765': 1348,
         '7793': 1068,
         '0414': 967,
         '9974': 760,
         '4538': 607,
         '2874': 584,
         '2766': 575,
         '7895': 534,
         '5997': 509,
         '2841': 472,
         '7806': 426,
         '7863': 401,
         '9973': 390,
         'V721': 374,
         '2848': 321,
         '2554': 290,
         '585': 273,
         '7070': 271,
         '5191': 236,
         '7708': 225,
         '4582': 218,
         '5672': 217,
         '4251': 183,
         'V451': 175,
...
         '6168': 1,
         '6221': 1,
         '3234': 1,
         '7523': 1,
         '9994': 1})

进行 ehrBase 数据填充

In [60]:
from tqdm import tqdm
ehrBase = loadPkl("../dataset_processed/MIMICIII_data/ehrBase_v3.pkl")

# 遍历 table 中的每一行数据
for index, row in tqdm(table.iterrows(), total=len(table), desc="Processing rows"):
    subject_id = row['SUBJECT_ID']
    hadm_id = row['HADM_ID']
    icd9_code = row['ICD9_CODE']
    
    if str(subject_id) in ehrBase:
        patient = ehrBase[subject_id]
        
        #print('id 匹配')
        for visit in patient.visits:
            #print('visit 匹配', type(visit.visit_id))
            if str(visit.visit_id) == hadm_id:
                #print('visit_id 匹配')
                visit.diagnoses.append(icd9_code)
                ehrBase[subject_id] = patient
                break

ehrBase['109']

已成功加载文件：../dataset_processed/MIMICIII_data/ehrBase_v3.pkl


Processing rows: 100%|██████████| 650987/650987 [00:28<00:00, 22504.71it/s]


Patient(expire_flag='1', gender='F', visits=[Visit(visit_id='183350', admittime=Timestamp('2137-11-04 19:36:00'), dischtime=Timestamp('2137-11-21 18:13:00'), deathtime=NaT, procedures=['Closed [percutaneous] [needle] biopsy of kidney', 'Therapeutic plasmapheresis', 'Venous catheterization, not elsewhere classified', 'Transfusion of packed cells', 'Other endoscopy of small intestine', 'Venous catheterization for renal dialysis', 'Hemodialysis'], prescriptions=[], diagnoses=['Systemic lupus erythematosus', 'Acute kidney failure, unspecified', 'Hematemesis', 'Hemorrhage complicating a procedure', 'Thrombotic microangiopathy', 'Hypertensive chronic kidney disease, malignant, with chronic kidney disease stage V or end stage renal disease', 'Chronic glomerulonephritis in diseases classified elsewhere', 'Other specified procedures as the cause of abnormal reaction of patient, or of later complication, without mention of misadventure at time of procedure', 'Other specified disorders resulting 

In [None]:
savePkl(outputdir='../dataset_processed/MIMICIII_data', filename='ehrBase_v4.pkl', content=ehrBase)  # 版本四：增加病人 诊断信息

ehrBase = loadPkl("../dataset_processed/MIMICIII_data/ehrBase_v4.pkl")
print('字典元素示例', ehrBase['249'], '长度：', len(ehrBase))

### 3.5 药物

构造映射字典

In [66]:
# 读取 csv 映射表来构造 映射字典 （因为 lookup 函数在查 3601 的时候 内部总是 改写为 36.01）
mapper = InnerMap.load(dataset.code_vocs['drugs'])

读取数据 进行映射

In [67]:
from collections import Counter

# 初始化计数器
missing_codes = Counter() # 记录一下 没有映射成功的字段

table = readcsv('../dataset/MIMICIII_data/', 'PRESCRIPTIONS.csv')
fliter = ['SUBJECT_ID', 'HADM_ID', 'NDC']
table = table[fliter]

# 去重
table = table.drop_duplicates()

# 对该列 每个元素  进行 medCode 到 自然语言 转换
for index, row in tqdm(table.iterrows(), total=len(table), desc="Task processing:"):
    code = str(row['NDC'])
    if code in mapper:
        table.at[index, 'NDC'] = mapper.lookup(code)
    else:
        # 如果代码不在映射字典中，保持原样并记录
        missing_codes[code] += 1

table.head()

Task processing:: 100%|██████████| 1979323/1979323 [02:03<00:00, 15962.30it/s]


Unnamed: 0,SUBJECT_ID,HADM_ID,NDC
0,6,107064,tacrolimus 1 MG Oral Capsule [Prograf]
1,6,107064,warfarin sodium 5 MG Oral Tablet [Coumadin]
2,6,107064,"250 ML heparin sodium, porcine 100 UNT/ML Inje..."
3,6,107064,0
4,6,107064,furosemide 20 MG Oral Tablet


未映射成功的部分：

Counter({'0': 48985,
         '17314931102': 5137,
         '66689036430': 4476,
         '62584078833': 2397,
         '66591018442': 2134,
         '62174057751': 2040,
         '55499120401': 1913,
         '00074729501': 1877,
         'nan': 1613,
         '00245008201': 1186,
         '16837085839': 1182,
         '15127020017': 1167,
         '11980002515': 980,
         '37205014472': 964,
         '00087036503': 541,
         '00338040360': 515,
         '87701071218': 448,
         '00338101102': 432,
         '00065041435': 340,
         '00074131201': 307,
         '00245009101': 276,
         '00436515110': 184,
         '00074010304': 141,
         '54569475100': 132,
         '47682021701': 96,
...
         '00026066420': 1,
         '00026066430': 1,
         '72140085731': 1,
         '49281000610': 1,
         '00054348254': 1})

进行 ehrBase 数据填充

In [None]:
ehrBase = loadPkl("../dataset_processed/MIMICIII_data/ehrBase_v4.pkl")

# 遍历 table 中的每一行数据
for index, row in tqdm(table.iterrows(), total=len(table), desc="Processing rows"):
    subject_id = row['SUBJECT_ID']
    hadm_id = row['HADM_ID']
    ndc_code = row['NDC']
    
    if str(subject_id) in ehrBase:
        patient = ehrBase[subject_id]
        
        #print('id 匹配')
        for visit in patient.visits:
            #print('visit 匹配', type(visit.visit_id))
            if str(visit.visit_id) == hadm_id:
                #print('visit_id 匹配')
                visit.prescriptions.append(ndc_code)
                ehrBase[subject_id] = patient
                break

已成功加载文件：../dataset_processed/MIMICIII_data/ehrBase_v4.pkl


Processing rows: 100%|██████████| 1979323/1979323 [01:28<00:00, 22398.88it/s]


AttributeError: 'Patient' object has no attribute 'visit'

In [70]:
ehrBase['249'].visits[0].prescriptions

['lorazepam 0.5 MG Oral Tablet',
 'lorazepam 1 MG Oral Tablet',
 'trandolapril 1 MG Oral Tablet [Mavik]',
 '0',
 '250 ML heparin sodium, porcine 100 UNT/ML Injection',
 'Vancomycin 100 MG/ML Injectable Solution [Vancocin]',
 'Captopril 12.5 MG Oral Tablet [Capoten]',
 'Nitroglycerin 0.02 MG/MG Topical Ointment [Nitro-Bid]',
 'Prednisone 20 MG Oral Tablet',
 '17314931102',
 'Levofloxacin 250 MG Oral Tablet [Levaquin]',
 'Morphine Sulfate 8 MG/ML Injectable Solution',
 'Furosemide 10 MG/ML Injectable Solution',
 '100 ML sodium chloride 9 MG/ML Injection',
 'Vitamin K 1 10 MG/ML Injectable Solution',
 'insulin, regular, human 100 UNT/ML Injectable Solution [Humulin R]',
 'Captopril 25 MG Oral Tablet',
 'Nitroglycerin 0.4 MG/ML Injectable Solution',
 '50 ML potassium chloride 0.4 MEQ/ML Injection',
 'racepinephrine 22.5 MG/ML Inhalation Solution [S-2]',
 '1 ML hydralazine hydrochloride 20 MG/ML Injection',
 'metoprolol tartrate 25 MG Oral Tablet',
 'heparin sodium, porcine 5000 UNT/ML Inje

In [None]:
savePkl(outputdir='../dataset_processed/MIMICIII_data', filename='ehrBase_v5.pkl', content=ehrBase)  # 版本五：增加病人 用药信息

ehrBase = loadPkl("../dataset_processed/MIMICIII_data/ehrBase_v5.pkl")
print('字典元素示例', ehrBase['249'], '长度：', len(ehrBase))

### 3.6 实验室检查数据

构造映射字典

In [7]:
# 读取 csv 映射表来构造 映射字典 （因为 lookup 函数在查 3601 的时候 内部总是 改写为 36.01）
mapper = readcsv('../dataset/MIMICIII_data/', 'D_LABITEMS.csv')[['ITEMID', 'LABEL', 'FLUID']]  # 测量的概念，测量的物质

mid = dict()
for index, row in mapper.iterrows():
    mid[str(row['ITEMID'])] = [str(row['LABEL']), str(row['FLUID'])]  # 第一个表示 测量的概念，第二个元素表示 测量的物质

mapper = mid
len(mapper)

753

读取数据 进行映射

In [8]:
from collections import Counter

# 初始化计数器
missing_codes = Counter() # 记录一下 没有映射成功的字段

table = readcsv('../dataset/', 'LABEVENTS_WITH_STATUS.csv')
fliter = ['SUBJECT_ID', 'HADM_ID', 'ITEMID', 'CHARTTIME', 'VALUE', 'VALUEUOM', 'FLAG', 'RESULT_STATUS']  # 包括：检查项目，检查时间(暂时忽略), 检查值，单位，是否正常
table = table[fliter]

# 去重
table = table.drop_duplicates()
print('读取 和 去重完毕')

# 拼接 VALUE 和 VALUEUOM 字段，并将结果存入 VALUE 字段
table['VALUE'] = table.apply(lambda row: f"{row['VALUE']} {row['VALUEUOM']}" if pd.notna(row['VALUEUOM']) else row['VALUE'], axis=1)
print('拼接完毕')

# 清空 VALUEUOM 字段的所有内容，设置为 NaN
table['VALUEUOM'] = pd.NA
print('清空完毕')

# 将 VALUEUOM 字段改名
table.rename(columns={'VALUEUOM': 'ITEMCONTENT'}, inplace=True)


读取 和 去重完毕
拼接完毕
清空完毕


In [9]:
table.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,ITEMCONTENT,FLAG,RESULT_STATUS
0,3,,50820,2101-10-12 16:07:00,7.39 units,,,Normal
1,3,,50800,2101-10-12 18:17:00,ART,,,Unknown
2,3,,50802,2101-10-12 18:17:00,-1 mEq/L,,,Normal
3,3,,50804,2101-10-12 18:17:00,22 mEq/L,,,Normal
4,3,,50808,2101-10-12 18:17:00,0.93 mmol/L,,abnormal,Normal


In [21]:
print(table['HADM_ID'].dtype)

object


In [10]:
# 对该列 每个元素  进行 medCode 到 自然语言 转换
for index, row in tqdm(table.iterrows(), total=len(table), desc="Task processing:"):
    code = str(row['ITEMID'])
    if code in mapper:
        table.at[index, 'ITEMID'] = mapper[code][0]
        table.at[index, 'ITEMCONTENT'] = mapper[code][1]
    else:
        # 如果代码不在映射字典中，保持原样并记录
        missing_codes[code] += 1

Task processing:: 100%|██████████| 27851392/27851392 [35:17<00:00, 13155.69it/s]


In [20]:
table.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,ITEMCONTENT,FLAG,RESULT_STATUS
160,3,145834.0,Anion Gap,2101-10-20 16:40:00,17 mEq/L,Blood,,Normal
161,3,145834.0,Bicarbonate,2101-10-20 16:40:00,25 mEq/L,Blood,,Normal
162,3,145834.0,"Calcium, Total",2101-10-20 16:40:00,8.2 mg/dL,Blood,abnormal,Normal
163,3,145834.0,Chloride,2101-10-20 16:40:00,99 mEq/L,Blood,abnormal,Normal
164,3,145834.0,Creatine Kinase (CK),2101-10-20 16:40:00,48 IU/L,Blood,,Normal


In [19]:
# 删除 HADM_ID 为空的行
# table = table.dropna(subset=['HADM_ID'])
table.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,ITEMCONTENT,FLAG,RESULT_STATUS
160,3,145834.0,Anion Gap,2101-10-20 16:40:00,17 mEq/L,Blood,,Normal
161,3,145834.0,Bicarbonate,2101-10-20 16:40:00,25 mEq/L,Blood,,Normal
162,3,145834.0,"Calcium, Total",2101-10-20 16:40:00,8.2 mg/dL,Blood,abnormal,Normal
163,3,145834.0,Chloride,2101-10-20 16:40:00,99 mEq/L,Blood,abnormal,Normal
164,3,145834.0,Creatine Kinase (CK),2101-10-20 16:40:00,48 IU/L,Blood,,Normal


未映射成功的部分：

Counter({'pH': 5,
         'SPECIMEN TYPE': 1,
         'Base Excess': 1,
         'Calculated Total CO2': 1,
         'Free Calcium': 1,
         'Intubated': 1})

进行 ehrBase 数据填充

In [30]:
from tqdm import tqdm
ehrBase = loadPkl("../dataset_processed/MIMICIII_data/ehrBase_v5.pkl")

n = 0

# 遍历 table 中的每一行数据
for index, row in tqdm(table.iterrows(), total=len(table), desc="Processing rows"):
    subject_id = row['SUBJECT_ID']
    hadm_id = row['HADM_ID']
    item = row['ITEMID']
    value = row['VALUE']
    flag = row['FLAG']
    itemcontent = row['ITEMCONTENT']  # 测量物质
    box = row['RESULT_STATUS']  # 分箱结果
    
    record = {
    'item': item,
    'value': value,
    'flag': flag,
    'itemcontent': itemcontent,
    'box': box
}

    if str(subject_id) in ehrBase:
        patient = ehrBase[subject_id]
        
        #print('id 匹配')
        for visit in patient.visits:
            #print('visit 匹配', type(hadm_id))
            #print(visit.visit_id.rstrip('.0'), hadm_id)
            if visit.visit_id.rstrip('.0') == hadm_id.rstrip('.0'):
                #print('visit_id 匹配')
                visit.labevent.append(record)
                ehrBase[subject_id] = patient

                n += 1
                break
    
print(n)
            

已成功加载文件：../dataset_processed/MIMICIII_data/ehrBase_v5.pkl


Processing rows: 100%|██████████| 22245034/22245034 [20:24<00:00, 18159.91it/s]

22245034





In [32]:
ehrBase['249'].visits[1]

Visit(visit_id='149546', admittime=Timestamp('2155-02-03 20:16:00'), dischtime=Timestamp('2155-02-14 11:15:00'), deathtime=NaT, procedures=['Endovascular removal of obstruction from head and neck vessel(s)', 'Procedure on single vessel', 'Arteriography of cerebral arteries', 'Injection or infusion of thrombolytic agent'], prescriptions=['1000 ML sodium chloride 9 MG/ML Injection', 'atorvastatin 40 MG Oral Tablet [Lipitor]', 'NITROFURANTOIN, MACROCRYSTALS 25 MG / Nitrofurantoin, Monohydrate 75 MG Oral Capsule [Macrobid]', 'pantoprazole 40 MG Injection [Protonix]', '0', '50 ML glucose 500 MG/ML Prefilled Syringe', 'glucagon (rDNA) 1 MG Injection [GlucaGen]', 'TAB-A-VITE TABLET', 'Nitroglycerin 0.3 MG Sublingual Tablet [Nitroquick]', 'levothyroxine sodium 0.075 MG Oral Tablet [Synthroid]', '1 ML morphine sulfate 4 MG/ML Injection', '1 ML morphine sulfate 2 MG/ML Prefilled Syringe', 'ipratropium bromide 0.2 MG/ML Inhalation Solution', '120 ACTUAT fluticasone propionate 0.11 MG/ACTUAT Meter

## 4. 将 ehrBase 保存一波

### 4.1 版本1

In [10]:
# 运行完 3.2 进行一次保存
savePkl(outputdir='../dataset_processed/MIMICIII_data', filename='ehrBase_v1.pkl', content=ehrBase)  # 版本一：包含 病人 id, 病人 住院号, 性别 等基本信息

对象已成功保存到 ../dataset_processed/MIMICIII_data\ehrBase_v1.pkl


In [11]:
ehrBase = loadPkl("../dataset_processed/MIMICIII_data/ehrBase_v1.pkl")
print('字典元素示例', ehrBase['249'], '长度：', len(ehrBase))

已成功加载文件：../dataset_processed/MIMICIII_data/ehrBase_v1.pkl
字典元素示例 Patient(expire_flag='0', gender='F', visits=[]) 长度： 46520


### 4.2 版本2

In [15]:
# 运行完 3.3 进行一次保存
savePkl(outputdir='../dataset_processed/MIMICIII_data', filename='ehrBase_v2.pkl', content=ehrBase)  # 版本二：增加病人的 手术信息

对象已成功保存到 ../dataset_processed/MIMICIII_data\ehrBase_v2.pkl


In [16]:
ehrBase = loadPkl("../dataset_processed/MIMICIII_data/ehrBase_v2.pkl")
print('字典元素示例', ehrBase['249'], '长度：', len(ehrBase))

已成功加载文件：../dataset_processed/MIMICIII_data/ehrBase_v2.pkl
字典元素示例 Patient(expire_flag='0', gender='F', visits=[Visit(visit_id='116935', admittime=Timestamp('2149-12-17 20:41:00'), dischtime=Timestamp('2149-12-31 14:55:00'), deathtime=NaT, procedures=[], prescriptions=[], diagnoses=[], labevent=[]), Visit(visit_id='149546', admittime=Timestamp('2155-02-03 20:16:00'), dischtime=Timestamp('2155-02-14 11:15:00'), deathtime=NaT, procedures=[], prescriptions=[], diagnoses=[], labevent=[]), Visit(visit_id='158975', admittime=Timestamp('2156-04-27 15:33:00'), dischtime=Timestamp('2156-05-14 15:30:00'), deathtime=NaT, procedures=[], prescriptions=[], diagnoses=[], labevent=[])]) 长度： 46520


### 4.3 版本3

In [21]:
# 运行完 3.3 进行一次保存
savePkl(outputdir='../dataset_processed/MIMICIII_data', filename='ehrBase_v3.pkl', content=ehrBase)  # 版本三：增加病人的 诊断信息

对象已成功保存到 ../dataset_processed/MIMICIII_data\ehrBase_v3.pkl


In [53]:
ehrBase = loadPkl("../dataset_processed/MIMICIII_data/ehrBase_v3.pkl")
print('字典元素示例', ehrBase['249'], '长度：', len(ehrBase))

已成功加载文件：../dataset_processed/MIMICIII_data/ehrBase_v3.pkl
字典元素示例 Patient(expire_flag='0', gender='F', visits=[Visit(visit_id='116935', admittime=Timestamp('2149-12-17 20:41:00'), dischtime=Timestamp('2149-12-31 14:55:00'), deathtime=NaT, procedures=['Continuous invasive mechanical ventilation for 96 consecutive hours or more', 'Left heart cardiac catheterization', 'Coronary arteriography using two catheters', 'Other and unspecified coronary arteriography', 'Venous catheterization, not elsewhere classified', 'Insertion of endotracheal tube', 'Transfusion of packed cells', 'Enteral infusion of concentrated nutritional substances', 'Transfusion of other serum'], prescriptions=[], diagnoses=[], labevent=[]), Visit(visit_id='149546', admittime=Timestamp('2155-02-03 20:16:00'), dischtime=Timestamp('2155-02-14 11:15:00'), deathtime=NaT, procedures=['Endovascular removal of obstruction from head and neck vessel(s)', 'Procedure on single vessel', 'Arteriography of cerebral arteries', 'Injection

In [57]:
ehrBase['249'].visits[0].procedures

['Continuous invasive mechanical ventilation for 96 consecutive hours or more',
 'Left heart cardiac catheterization',
 'Coronary arteriography using two catheters',
 'Other and unspecified coronary arteriography',
 'Venous catheterization, not elsewhere classified',
 'Insertion of endotracheal tube',
 'Transfusion of packed cells',
 'Enteral infusion of concentrated nutritional substances',
 'Transfusion of other serum']

### 4.4 版本4

In [71]:
# 运行完 3.5 进行一次保存
savePkl(outputdir='../dataset_processed/MIMICIII_data', filename='ehrBase_v5.pkl', content=ehrBase)  # 版本四：增加病人的 用药信息


对象已成功保存到 ../dataset_processed/MIMICIII_data\ehrBase_v5.pkl


In [None]:
# 运行完 3.5 进行一次保存
savePkl(outputdir='../dataset_processed/MIMICIII_data', filename='ehrBase_v5.pkl', content=ehrBase)  # 版本四：增加病人的 用药信息

ehrBase = loadPkl("../dataset_processed/MIMICIII_data/ehrBase_v5.pkl")
print('字典元素示例', ehrBase['249'], '长度：', len(ehrBase))

已成功加载文件：../dataset_processed/MIMICIII_data/ehrBase_v5.pkl
字典元素示例 Patient(expire_flag='0', gender='F', visits=[Visit(visit_id='116935', admittime=Timestamp('2149-12-17 20:41:00'), dischtime=Timestamp('2149-12-31 14:55:00'), deathtime=NaT, procedures=['Continuous invasive mechanical ventilation for 96 consecutive hours or more', 'Left heart cardiac catheterization', 'Coronary arteriography using two catheters', 'Other and unspecified coronary arteriography', 'Venous catheterization, not elsewhere classified', 'Insertion of endotracheal tube', 'Transfusion of packed cells', 'Enteral infusion of concentrated nutritional substances', 'Transfusion of other serum'], prescriptions=['lorazepam 0.5 MG Oral Tablet', 'lorazepam 1 MG Oral Tablet', 'trandolapril 1 MG Oral Tablet [Mavik]', '0', '250 ML heparin sodium, porcine 100 UNT/ML Injection', 'Vancomycin 100 MG/ML Injectable Solution [Vancocin]', 'Captopril 12.5 MG Oral Tablet [Capoten]', 'Nitroglycerin 0.02 MG/MG Topical Ointment [Nitro-Bid]',

In [75]:
ehrBase['249'].visits[0].procedures

['Continuous invasive mechanical ventilation for 96 consecutive hours or more',
 'Left heart cardiac catheterization',
 'Coronary arteriography using two catheters',
 'Other and unspecified coronary arteriography',
 'Venous catheterization, not elsewhere classified',
 'Insertion of endotracheal tube',
 'Transfusion of packed cells',
 'Enteral infusion of concentrated nutritional substances',
 'Transfusion of other serum']

### 4.5 版本5

In [None]:
# 运行 3.6 过程中进行一次保存
savePkl(outputdir='../dataset_processed/MIMICIII_data', filename='labevents_withNAN.pkl', content=table)  # 包含 hadm_id NAN值 的实验室检查数据

对象已成功保存到 ../dataset_processed/MIMICIII_data\labevents_withNAN.pkl


In [None]:
# 运行 3.6 过程中进行一次保存
savePkl(outputdir='../dataset_processed/MIMICIII_data', filename='labevents_withoutNAN.pkl', content=table)  # 不包含 hadm_id NAN值 的实验室检查数据

对象已成功保存到 ../dataset_processed/MIMICIII_data\labevents_withoutNAN.pkl


In [34]:
# 运行完 3.6 进行一次保存
savePkl(outputdir='../dataset_processed/MIMICIII_data', filename='ehrBase_v6.pkl', content=ehrBase)  # 版本五：增加病人的 实验室检查数据

对象已成功保存到 ../dataset_processed/MIMICIII_data\ehrBase_v6.pkl


In [36]:
ehrBase = loadPkl("../dataset_processed/MIMICIII_data/ehrBase_v6.pkl")
print('字典元素示例', ehrBase['249'], '长度：', len(ehrBase))

已成功加载文件：../dataset_processed/MIMICIII_data/ehrBase_v6.pkl
字典元素示例 Patient(expire_flag='0', gender='F', visits=[Visit(visit_id='116935', admittime=Timestamp('2149-12-17 20:41:00'), dischtime=Timestamp('2149-12-31 14:55:00'), deathtime=NaT, procedures=['Continuous invasive mechanical ventilation for 96 consecutive hours or more', 'Left heart cardiac catheterization', 'Coronary arteriography using two catheters', 'Other and unspecified coronary arteriography', 'Venous catheterization, not elsewhere classified', 'Insertion of endotracheal tube', 'Transfusion of packed cells', 'Enteral infusion of concentrated nutritional substances', 'Transfusion of other serum'], prescriptions=['lorazepam 0.5 MG Oral Tablet', 'lorazepam 1 MG Oral Tablet', 'trandolapril 1 MG Oral Tablet [Mavik]', '0', '250 ML heparin sodium, porcine 100 UNT/ML Injection', 'Vancomycin 100 MG/ML Injectable Solution [Vancocin]', 'Captopril 12.5 MG Oral Tablet [Capoten]', 'Nitroglycerin 0.02 MG/MG Topical Ointment [Nitro-Bid]',

## 5. ehrBase 数据示例

In [34]:
# 查看 单个病人的 ehr 数据
ehrBase['258']

Patient(expire_flag='0', gender='F', visits=[Visit(visit_id='189406', admittime=Timestamp('2124-09-19 03:59:00'), dischtime=Timestamp('2124-09-22 15:52:00'), deathtime=NaT, procedures=['Other phototherapy', 'Prophylactic administration of vaccine against other diseases'], prescriptions=[], diagnoses=[], labevent=[])])

In [37]:
def print_structure(data, indent=0):
    """
    递归打印数据结构的内容。
    :param data: 要打印的数据结构（如 Patient 或 Visit 对象）。
    :param indent: 当前缩进级别。
    """
    if isinstance(data, Patient):
        print("  " * indent + f"Patient(expire_flag={data.expire_flag}, gender = {data.gender}, visits=[")
        for visit in data.visits:
            print_structure(visit, indent + 1)
        print("  " * indent + "])")

    elif isinstance(data, Visit):
        print("  " * indent + f"Visit(visit_id={data.visit_id},admittime={data.admittime}, dischtime={data.dischtime}, deathtime={data.deathtime},")
        print("  " * indent + "procedures=[")
        for procedure in data.procedures:
            print("  " * (indent + 1) + f"'{procedure}',")
        print("  " * indent + "],")
        print("  " * indent + "prescriptions=[")
        for prescription in data.prescriptions:
            print("  " * (indent + 1) + f"'{prescription}',")
        print("  " * indent + "],")
        print("  " * indent + "diagnoses=[")
        for diagnosis in data.diagnoses:
            print("  " * (indent + 1) + f"'{diagnosis}',")
        print("  " * indent + "],")
        print("  " * indent + "labevent=[")
        for lab in data.labevent:
            print("  " * (indent + 1) + "{")
            for key, value in lab.items():
                print("  " * (indent + 2) + f"{key}: {value},")
            print("  " * (indent + 1) + "},")
        print("  " * indent + "])")

In [38]:
# 打印 单个病人的 ehr 数据
print_structure(ehrBase['249'].visits[1])

Visit(visit_id=149546,admittime=2155-02-03 20:16:00, dischtime=2155-02-14 11:15:00, deathtime=NaT,
procedures=[
  'Endovascular removal of obstruction from head and neck vessel(s)',
  'Procedure on single vessel',
  'Arteriography of cerebral arteries',
  'Injection or infusion of thrombolytic agent',
],
prescriptions=[
  '1000 ML sodium chloride 9 MG/ML Injection',
  'atorvastatin 40 MG Oral Tablet [Lipitor]',
  'NITROFURANTOIN, MACROCRYSTALS 25 MG / Nitrofurantoin, Monohydrate 75 MG Oral Capsule [Macrobid]',
  'pantoprazole 40 MG Injection [Protonix]',
  '0',
  '50 ML glucose 500 MG/ML Prefilled Syringe',
  'glucagon (rDNA) 1 MG Injection [GlucaGen]',
  'TAB-A-VITE TABLET',
  'Nitroglycerin 0.3 MG Sublingual Tablet [Nitroquick]',
  'levothyroxine sodium 0.075 MG Oral Tablet [Synthroid]',
  '1 ML morphine sulfate 4 MG/ML Injection',
  '1 ML morphine sulfate 2 MG/ML Prefilled Syringe',
  'ipratropium bromide 0.2 MG/ML Inhalation Solution',
  '120 ACTUAT fluticasone propionate 0.11 MG/A