In [1]:
import os
# Root directory of the local MIMIC-IV data (folder name suggests release 2.2);
# later cells build file paths under it with os.path.join(DATA_PATH, 'hosp', ...).
DATA_PATH = "./mimic-iv-2.2"

In [3]:
import pandas as pd
pd.set_option('display.max_colwidth', None)  # show full cell text instead of truncating long titles

# 1. Load the ICD diagnosis dictionary.
#    icd_code is forced to str: ICD codes are identifiers, not numbers, so they
#    must keep any leading zeros and must never be inferred as integers
#    (code-list matching elsewhere in this notebook compares them as strings).
d_icd_path = os.path.join(DATA_PATH, 'hosp', 'd_icd_diagnoses.csv.gz')
df_dictionary = pd.read_csv(d_icd_path, dtype={'icd_code': str})

# 2. Case-insensitive search for "Multiple myeloma" in long_title.
#    regex=False: the search term is plain text, not a pattern.
#    na=False: rows with a missing title are treated as non-matches.
is_mm_title = df_dictionary['long_title'].str.contains(
    'Multiple myeloma', case=False, na=False, regex=False
)
mm_codes = df_dictionary[is_mm_title]

# 3. Print the matching codes with their ICD version and full title.
print("--- 找到的相关疾病代码如下 ---")
print(mm_codes[['icd_code', 'icd_version', 'long_title']])

--- 找到的相关疾病代码如下 ---
      icd_code  icd_version  \
1899     20300            9   
1900     20301            9   
1901     20302            9   
14838      C90           10   
14839     C900           10   
14840    C9000           10   
14841    C9001           10   
14842    C9002           10   

                                                           long_title  
1899   Multiple myeloma, without mention of having achieved remission  
1900                                   Multiple myeloma, in remission  
1901                                     Multiple myeloma, in relapse  
14838            Multiple myeloma and malignant plasma cell neoplasms  
14839                                                Multiple myeloma  
14840                  Multiple myeloma not having achieved remission  
14841                                   Multiple myeloma in remission  
14842                                     Multiple myeloma in relapse  


### 多发性骨髓瘤相关 ICD 代码对照表

| 数据库存储格式 | 标准临床格式 | ICD 版本 | 英文名称 (long_title)                                      | 中文翻译                             |
|----------------|--------------|----------|------------------------------------------------------------|--------------------------------------|
| 20300          | 203.00       | ICD-9    | Multiple myeloma, without mention of having achieved remission | 多发性骨髓瘤，未提及是否达到缓解     |
| 20301          | 203.01       | ICD-9    | Multiple myeloma, in remission                             | 多发性骨髓瘤，缓解期                 |
| 20302          | 203.02       | ICD-9    | Multiple myeloma, in relapse                               | 多发性骨髓瘤，复发期                 |
| C90            | C90          | ICD-10   | Multiple myeloma and malignant plasma cell neoplasms       | 多发性骨髓瘤及恶性浆细胞肿瘤（总类目）|
| C900           | C90.0        | ICD-10   | Multiple myeloma                                           | 多发性骨髓瘤                         |
| C9000          | C90.00       | ICD-10   | Multiple myeloma not having achieved remission             | 多发性骨髓瘤，未达缓解               |
| C9001          | C90.01       | ICD-10   | Multiple myeloma in remission                              | 多发性骨髓瘤，缓解期                 |
| C9002          | C90.02       | ICD-10   | Multiple myeloma in relapse                                | 多发性骨髓瘤，复发期                 |

- 其中总类目 C90 不用于后续筛选：它是上级类目，还涵盖浆细胞白血病（C90.1）、髓外浆细胞瘤（C90.2）等其他恶性浆细胞肿瘤，并非多发性骨髓瘤的特异代码。

In [None]:
import pandas as pd
import os

# 1. Load the diagnosis table (DATA_PATH must already be defined by an earlier cell).
#    icd_code is forced to str: without it pandas infers the dtype and may parse
#    purely numeric ICD-9 codes as integers (dropping leading zeros or producing
#    a mixed int/str column), in which case the string comparison in .isin()
#    below would silently miss those rows.
diagnoses_path = os.path.join(DATA_PATH, 'hosp', 'diagnoses_icd.csv.gz')
df_diag = pd.read_csv(diagnoses_path, dtype={'icd_code': str})

# 2. Exact-match code lists, written in the database storage format (no decimal point).
#    The category-level code C90 is intentionally left out.
icd9_mm_codes = ['20300', '20301', '20302']
icd10_mm_codes = ['C900', 'C9000', 'C9001', 'C9002']

# 3. Exact filtering.
#    A code is only accepted together with its own ICD version, so a code string
#    from one revision can never match a row recorded under the other revision.
is_icd10_mm = (df_diag['icd_version'] == 10) & df_diag['icd_code'].isin(icd10_mm_codes)
is_icd9_mm = (df_diag['icd_version'] == 9) & df_diag['icd_code'].isin(icd9_mm_codes)
mm_patients = df_diag[is_icd10_mm | is_icd9_mm].copy()

# 4. Unique patient list (one entry per subject_id, regardless of how many
#    matching diagnosis rows that patient has).
mm_patient_ids = mm_patients['subject_id'].unique()

# Persist the matching diagnosis rows; the relative path resolves against the
# current working directory.
output_path = 'mm_patients.csv'
mm_patients.to_csv(output_path, index=False)
print(f"\n✓ 已保存至: {output_path}")

print(f"找到精准匹配的 MM 记录共: {len(mm_patients)} 条")
print(f"涉及独立患者人数: {len(mm_patient_ids)} 人")

# Preview the first few filtered rows.
print("\n前5行筛选数据预览：")
print(mm_patients.head())


✓ 已保存至: ./mimic-iv-2.2\mm_patients.csv
找到精准匹配的 MM 记录共: 2857 条
涉及独立患者人数: 699 人

前5行筛选数据预览：
      subject_id   hadm_id  seq_num icd_code  icd_version
2242    10003400  20214994        6    20300            9
2275    10003400  22390287        3    20300            9
2294    10003400  23559586        7    C9000           10
2327    10003400  26090619        2    20300            9
2338    10003400  26467376        5    20300            9
