In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import time
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# 抽取高血压患者subject_id、hadm_id

In [None]:
# Build the hypertension cohort: every patient with at least one diagnosis
# row whose icd_code is '4019'.
# NOTE(review): '4019' looks like the ICD-9 code for unspecified essential
# hypertension; patients coded only under ICD-10 are not selected — confirm
# that this is intended.
#
# icd_code is an identifier, not a number, so it is read explicitly as str.
# With pandas' default chunked type inference, a chunk containing only
# numeric-looking codes may be parsed as integers; the comparison against the
# string '4019' would then silently skip those rows.
diag_icd = pd.read_csv(
    r'E:\06.数据集备份\mimic-iv-2.2\hosp\diagnoses_icd.csv',
    encoding='utf-8',
    dtype={'icd_code': str},
)

# .copy() makes the selection an independent frame, so the following
# transformations never operate on a view of diag_icd.
diag_hyp = (
    diag_icd.loc[diag_icd['icd_code'] == '4019']
    .copy()
    .drop(columns=['seq_num', 'icd_code', 'icd_version'])
    # Keep one row per patient (the first matching admission in file order).
    .drop_duplicates(subset=['subject_id'], keep='first')
)

# Persist the cohort (subject_id + one hadm_id) for the next step.
diag_hyp.to_csv('diag_hyp.csv', index=False)
print('高血压患者有：', len(diag_hyp))
diag_hyp.head(5)

In [None]:
# Join the hypertension cohort with the patients table and keep one row per
# patient. After the column drop below only subject_id and the remaining
# patients columns survive (presumably just anchor_age — verify against the
# patients.csv schema); note that gender is removed here as well.
patients = pd.read_csv(r'E:\06.数据集备份\mimic-iv-2.2\hosp\patients.csv', encoding='utf-8')
diag_hyp = pd.read_csv(r'diag_hyp.csv', encoding='utf-8')  # cohort written by the previous step

mid_hyp = (
    diag_hyp
    .merge(patients, how='left', on='subject_id')
    .drop(columns=['hadm_id', 'anchor_year', 'anchor_year_group', 'gender', 'dod'])
    .drop_duplicates(subset='subject_id', keep='first')  # one row per subject_id
)

mid_hyp.to_csv('patients_mid.csv', index=False)  # written without the index column
print('高血压患者有：', len(mid_hyp))
mid_hyp.head(5)

# 高血压diagnoses_icd.csv

In [None]:
# Load the complete diagnoses table from the original MIMIC-IV export.
# icd_code is forced to str: the codes are identifiers, and letting pandas
# infer the dtype chunk by chunk can turn numeric-looking codes into integers,
# which drops leading zeros (e.g. '0389' -> 389) and corrupts the codes when
# the table is written back to CSV later.
diag = pd.read_csv(
    r'E:\06.数据集备份\mimic-iv-2.2\hosp\diagnoses_icd.csv',
    dtype={'icd_code': str},
)
diag

In [None]:
# Restrict the diagnoses table to the hypertension cohort: an inner join on
# subject_id keeps every diagnosis row of every cohort patient. anchor_age is
# only carried by mid_hyp and is removed again so the output has the same
# columns as the source table.
diag_hyp = (
    mid_hyp
    .merge(diag, on='subject_id')
    .drop(columns='anchor_age')
)
diag_hyp.to_csv(r'E:\06.数据集备份\mimic-iv-高血压\diagnoses_icd.csv', index=False)
print('高血压患者诊断有：', len(diag_hyp))
diag_hyp.head(5)

In [None]:
diag = pd.read_csv(r'E:\06.数据集备份\mimic-iv-高血压\diagnoses_icd.csv')
count = diag['icd_code'].value_counts()
count

# 高血压admissions.csv

In [None]:
# Load the complete admissions table from the original MIMIC-IV export
# and preview its first five rows.
adm = pd.read_csv(
    r'E:\06.数据集备份\mimic-iv-2.2\hosp\admissions.csv'
)
adm.head()

In [None]:
# Restrict the admissions table to the hypertension cohort: inner join on
# subject_id, then remove anchor_age (contributed by mid_hyp) so that only
# the admissions columns are exported.
adm_hyp = (
    mid_hyp
    .merge(adm, on='subject_id')
    .drop(columns='anchor_age')
)
adm_hyp.to_csv(r'E:\06.数据集备份\mimic-iv-高血压\admissions.csv', index=False)
print('高血压患者admission有：', len(adm_hyp))
adm_hyp.head(5)

# 高血压procedures_icd.csv

In [None]:
# Load the procedures table from the original MIMIC-IV export and keep only
# the rows coded with ICD version 9.
# icd_code is read as str because the codes are identifiers; numeric
# inference would strip leading zeros from zero-padded codes.
pro = pd.read_csv(
    r'E:\06.数据集备份\mimic-iv-2.2\hosp\procedures_icd.csv',
    dtype={'icd_code': str},
)
# Use the comparison as a row mask. Assigning the comparison itself to `pro`
# would replace the table with a boolean Series, and the later merge on
# subject_id would fail because that Series has no such column.
pro = pro.loc[pro['icd_version'] == 9]
pro.head(5)

In [None]:
# Restrict the procedures table to the hypertension cohort: inner join on
# subject_id, then remove anchor_age (contributed by mid_hyp) before export.
# Expects `pro` to be a table that has a subject_id column.
pro_hyp = (
    mid_hyp
    .merge(pro, on='subject_id')
    .drop(columns='anchor_age')
)
pro_hyp.to_csv(r'E:\06.数据集备份\mimic-iv-高血压\procedures_icd.csv', index=False)
print('高血压患者procedure有：', len(pro_hyp))
pro_hyp.head(5)

# 高血压prescriptions.csv

In [None]:
# Load the complete prescriptions table from the original MIMIC-IV export.
# The ndc column is read as a categorical rather than with an inferred dtype.
pre = pd.read_csv(
    r'E:\06.数据集备份\mimic-iv-2.2\hosp\prescriptions.csv',
    dtype={'ndc': 'category'},
)
pre.head()

In [None]:
# Restrict the prescriptions table to the hypertension cohort: inner join on
# subject_id, then remove anchor_age (contributed by mid_hyp) before export.
pre_hyp = (
    mid_hyp
    .merge(pre, on='subject_id')
    .drop(columns='anchor_age')
)
pre_hyp.to_csv(r'E:\06.数据集备份\mimic-iv-高血压\prescriptions_all.csv', index=False)
print('高血压患者prescription有：', len(pre_hyp))
pre_hyp.head(5)