The following cells attempt to repair events whose ICU stay ID is missing and remove the events whose missing information cannot be recovered.

In [1]:
import pandas as pd
import os
import numpy as np

from tqdm import tqdm

# Root directory that holds one sub-folder per subject (each with stays.csv / events.csv).
subjects_root_path = "data/root"

In [2]:
def is_subject_folder(x):
    """Return True when the entry name ``x`` is made up only of digit characters.

    Uses ``str.isdigit``, so an empty string yields False.
    """
    only_digits = str.isdigit(x)
    return only_digits
# Every entry found directly under the subjects root.
subdirectories = os.listdir(subjects_root_path)
# Keep only the entries whose names are purely numeric (the per-subject folders).
subjects = [entry for entry in subdirectories if is_subject_folder(entry)]
# Number of subject folders that will be processed.
print(len(subjects))

33798


### 只是小测试，看看 isnull().any() 输出是什么

In [3]:
# Disabled scratch experiment, kept inside a string literal so none of it runs.
# It was used to see what Series.isnull().any() and .isnull().sum() print for a
# list that contains None values. Because the string is the cell's only
# expression, the notebook simply echoes the text back as the cell output.
""" s = 3
#stays_df = pd.read_csv(os.path.join(subjects_root_path, str(s), 'stays.csv'), index_col=False,
                            #dtype={'HADM_ID': str, "ICUSTAY_ID": str})
#print(stays_df)
#stays_df['ICUSTAY_ID'].isnull().any()

list1 = [None, 1, 2, None, None]
isnull_df = pd.Series(list1)
print(isnull_df)
# 判断isnull_df里面是否有空值，只要有空值就为True
print(isnull_df.isnull().any())
print(isnull_df.isnull().sum()) """

' s = 3\n#stays_df = pd.read_csv(os.path.join(subjects_root_path, str(s), \'stays.csv\'), index_col=False,\n                            #dtype={\'HADM_ID\': str, "ICUSTAY_ID": str})\n#print(stays_df)\n#stays_df[\'ICUSTAY_ID\'].isnull().any()\n\nlist1 = [None, 1, 2, None, None]\nisnull_df = pd.Series(list1)\nprint(isnull_df)\n# 判断isnull_df里面是否有空值，只要有空值就为True\nprint(isnull_df.isnull().any())\nprint(isnull_df.isnull().sum()) '

In [4]:
# Running totals, accumulated over every subject's events.csv:
#   n_events                 - total number of events
#   empty_hadm               - HADM_ID is empty in events.csv; such events are excluded
#   no_hadm_in_stay          - HADM_ID does not appear in stays.csv; such events are excluded
#   no_icustay               - ICUSTAY_ID is empty in events.csv; we try to fix such events
#   recovered                - empty ICUSTAY_IDs recovered from stays.csv (given HADM_ID)
#   could_not_recover        - empty ICUSTAY_IDs that were not recovered; should stay zero
#   icustay_missing_in_stays - ICUSTAY_ID does not appear in stays.csv; such events are excluded
(
    n_events,
    empty_hadm,
    no_hadm_in_stay,
    no_icustay,
    recovered,
    could_not_recover,
    icustay_missing_in_stays,
) = (0,) * 7

# 使用stays.csv修复events.csv的ICUSTAY_ID的nan

In [5]:
# Start from a clean log: delete the file left behind by a previous run, if any.
stale_log_path = 'delete_nan_in_events_csv.logs'
if os.path.exists(stale_log_path):
    os.remove(stale_log_path)



In [6]:
import logging
# Send every record (DEBUG and above) to the log file, appending to it, with a
# "HH:MM:SS [LEVEL] message" layout.
logging.basicConfig(
    level=logging.DEBUG,
    filename='delete_nan_in_events_csv.logs',
    filemode='a',
    datefmt='%H:%M:%S',
    format='%(asctime)s [%(levelname)s] %(message)s',
)
# The root logger is used for all messages emitted by this notebook.
logger = logging.getLogger()


# Walk over every subject folder. For each one, validate stays.csv, then clean
# events.csv (drop events without a usable HADM_ID, fill missing ICUSTAY_IDs
# from stays.csv, drop events whose ICUSTAY_ID disagrees with stays.csv) and
# overwrite events.csv in place. Global counters are updated along the way.
num_subjects = len(subjects)
for subject in tqdm(subjects, total = num_subjects, desc='Iterating over subjects'):
    # Each row of stays.csv records one stay of this subject; the assertions
    # below require every HADM_ID and every ICUSTAY_ID to appear exactly once.
    log = f'当前处理的是{subject} 病人'
    logger.info(log)
    subject = str(subject)
    subject_dir = os.path.join(subjects_root_path, subject)
    events_path = os.path.join(subject_dir, 'events.csv')

    stays_df = pd.read_csv(os.path.join(subject_dir, 'stays.csv'), index_col=False)
    stays_df.columns = stays_df.columns.str.upper()

    # assert that there is no row with empty ICUSTAY_ID or HADM_ID
    assert not stays_df['ICUSTAY_ID'].isnull().any()
    assert not stays_df['HADM_ID'].isnull().any()

    # assert there are no repetitions of ICUSTAY_ID or HADM_ID
    # since admissions with multiple ICU stays were excluded
    assert stays_df['ICUSTAY_ID'].is_unique
    # If a patient had been moved between ICUs during one admission, the same
    # HADM_ID would map to several ICUSTAY_IDs and this check would fail.
    assert stays_df['HADM_ID'].is_unique

    events_df = pd.read_csv(events_path, index_col=False)
    events_df.columns = events_df.columns.str.upper()
    n_events += events_df.shape[0]

    nan_hadm = events_df['HADM_ID'].isnull().sum()
    empty_hadm += nan_hadm
    log = f'住院ID为空的记录个数为： {nan_hadm}'
    logger.info(log)

    # we drop all events for which HADM_ID is empty
    # TODO: maybe we can recover HADM_ID by looking at ICUSTAY_ID
    # .copy() gives an independent frame so the column assignment below does
    # not write into a slice of the frame that was read from disk.
    events_df = events_df.dropna(subset=['HADM_ID']).copy()
    # HADM_ID has no missing values left, so it can be cast to int directly to
    # match the HADM_ID column of stays.csv for the merge. ICUSTAY_ID may still
    # be missing here on purpose: it is repaired from stays.csv after the merge,
    # so it is left untouched (NaN marks "missing", no sentinel value needed).
    events_df['HADM_ID'] = events_df['HADM_ID'].astype(int)

    merged_df = events_df.merge(stays_df, on=['HADM_ID'], how='left', suffixes=('', '_r'), indicator=True)

    # we drop all events for which HADM_ID is not listed in stays.csv
    # since there is no way to know the targets of that stay (for example mortality)
    no_hadm_in_stay += (merged_df['_merge'] == 'left_only').sum()
    merged_df = merged_df[merged_df['_merge'] == 'both'].copy()

    # if ICUSTAY_ID is empty in events.csv, we try to recover it
    # we exclude all events for which we could not recover ICUSTAY_ID
    cur_no_icustay = merged_df['ICUSTAY_ID'].isnull().sum()
    log = f'当前events.csv里面还有 {cur_no_icustay} 条ICUSTAY_ID为NAN的记录'
    logger.info(log)
    no_icustay += cur_no_icustay

    # Fill each missing ICUSTAY_ID with the ICUSTAY_ID that stays.csv lists for
    # the same HADM_ID (column ICUSTAY_ID_r produced by the merge).
    merged_df['ICUSTAY_ID'] = merged_df['ICUSTAY_ID'].fillna(merged_df['ICUSTAY_ID_r'])
    cur_not_recover = merged_df['ICUSTAY_ID'].isnull().sum()
    cur_cover = cur_no_icustay - cur_not_recover
    recovered += cur_cover
    could_not_recover += cur_not_recover
    log = f'修复了 {cur_cover} 条ICUSTAY_ID为NAN的记录, 还剩下{cur_not_recover}条无法修复， 这些记录将被删除'
    logger.info(log)
    merged_df = merged_df.dropna(subset=['ICUSTAY_ID'])

    # now we take a look at the case when ICUSTAY_ID is present in events.csv, but not in stays.csv
    # this means that ICUSTAY_ID in events.csv is not the same as that of stays.csv for the same HADM_ID
    # we drop all such events
    icustay_matches = merged_df['ICUSTAY_ID'] == merged_df['ICUSTAY_ID_r']
    icustay_missing_in_stays += (~icustay_matches).sum()
    merged_df = merged_df[icustay_matches]

    to_write = merged_df[['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'CHARTTIME', 'ITEMID', 'VALUE', 'VALUEUOM']].copy()
    # ICUSTAY_ID is float whenever the subject had missing IDs; cast back to int
    # so every events.csv is written with integer IDs (no trailing ".0").
    to_write['ICUSTAY_ID'] = to_write['ICUSTAY_ID'].astype(int)
    to_write.to_csv(events_path, index=False)
print('n_events: {}'.format(n_events))
print('empty_hadm: {}'.format(empty_hadm))
print('no_hadm_in_stay: {}'.format(no_hadm_in_stay))
print('no_icustay: {}'.format(no_icustay))
print('recovered: {}'.format(recovered))
print('could_not_recover: {}'.format(could_not_recover))
print('icustay_missing_in_stays: {}'.format(icustay_missing_in_stays))

# Record the same totals in the log file as well.
log = 'n_events: {}; \n empty_hadm: {}; \n no_hadm_in_stay: {} \n no_icustay: {} \n recovered: {} \n could_not_recover: {} \n icustay_missing_in_stays: {} '.format(n_events, empty_hadm, no_hadm_in_stay, no_icustay, recovered, could_not_recover, icustay_missing_in_stays)
logger.info(log)

Iterating over subjects: 100%|██████████| 33798/33798 [32:07<00:00, 17.54it/s]n_events: 252617853
empty_hadm: 5139379
no_hadm_in_stay: 31810107
no_icustay: 15556328
recovered: 15556328
could_not_recover: 0
icustay_missing_in_stays: 7096130



In [7]:
#a = merged_df.loc[mask_non, 'ICUSTAY_ID'] 

In [8]:
#prirged_df.loc[:, 'ICUSTAY_ID'])

In [9]:
# Disabled draft kept inside a string literal, so none of it runs. It refers to
# labevents, outputevents and obs_header, none of which are defined in the
# cells shown here; the notebook only echoes the text back as the cell output.
'''
if 'ICUSTAY_ID' not in labevents:
    labevents['ICUSTAY_ID'] = ''
for s in subjects:
    # 写入lab和out信息
    print(f'当前处理的病人ID是: {s}')
    mask_lab = labevents['SUBJECT_ID'] == s
    mask_out = outputevents['SUBJECT_ID'] == s
    lab = labevents[mask_lab][obs_header]
    print('*************lab 得到的信息***************')
    print(lab)
    out = outputevents[mask_out][obs_header]
    print('**************out 得到的信息***************')
    print(out)
 '''   

"\nif 'ICUSTAY_ID' not in labevents:\n    labevents['ICUSTAY_ID'] = ''\nfor s in subjects:\n    # 写入lab和out信息\n    print(f'当前处理的病人ID是: {s}')\n    mask_lab = labevents['SUBJECT_ID'] == s\n    mask_out = outputevents['SUBJECT_ID'] == s\n    lab = labevents[mask_lab][obs_header]\n    print('*************lab 得到的信息***************')\n    print(lab)\n    out = outputevents[mask_out][obs_header]\n    print('**************out 得到的信息***************')\n    print(out)\n "