In [3]:
import os
import pandas as pd
import numpy as np
import csv

from tqdm import tqdm
from pathlib import Path


In [6]:
mimic3_path = 'MIMIC-III Clinical Database 1.4'
output_path = 'data/root'
event_tables=['CHARTEVENTS', 'LABEVENTS', 'OUTPUTEVENTS']
#items_table = 'D_ITEMS'
#l_items_table = 'D_LABITEMS'

In [7]:
try:
    os.makedirs(output_path)
except:
    pass

## 处理患者的ICUSTAY、PATIENT、ADMISSION表，根据住院期间是否转换过ICU病房、年龄是否大于18等过滤一部分信息；
## 计算患者在ICU期间以及住院期间是否死亡

从数据库中取出来的数据，一般是object，先转成datetime格式

dt.date: the date part of Timestamps without timezone information

In [8]:
patients = pd.read_csv(os.path.join(mimic3_path, 'PATIENTS.csv'))
# DOB is the date of birth of the given patient。 年龄大于89的这个数据会变化
# DOD is the date of death for the given patient
patients = patients[['SUBJECT_ID', 'GENDER', 'DOB', 'DOD']]
print(patients.dtypes)
patients.DOB = pd.to_datetime(patients.DOB).dt.date
patients.DOD = pd.to_datetime(patients.DOD).dt.date
print(patients.head())

SUBJECT_ID     int64
GENDER        object
DOB           object
DOD           object
dtype: object
   SUBJECT_ID GENDER         DOB         DOD
0         249      F  2075-03-13         NaT
1         250      F  2164-12-27  2188-11-22
2         251      M  2090-03-15         NaT
3         252      M  2078-03-06         NaT
4         253      F  2089-11-26         NaT


In [9]:
admits = pd.read_csv(os.path.join(mimic3_path, 'ADMISSIONS.csv'))
admits = admits[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'ETHNICITY', 'DIAGNOSIS']]
#print(admits)
admits.ADMITTIME = pd.to_datetime(admits.ADMITTIME).dt.date
admits.DISCHTIME = pd.to_datetime(admits.DISCHTIME).dt.date
admits.DEATHTIME = pd.to_datetime(admits.DEATHTIME).dt.date
print(admits.head())

   SUBJECT_ID  HADM_ID   ADMITTIME   DISCHTIME DEATHTIME ETHNICITY  \
0          22   165315  2196-04-09  2196-04-10       NaT     WHITE   
1          23   152223  2153-09-03  2153-09-08       NaT     WHITE   
2          23   124321  2157-10-18  2157-10-25       NaT     WHITE   
3          24   161859  2139-06-06  2139-06-09       NaT     WHITE   
4          25   129635  2160-11-02  2160-11-05       NaT     WHITE   

                                           DIAGNOSIS  
0                            BENZODIAZEPINE OVERDOSE  
1  CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...  
2                                         BRAIN MASS  
3                     INTERIOR MYOCARDIAL INFARCTION  
4                            ACUTE CORONARY SYNDROME  


In [10]:
stays = pd.read_csv(os.path.join(mimic3_path, 'ICUSTAYS.csv'))
stays.INTIME = pd.to_datetime(stays.INTIME).dt.date
stays.OUTTIME = pd.to_datetime(stays.OUTTIME).dt.date
print(stays.head())

   ROW_ID  SUBJECT_ID  HADM_ID  ICUSTAY_ID DBSOURCE FIRST_CAREUNIT  \
0     365         268   110404      280836  carevue           MICU   
1     366         269   106296      206613  carevue           MICU   
2     367         270   188028      220345  carevue            CCU   
3     368         271   173727      249196  carevue           MICU   
4     369         272   164716      210407  carevue            CCU   

  LAST_CAREUNIT  FIRST_WARDID  LAST_WARDID      INTIME     OUTTIME     LOS  
0          MICU            52           52  2198-02-14  2198-02-18  3.2490  
1          MICU            52           52  2170-11-05  2170-11-08  3.2788  
2           CCU            57           57  2128-06-24  2128-06-27  2.8939  
3          SICU            52           23  2120-08-07  2120-08-10  2.0600  
4           CCU            57           57  2186-12-25  2186-12-27  1.6202  


### 统计最原始的数据个数

In [11]:
print('START:\n\tICUSTAY_IDs: {}\n\tHADM_IDs: {}\n\tSUBJECT_IDs: {}'.format(stays.ICUSTAY_ID.unique().shape[0],
          stays.HADM_ID.unique().shape[0], stays.SUBJECT_ID.unique().shape[0]))


START:
	ICUSTAY_IDs: 61532
	HADM_IDs: 57786
	SUBJECT_IDs: 46476


### remove icustays  with  transfers
去掉在ICU内发生过病房转移（从一个ICU病房转移到另一个ICU病房 或 ICU护理类型发生过变化的患者

这里的代码逻辑是只有没有发生过病房转移且ICU护理类型没有发生过变化的患者才可以保留下来

In [12]:
mask1 = stays.FIRST_WARDID == stays.LAST_WARDID
mask2 = stays.FIRST_CAREUNIT == stays.LAST_CAREUNIT
""" print(mask1)
print(mask2)
print(mask1 & mask2) """

' print(mask1)\nprint(mask2)\nprint(mask1 & mask2) '

In [51]:
s = pd.Series([1, 2, 3])
s1 = pd.Series([True, False, False])
s2 = pd.Series([True, False, True])
print(s1 & s2)
print(s[s1 & s2])
print(s[s1 | s2])

0     True
1    False
2    False
dtype: bool
0    1
dtype: int64
0    1
2    3
dtype: int64


In [13]:
stay_no_transfer = stays[(stays.FIRST_WARDID == stays.LAST_WARDID) & (stays.FIRST_CAREUNIT == stays.LAST_CAREUNIT)]
icustays = stay_no_transfer[['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'LAST_CAREUNIT', 'DBSOURCE', 'INTIME', 'OUTTIME', 'LOS']]


In [14]:
print('REMOVE ICU TRANSFERS:\n\tICUSTAY_IDs: {}\n\tHADM_IDs: {}\n\tSUBJECT_IDs: {}'.format(icustays.ICUSTAY_ID.unique().shape[0], icustays.HADM_ID.unique().shape[0], icustays.SUBJECT_ID.unique().shape[0]))

REMOVE ICU TRANSFERS:
	ICUSTAY_IDs: 55830
	HADM_IDs: 52834
	SUBJECT_IDs: 43277


### 合并表的操作

1 合并stays和admits

保留有入住icu记录的住院患者信息

2 合并stays和patients


In [15]:
icustays = icustays.merge(admits, how='inner', left_on=['SUBJECT_ID', 'HADM_ID'], right_on=['SUBJECT_ID', 'HADM_ID'])
print(icustays.columns)
print(icustays.head())


Index(['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'LAST_CAREUNIT', 'DBSOURCE',
       'INTIME', 'OUTTIME', 'LOS', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
       'ETHNICITY', 'DIAGNOSIS'],
      dtype='object')
   SUBJECT_ID  HADM_ID  ICUSTAY_ID LAST_CAREUNIT DBSOURCE      INTIME  \
0         268   110404      280836          MICU  carevue  2198-02-14   
1         269   106296      206613          MICU  carevue  2170-11-05   
2         270   188028      220345           CCU  carevue  2128-06-24   
3         272   164716      210407           CCU  carevue  2186-12-25   
4         273   158689      241507          MICU  carevue  2141-04-19   

      OUTTIME     LOS   ADMITTIME   DISCHTIME   DEATHTIME  \
0  2198-02-18  3.2490  2198-02-11  2198-02-18  2198-02-18   
1  2170-11-08  3.2788  2170-11-05  2170-11-27         NaT   
2  2128-06-27  2.8939  2128-06-23  2128-06-27         NaT   
3  2186-12-27  1.6202  2186-12-25  2187-01-02         NaT   
4  2141-04-20  1.4862  2141-04-19  2141-04-20      

In [16]:
icustays = icustays.merge(patients, how='inner', left_on=['SUBJECT_ID'], right_on=['SUBJECT_ID'])
print(icustays.columns)
print(icustays.head())

Index(['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'LAST_CAREUNIT', 'DBSOURCE',
       'INTIME', 'OUTTIME', 'LOS', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
       'ETHNICITY', 'DIAGNOSIS', 'GENDER', 'DOB', 'DOD'],
      dtype='object')
   SUBJECT_ID  HADM_ID  ICUSTAY_ID LAST_CAREUNIT DBSOURCE      INTIME  \
0         268   110404      280836          MICU  carevue  2198-02-14   
1         269   106296      206613          MICU  carevue  2170-11-05   
2         270   188028      220345           CCU  carevue  2128-06-24   
3         272   164716      210407           CCU  carevue  2186-12-25   
4         273   158689      241507          MICU  carevue  2141-04-19   

      OUTTIME     LOS   ADMITTIME   DISCHTIME   DEATHTIME  \
0  2198-02-18  3.2490  2198-02-11  2198-02-18  2198-02-18   
1  2170-11-08  3.2788  2170-11-05  2170-11-27         NaT   
2  2128-06-27  2.8939  2128-06-23  2128-06-27         NaT   
3  2186-12-27  1.6202  2186-12-25  2187-01-02         NaT   
4  2141-04-20  1.4862  2141

### 使用HADM进行分组(hadm相同的计为一组)
REMOVE MULTIPLE STAYS PER ADMIT:\n\tICUSTAY_IDs

也是为了移除住过不同ICU病房的病例

In [17]:
# groupby('HADM_ID').count()是把hadm_id当做index，计算其余各列在该index下有多少个取值
to_keep = icustays.groupby('HADM_ID').count()
to_keep = to_keep[['ICUSTAY_ID']]
# 释放index，即把hadm_id重新放到列位置
to_keep = to_keep.reset_index()
mask_min = to_keep.ICUSTAY_ID >= 1
mask_max = to_keep.ICUSTAY_ID <= 1
# to_keep保存的是住院期间只去过一种类型的ICU
to_keep = to_keep[mask_min & mask_max][['HADM_ID']]

In [18]:
icustays = icustays.merge(to_keep, how='inner', on=['HADM_ID'])

In [19]:
print('REMOVE MULTIPLE STAYS PER ADMIT:\n\tICUSTAY_IDs: {}\n\tHADM_IDs: {}\n\tSUBJECT_IDs: {}'.format(icustays.ICUSTAY_ID.unique().shape[0], icustays.HADM_ID.unique().shape[0], icustays.SUBJECT_ID.unique().shape[0]))

REMOVE MULTIPLE STAYS PER ADMIT:
	ICUSTAY_IDs: 50186
	HADM_IDs: 50186
	SUBJECT_IDs: 41587


In [20]:
icustays

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,LAST_CAREUNIT,DBSOURCE,INTIME,OUTTIME,LOS,ADMITTIME,DISCHTIME,DEATHTIME,ETHNICITY,DIAGNOSIS,GENDER,DOB,DOD
0,268,110404,280836,MICU,carevue,2198-02-14,2198-02-18,3.2490,2198-02-11,2198-02-18,2198-02-18,HISPANIC OR LATINO,DYSPNEA,F,2132-02-21,2198-02-18
1,269,106296,206613,MICU,carevue,2170-11-05,2170-11-08,3.2788,2170-11-05,2170-11-27,NaT,WHITE,SEPSIS;PILONIDAL ABSCESS,M,2130-09-30,NaT
2,270,188028,220345,CCU,carevue,2128-06-24,2128-06-27,2.8939,2128-06-23,2128-06-27,NaT,UNKNOWN/NOT SPECIFIED,CAROTID STENOSIS\CAROTID ANGIOGRAM AND STENT,M,2048-05-26,NaT
3,272,164716,210407,CCU,carevue,2186-12-25,2186-12-27,1.6202,2186-12-25,2187-01-02,NaT,WHITE,PULMONARY EMBOLIS,M,2119-11-21,NaT
4,273,158689,241507,MICU,carevue,2141-04-19,2141-04-20,1.4862,2141-04-19,2141-04-20,NaT,BLACK/AFRICAN AMERICAN,POLYSUBSTANCE OVERDOSE,M,2107-08-10,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50181,94944,143774,201233,CSRU,metavision,2104-04-15,2104-04-17,2.1894,2104-04-11,2104-04-20,NaT,WHITE,CHEST PAIN;RULE OUT CORONARY ARTERY DISEASE\LE...,M,2027-03-02,NaT
50182,94950,123750,283653,CCU,metavision,2155-12-08,2155-12-10,2.4942,2155-12-07,2155-12-12,NaT,WHITE,SYNCOPE;TELEMETRY,F,1855-12-07,NaT
50183,94953,196881,241585,SICU,metavision,2160-03-03,2160-03-04,0.9259,2160-03-03,2160-03-04,NaT,WHITE,CEREBRAL ANEURYSM/SDA,F,2107-01-29,2162-01-05
50184,94954,118475,202802,CSRU,metavision,2183-03-25,2183-03-27,2.3346,2183-03-25,2183-04-01,NaT,PATIENT DECLINED TO ANSWER,AORTIC STENOSIS\AORTIC VALVE REPLACEMENT /SDA,F,2115-05-23,NaT


In [21]:
icustays.apply(lambda e: (e['INTIME'] - e['DOB']).days/365, axis=1)

0         66.027397
1         40.126027
2         80.131507
3         67.139726
4         33.715068
            ...    
50181     77.172603
50182    300.202740
50183     53.128767
50184     67.884932
50185     34.241096
Length: 50186, dtype: float64

### 计算病人入住ICU时的年龄

In [22]:
icustays['age'] = icustays.apply(lambda e: (e['INTIME'] - e['DOB']).days/365, axis=1)

#### 筛选出年龄大于等于18岁的病人

In [23]:
import numpy as np
icustays = icustays[(icustays.age >= 18) & (icustays.age <= np.inf)]

In [24]:
print('REMOVE PATIENTS AGE < 18:\n\tICUSTAY_IDs: {}\n\tHADM_IDs: {}\n\tSUBJECT_IDs: {}'.format(icustays.ICUSTAY_ID.unique().shape[0], icustays.HADM_ID.unique().shape[0], icustays.SUBJECT_ID.unique().shape[0]))


REMOVE PATIENTS AGE < 18:
	ICUSTAY_IDs: 42276
	HADM_IDs: 42276
	SUBJECT_IDs: 33798


## 添加在住院期间和在ICU期间内死亡的标志

In [25]:
def add_inhospital_mortality_to_icustays(stays):
    """   DOD是从patient表得到的患者死亡日期； DEATHTIME是从admission表得到的患者死亡日期；
        ADMITTIME和DISCHTIME分别为入院和出院时间；
        如果患者的任何一个死亡日期在入院期间内，视为患者在住院期间死亡
    """
    mortality_DOD = stays.DOD.notnull() & ((stays.ADMITTIME <= stays.DOD) & (stays.DISCHTIME >= stays.DOD))
    mortality_DEA = stays.DEATHTIME.notnull() & ((stays.ADMITTIME <= stays.DEATHTIME) & (stays.DISCHTIME >= stays.DEATHTIME))
    mortality_mask = mortality_DOD | mortality_DEA
    # 得到患者在住院期间是否死亡的标志
    stays['MORTALITY'] = mortality_mask.astype(int)
    stays['MORTALITY_INHOSPITAL'] = stays['MORTALITY']
    return stays


In [26]:
def add_inunit_mortality_to_icustays(stays):
    mortality_DOD = stays.DOD.notnull() & ((stays.INTIME <= stays.DOD) & (stays.OUTTIME >= stays.DOD))
    mortality_DEA = stays.DEATHTIME.notnull() & ((stays.INTIME <= stays.DEATHTIME) & (stays.OUTTIME >= stays.DEATHTIME))
    mortality_mask = mortality_DOD | mortality_DEA
    stays['MORTALITY_INUNIT'] = mortality_mask.astype(int)
    return stays

In [27]:
icustays = add_inhospital_mortality_to_icustays(icustays)
icustays = add_inunit_mortality_to_icustays(icustays)

### 把处理好的icustay、patient、admission表存入文件

In [175]:
icustays.to_csv(os.path.join(output_path, 'all_stays.csv'), index=False)

##  读取患者的ICD-9 diagnoses
D_ICD_DIAGNOSES.csv: Definition table for ICD diagnoses(诊断简介)

DIAGNOSES_ICD.csv: Contains ICD diagnoses for patients, most notably ICD-9 diagnoses.(包含患者诊断信息)

In [29]:

codes = pd.read_csv(os.path.join(mimic3_path, 'D_ICD_DIAGNOSES.csv'))
print(codes)


       ROW_ID ICD9_CODE               SHORT_TITLE  \
0         174     01166     TB pneumonia-oth test   
1         175     01170    TB pneumothorax-unspec   
2         176     01171   TB pneumothorax-no exam   
3         177     01172  TB pneumothorx-exam unkn   
4         178     01173  TB pneumothorax-micro dx   
...       ...       ...                       ...   
14562   14432     V7399     Scrn unspcf viral dis   
14563   14433      V740     Screening for cholera   
14564   14434      V741    Screening-pulmonary TB   
14565   14435      V742     Screening for leprosy   
14566   14436      V743  Screening for diphtheria   

                                              LONG_TITLE  
0      Tuberculous pneumonia [any form], tubercle bac...  
1                  Tuberculous pneumothorax, unspecified  
2      Tuberculous pneumothorax, bacteriological or h...  
3      Tuberculous pneumothorax, bacteriological or h...  
4      Tuberculous pneumothorax, tubercle bacilli fou...  
...      

In [63]:
diagnoses = pd.read_csv(os.path.join(mimic3_path, 'DIAGNOSES_ICD.csv'))
diagnoses

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,40301
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,58281
3,1300,109,172335,4.0,5855
4,1301,109,172335,5.0,4254
...,...,...,...,...,...
651042,639798,97503,188195,2.0,20280
651043,639799,97503,188195,3.0,V5869
651044,639800,97503,188195,4.0,V1279
651045,639801,97503,188195,5.0,5275


In [64]:
diagnoses = diagnoses.merge(codes[['ICD9_CODE', 'SHORT_TITLE', 'LONG_TITLE']], how='inner', left_on='ICD9_CODE', right_on='ICD9_CODE')

In [65]:
diagnoses

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,1297,109,172335,1.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
1,1311,109,173633,1.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
2,1019,109,131345,1.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
3,1039,109,131376,1.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
4,1059,109,135923,1.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
...,...,...,...,...,...,...,...
598833,651009,99991,151118,13.0,5644,Postop GI funct dis NEC,Other postoperative functional disorders
598834,638699,97228,128194,8.0,9152,Blister finger,"Blister of finger(s), without mention of infec..."
598835,650539,99873,143544,12.0,2982,Reactive confusion,Reactive confusion
598836,631728,95806,167169,15.0,6940,Dermatitis herpetiformis,Dermatitis herpetiformis


In [66]:
diagnoses.dtypes

ROW_ID           int64
SUBJECT_ID       int64
HADM_ID          int64
SEQ_NUM        float64
ICD9_CODE       object
SHORT_TITLE     object
LONG_TITLE      object
dtype: object

In [67]:
diagnoses[['SUBJECT_ID', 'HADM_ID', 'SEQ_NUM']] = diagnoses[['SUBJECT_ID', 'HADM_ID', 'SEQ_NUM']].astype(int)

In [68]:
diagnoses.dtypes

ROW_ID          int64
SUBJECT_ID      int32
HADM_ID         int32
SEQ_NUM         int32
ICD9_CODE      object
SHORT_TITLE    object
LONG_TITLE     object
dtype: object

#### 把患者的icustay_id加入到diagnosis中

此时diagnosis包含患者入院ID：HADM_ID、患者ID：SSUBJECT_ID、ICD9_CODE、SHORT_TITLE、LONG\此时diagnosis包含患者入院ID：HADM_ID、患者ID：SSUBJECT_ID、ICD9_CODE、SHORT_TITLE、LONG_TITLE、ICUSTAY_ID

In [69]:
diagnoses = diagnoses.merge(icustays[['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID']].drop_duplicates(), how='inner',
                            on=['SUBJECT_ID', 'HADM_ID'])
print(diagnoses)

        ROW_ID  SUBJECT_ID  HADM_ID  SEQ_NUM ICD9_CODE  \
0         1297         109   172335        1     40301   
1         1298         109   172335        2       486   
2         1299         109   172335        3     58281   
3         1300         109   172335        4      5855   
4         1301         109   172335        5      4254   
...        ...         ...      ...      ...       ...   
477953  205925       18499   125788        1      1922   
477954  348882       30377   104237        1     80336   
477955  348883       30377   104237        2     E8852   
477956  290675       25951   115991        1      1885   
477957  523154       71275   157952        1     99561   

                     SHORT_TITLE  \
0         Mal hyp kid w cr kid V   
1        Pneumonia, organism NOS   
2       Chr nephritis in oth dis   
3       Chron kidney dis stage V   
4        Prim cardiomyopathy NEC   
...                          ...   
477953       Mal neo spinal cord   
477954  Cl skul

In [205]:
diagnoses.to_csv(os.path.join(output_path, 'all_diagnoses.csv'), index=False)

### 去掉ICD-9中信息完全一样的记录（一个也没去掉）

In [70]:
codes = codes.drop_duplicates()

In [75]:
ICD_STAY_COUNT = diagnoses.groupby('ICD9_CODE')['ICUSTAY_ID'].count()
ICD_STAY_COUNT = ICD_STAY_COUNT.reset_index()
ICD_STAY_COUNT = ICD_STAY_COUNT.rename(columns={'ICUSTAY_ID':'COUNT'})
print(ICD_STAY_COUNT)

     ICD9_CODE  COUNT
0         0030      2
1         0038      1
2         0039      1
3         0041      3
4         0048      1
...        ...    ...
6164     V8821      3
6165     V9010      1
6166     V9039      1
6167     V9081      3
6168     V9089      1

[6169 rows x 2 columns]


In [74]:
codes = codes.merge(ICD_STAY_COUNT, how='left', on=['ICD9_CODE'])

In [76]:
mask_c = codes.COUNT.notnull()
codes = codes[mask_c]

In [43]:
codes.to_csv(os.path.join(output_path, 'diagnosis_counts.csv'), index =False)

In [77]:
codes.sort_values('COUNT', ascending=False)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE,COUNT_x,COUNT_y,COUNT_x.1,COUNT_y.1,COUNT
1760,4304,4019,Hypertension NOS,Unspecified essential hypertension,17343.0,17343.0,17343.0,17343,17343
1901,4473,4280,CHF NOS,"Congestive heart failure, unspecified",10601.0,10601.0,10601.0,10601,10601
1814,4374,41401,Crnry athrscl natve vssl,Coronary atherosclerosis of native coronary ar...,10345.0,10345.0,10345.0,10345,10345
1890,4462,42731,Atrial fibrillation,Atrial fibrillation,10313.0,10313.0,10313.0,10313,10313
421,1591,25000,DMII wo cmp nt st uncntr,Diabetes mellitus without mention of complicat...,7486.0,7486.0,7486.0,7486,7486
...,...,...,...,...,...,...,...,...,...
4750,11482,E8156,Mv coll w obj-ped cycl,Other motor vehicle traffic accident involving...,1.0,1.0,1.0,1,1
2496,5602,61800,Vaginal wall prolpse NOS,Unspecified prolapse of vaginal walls,1.0,1.0,1.0,1,1
2498,5606,61804,Rectocele,Rectocele,1.0,1.0,1.0,1,1
2500,5610,6182,Uterovag prolaps-incompl,"Uterovaginal prolapse, incomplete",1.0,1.0,1.0,1,1


In [269]:
os.path.dirname("__file__")

''

# 25种表型识别，为第四个任务服务

In [45]:
phenotype_definitions = os.path.join(os.path.dirname("__file__"), '../resources/hcup_ccs_2015_definitions.yaml')

import yaml
definitions = yaml.load(open(phenotype_definitions, 'r'))


In [78]:
def_map = {}
""" 
def_map是建立ICD9_CODE 和  definitions的映射。
由ICD9_CODE 找 与其对应的definition和'use_in_benchmark'（True or Flase）
"""
for dx in definitions:
    for code in definitions[dx]['codes']:
        def_map[code] = (dx, definitions[dx]['use_in_benchmark'])

diagnoses['HCUP_CCS_2015'] = diagnoses.ICD9_CODE.apply(lambda c: def_map[c][0] if c in def_map else None)
diagnoses['USE_IN_BENCHMARK'] = diagnoses.ICD9_CODE.apply(lambda c: int(def_map[c][1]) if c in def_map else None)

In [79]:
diagnoses

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE,ICUSTAY_ID,HCUP_CCS_2015,USE_IN_BENCHMARK
0,1297,109,172335,1,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant...",262652,Hypertension with complications and secondary ...,1
1,1298,109,172335,2,486,"Pneumonia, organism NOS","Pneumonia, organism unspecified",262652,Pneumonia (except that caused by tuberculosis ...,1
2,1299,109,172335,3,58281,Chr nephritis in oth dis,Chronic glomerulonephritis in diseases classif...,262652,Nephritis; nephrosis; renal sclerosis,0
3,1300,109,172335,4,5855,Chron kidney dis stage V,"Chronic kidney disease, Stage V",262652,Chronic kidney disease,1
4,1301,109,172335,5,4254,Prim cardiomyopathy NEC,Other primary cardiomyopathies,262652,Peri-; endo-; and myocarditis; cardiomyopathy ...,0
...,...,...,...,...,...,...,...,...,...,...
477953,205925,18499,125788,1,1922,Mal neo spinal cord,Malignant neoplasm of spinal cord,217967,Cancer of brain and nervous system,0
477954,348882,30377,104237,1,80336,Cl skull fx NEC-coma NOS,Other closed skull fracture with other and uns...,276344,Intracranial injury,0
477955,348883,30377,104237,2,E8852,Fall from skateboard,Fall from skateboard,276344,Fall,0
477956,290675,25951,115991,1,1885,Mal neo bladder neck,Malignant neoplasm of bladder neck,202859,Cancer of bladder,0


### 25种急性表型分类

In [80]:
phenotypes = diagnoses
phenotypes = phenotypes[['ICUSTAY_ID', 'HCUP_CCS_2015']][phenotypes.USE_IN_BENCHMARK > 0].drop_duplicates()
phenotypes['VALUE'] = 1
print(phenotypes)

        ICUSTAY_ID                                      HCUP_CCS_2015  VALUE
0           262652  Hypertension with complications and secondary ...      1
1           262652  Pneumonia (except that caused by tuberculosis ...      1
3           262652                             Chronic kidney disease      1
5           262652                    Fluid and electrolyte disorders      1
9           262652  Complications of surgical procedures or medica...      1
...            ...                                                ...    ...
477837      250923                    Other lower respiratory disease      1
477866      217928                    Other upper respiratory disease      1
477867      298882                    Other upper respiratory disease      1
477886      287794                    Other lower respiratory disease      1
477911      217456  Pneumonia (except that caused by tuberculosis ...      1

[174081 rows x 3 columns]


In [81]:
phenotypes = phenotypes.pivot(index='ICUSTAY_ID', columns='HCUP_CCS_2015', values='VALUE')

In [82]:
phenotypes

HCUP_CCS_2015,Acute and unspecified renal failure,Acute cerebrovascular disease,Acute myocardial infarction,Cardiac dysrhythmias,Chronic kidney disease,Chronic obstructive pulmonary disease and bronchiectasis,Complications of surgical procedures or medical care,Conduction disorders,Congestive heart failure; nonhypertensive,Coronary atherosclerosis and other heart disease,...,Gastrointestinal hemorrhage,Hypertension with complications and secondary hypertension,Other liver diseases,Other lower respiratory disease,Other upper respiratory disease,Pleurisy; pneumothorax; pulmonary collapse,Pneumonia (except that caused by tuberculosis or sexually transmitted disease),Respiratory failure; insufficiency; arrest (adult),Septicemia (except in labor),Shock
ICUSTAY_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200001,,,,1.0,1.0,,,,1.0,,...,,1.0,,,,,,,,
200003,,,,,,,,,,,...,,,,,,,,,1.0,
200006,,,,,,,1.0,,,,...,,,,,,,,1.0,,
200007,,,1.0,1.0,,,1.0,,,1.0,...,,,,,,,,,,
200009,,,,,,,1.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299984,,,,1.0,,,1.0,,1.0,,...,,,,,,,1.0,,,
299987,,,,,,1.0,,,1.0,1.0,...,,,,,,,,,,
299992,,,,,,,,,,,...,,,,,,,,1.0,,
299995,,,,,,,,,,,...,,,,,1.0,,,,,


In [83]:
phenotypes = phenotypes.sort_index(ascending=True)

In [84]:
phenotypes

HCUP_CCS_2015,Acute and unspecified renal failure,Acute cerebrovascular disease,Acute myocardial infarction,Cardiac dysrhythmias,Chronic kidney disease,Chronic obstructive pulmonary disease and bronchiectasis,Complications of surgical procedures or medical care,Conduction disorders,Congestive heart failure; nonhypertensive,Coronary atherosclerosis and other heart disease,...,Gastrointestinal hemorrhage,Hypertension with complications and secondary hypertension,Other liver diseases,Other lower respiratory disease,Other upper respiratory disease,Pleurisy; pneumothorax; pulmonary collapse,Pneumonia (except that caused by tuberculosis or sexually transmitted disease),Respiratory failure; insufficiency; arrest (adult),Septicemia (except in labor),Shock
ICUSTAY_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200001,,,,1.0,1.0,,,,1.0,,...,,1.0,,,,,,,,
200003,,,,,,,,,,,...,,,,,,,,,1.0,
200006,,,,,,,1.0,,,,...,,,,,,,,1.0,,
200007,,,1.0,1.0,,,1.0,,,1.0,...,,,,,,,,,,
200009,,,,,,,1.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299984,,,,1.0,,,1.0,,1.0,,...,,,,,,,1.0,,,
299987,,,,,,1.0,,,1.0,1.0,...,,,,,,,,,,
299992,,,,,,,,,,,...,,,,,,,,1.0,,
299995,,,,,,,,,,,...,,,,,1.0,,,,,


In [85]:
phenotypes_ = phenotypes.fillna(0).astype(int).sort_index(axis=0).sort_index(axis=1)

In [76]:
phenotypes_

HCUP_CCS_2015,Acute and unspecified renal failure,Acute cerebrovascular disease,Acute myocardial infarction,Cardiac dysrhythmias,Chronic kidney disease,Chronic obstructive pulmonary disease and bronchiectasis,Complications of surgical procedures or medical care,Conduction disorders,Congestive heart failure; nonhypertensive,Coronary atherosclerosis and other heart disease,...,Gastrointestinal hemorrhage,Hypertension with complications and secondary hypertension,Other liver diseases,Other lower respiratory disease,Other upper respiratory disease,Pleurisy; pneumothorax; pulmonary collapse,Pneumonia (except that caused by tuberculosis or sexually transmitted disease),Respiratory failure; insufficiency; arrest (adult),Septicemia (except in labor),Shock
ICUSTAY_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200001,0,0,0,1,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
200003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
200006,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
200007,0,0,1,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
200009,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299987,0,0,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
299992,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
299993,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
299995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [77]:
import csv
phenotypes_.to_csv(os.path.join(output_path, 'phenotype_labels.csv'),
                                                      index=False, quoting=csv.QUOTE_NONNUMERIC)

# 为每个病人生成一个文件夹

In [86]:
# 得到目前ICUSTAYS的所有病人ID
subjects = icustays.SUBJECT_ID.unique()
""" for s in subjects:
    if s==2 :
        print('*****') """
print(len(subjects))
print(min(subjects))
print(max(subjects))
print(len(icustays.SUBJECT_ID.unique()))
print(len(stays.SUBJECT_ID.unique()))

33798
3
99999
33798
46476


In [58]:
from tqdm import tqdm

def break_up_stays_by_subject(stays, output_path, subjects=subjects):
    # 得到目前ICUSTAYS的所有病人ID
    subjects = stays.SUBJECT_ID.unique() if subjects is None else subjects
    nb_subjects = subjects.shape[0]
    # 为每一个病人生成一个文件夹
    for subject_id in tqdm(subjects, total=nb_subjects, desc='Breaking up stays by subjects'):
        dn = os.path.join(output_path, str(subject_id))
        try:
            os.makedirs(dn)
        except:
            pass
        # 取出该病人的所有ICU记录
        # 按照进入ICU的时间排序
        stays[stays.SUBJECT_ID == subject_id].sort_values(by='INTIME').to_csv(os.path.join(dn, 'stays.csv'),
                                                                              index=False)

In [56]:
def break_up_diagnoses_by_subject(diagnoses, output_path, subjects=subjects):
    subjects = diagnoses.SUBJECT_ID.unique() if subjects is None else subjects
    nb_subjects = subjects.shape[0]
    for subject_id in tqdm(subjects, total=nb_subjects, desc='Breaking up diagnoses by subjects'):
        dn = os.path.join(output_path, str(subject_id))
        try:
            os.makedirs(dn)
        except:
            pass

        diagnoses[diagnoses.SUBJECT_ID == subject_id].sort_values(by=['ICUSTAY_ID', 'SEQ_NUM'])\
                                                     .to_csv(os.path.join(dn, 'diagnoses.csv'), index=False)


In [59]:
break_up_stays_by_subject(icustays, output_path, subjects=subjects)
break_up_diagnoses_by_subject(diagnoses, output_path, subjects=subjects)

Breaking up stays by subjects: 100%|██████████| 33798/33798 [04:39<00:00, 120.75it/s]
Breaking up diagnoses by subjects: 100%|██████████| 33798/33798 [05:08<00:00, 109.47it/s]


# CHARTEVENTS、 LABEVENTS、 OUTPUTEVENTS表格处理
最后主要得到 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'CHARTTIME', 'ITEMID', 'VALUE', 'VALUEUOM'
分别表示：病人ID、住院ID、ICU ID、测量时间、测量项ID、测量值、测量值的单位

### 我自己使用pandas写的三个表格的处理

In [87]:
obs_header = ['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'CHARTTIME', 'ITEMID', 'VALUE', 'VALUEUOM']
event_tables=['CHARTEVENTS', 'LABEVENTS', 'OUTPUTEVENTS']

### 拆分chartevents表
因为chartevents表太大了，一次难以读入内存，需要拆分

In [91]:
# 分块读取大文件
chartevents_file = os.path.join(mimic3_path, 'chartevent_blocks')
try :
    os.makedirs(chartevents_file)
except:
    pass

chartevents = pd.read_csv(os.path.join(mimic3_path, 'CHARTEVENTS.csv'), chunksize=10000000)
count = 0
for chunk in chartevents:
    count = count+1
    c_path = os.path.join(chartevents_file, f'CHARTEVENTS_{count}.csv')
    chunk.to_csv(c_path, index=False)
    #print(chunk)
    
    

In [66]:
labevents = pd.read_csv(os.path.join(mimic3_path, 'LABEVENTS.csv'))
outputevents = pd.read_csv(os.path.join(mimic3_path, 'OUTPUTEVENTS.csv'))

if 'ICUSTAY_ID' not in labevents:
    labevents['ICUSTAY_ID'] = ''
if 'ICUSTAY_ID' not in outputevents:
    outputevents['ICUSTAY_ID'] = ''
print(labevents, outputevents['ICUSTAY_ID'])

            ROW_ID  SUBJECT_ID   HADM_ID  ITEMID            CHARTTIME VALUE  \
0              281           3       NaN   50820  2101-10-12 16:07:00  7.39   
1              282           3       NaN   50800  2101-10-12 18:17:00   ART   
2              283           3       NaN   50802  2101-10-12 18:17:00    -1   
3              284           3       NaN   50804  2101-10-12 18:17:00    22   
4              285           3       NaN   50808  2101-10-12 18:17:00  0.93   
...            ...         ...       ...     ...                  ...   ...   
27854050  27428435       96443  103219.0   50882  2109-12-30 01:40:00    26   
27854051  27428436       96443  103219.0   50885  2109-12-30 01:40:00   2.1   
27854052  27428437       96443  103219.0   50902  2109-12-30 01:40:00    97   
27854053  27428438       96443  103219.0   50911  2109-12-30 01:40:00     2   
27854054  27428439       96443  103219.0   50912  2109-12-30 01:40:00   1.6   

          VALUENUM VALUEUOM      FLAG ICUSTAY_ID  


In [98]:
flag_dict = {}
for s in subjects:
    flag_dict[s] = True
print(flag_dict)

{268: True, 269: True, 270: True, 272: True, 273: True, 274: True, 275: True, 276: True, 279: True, 281: True, 282: True, 283: True, 284: True, 285: True, 286: True, 287: True, 290: True, 291: True, 292: True, 293: True, 294: True, 295: True, 296: True, 298: True, 301: True, 302: True, 303: True, 304: True, 305: True, 307: True, 308: True, 309: True, 310: True, 313: True, 314: True, 315: True, 317: True, 319: True, 320: True, 321: True, 322: True, 323: True, 324: True, 325: True, 326: True, 327: True, 328: True, 329: True, 330: True, 333: True, 335: True, 338: True, 339: True, 342: True, 344: True, 345: True, 346: True, 347: True, 348: True, 350: True, 351: True, 352: True, 353: True, 65: True, 67: True, 68: True, 71: True, 73: True, 75: True, 77: True, 78: True, 80: True, 81: True, 83: True, 84: True, 85: True, 86: True, 88: True, 94: True, 95: True, 96: True, 97: True, 98: True, 99: True, 100: True, 101: True, 103: True, 105: True, 107: True, 108: True, 109: True, 112: True, 113: Tru

In [100]:
for f in chartevents_dir.glob('*.csv'):
    print(f.name)
    df_chart = pd.read_csv(f)[obs_header]
    for s in subjects:
        print(s)
        mask_s =  df_chart['SUBJECT_ID'] == s
        chart = df_chart[mask_s]
        if not chart.empty:
            if  flag_dict[s]:
                chart.to_csv(fn, index=False)
                flag_dict[s] = False
            else:
                chart.to_csv(fn, mode='a', header=False, index=False)
        else:
            continue



CHARTEVENTS_1.csv
268
269
270
272
273
274
275
276
279
281
282
283
284
285
286
287
290
291
292
293
294
295
296
298
301
302
303
304
305
307
308
309
310
313
314
315
317
319
320
321
322
323
324
325
326
327
328
329
330
333
335
338
339
342
344
345
346
347
348
350
351
352
353
65
67
68
71
73
75
77
78
80
81
83
84
85
86
88
94
95
96
97
98
99
100
101
103
105
107
108
109
112
113
114
115
117
119
123
124
125
127
129
130
132
133
134
135
136
138
140
141
142
143
144
145
147
148
149
150
152
154
155
156
157
158
160
161
162
163
164
165
169
170
171
172
173
174
175
176
177
178
179
181
182
183
184
186
187
188
189
191
192
194
197
198
199
200
201
202
203
205
209
210
211
212
213
217
218
221
222
224
225
228
231
234
235
236
3
4
6
9
11
12
17
18
19
20
21
22
23
24
25
26
28
30
31
32
33
34
35
36
37
42
43
44
45
46
49
52
53
55
56
59
61
62
63
64
606
608
609
612
613
614
616
617
618
620
622
624
625
627
628
629
630
631
634
635
636
638
639
642
644
650
651
653
654
655
657
238
240
241
242
243
245
246
247
248
251
252
253
255
256

KeyboardInterrupt: 

In [97]:
from pathlib import Path
chartevents_dir = Path(chartevents_file)
# flag为True表示第一次写入，带标题
flag = True
for s in subjects:
    dn = os.path.join(output_path, str(s))
    fn = os.path.join(dn, 'events.csv')
    # 循环处理所有的chartevents_n小文件
    for f in chartevents_dir.glob('*.csv'):
        print(f.name)
        df_chart = pd.read_csv(f)[obs_header]
        mask_chart = df_chart['SUBJECT_ID'] == s
        chart = df_chart[mask_chart]
        if not chart.empty:
            if  flag:
                chart.to_csv(fn, index=False)
                flag = False
            else:
                chart.to_csv(fn, mode='a', header=False, index=False)
        else:
            continue
    # 写入lab和out信息
    mask_lab = labevents['SUBJECT_ID'] == s
    mask_out = outputevents['SUBJECT_ID'] == s
    lab = labevents[mask_lab][obs_header]
    out = outputevents[mask_lab][obs_header]
    lab.to_csv(fn, mode='a', header=False, index=False)
    out.to_csv(fn, mode='a', header=False, index=False)


CHARTEVENTS_1.csv
CHARTEVENTS_10.csv
CHARTEVENTS_11.csv
CHARTEVENTS_12.csv
CHARTEVENTS_13.csv
CHARTEVENTS_14.csv
CHARTEVENTS_15.csv


KeyboardInterrupt: 

### 把labevents和utputevents的信息写入每个病人的events里面

In [75]:
for s in subjects:
    dn = os.path.join(output_path, str(s))
    fn = os.path.join(dn, 'events.csv')

    mask_lab = labevents['SUBJECT_ID'] == s
    mask_out = outputevents['SUBJECT_ID'] == s

    lab = labevents[mask_lab][obs_header]
    out = outputevents[mask_lab][obs_header]
    lab.to_csv(fn, mode='a', header=False, index=False)
    out.to_csv(fn, mode='a', header=False, index=False)

In [76]:
""" print(lab['HADM_ID'])
data = pd.read_csv(fn)
print(data['HADM_ID']) """

13423    129635.0
13424    129635.0
13425    129635.0
13426    129635.0
13427    129635.0
           ...   
15427    129635.0
15428    129635.0
15429    129635.0
15430    129635.0
15431    129635.0
Name: HADM_ID, Length: 315, dtype: float64
0      129635.0
1      129635.0
2      129635.0
3      129635.0
4      129635.0
         ...   
625    173492.0
626    173492.0
627    173492.0
628    173492.0
629    173492.0
Name: HADM_ID, Length: 630, dtype: float64


In [None]:
for  s in subjects:
    dn = os.path.join(output_path, str(s))
    try:
        os.makedirs(dn)
    except:
        pass
    fn = os.path.join(dn, 'events.csv')
    
    mask_lab = labevents['SUBJECT_ID'] == s
    mask_out = outputevents['SUBJECT_ID'] == s

    lab = labevents[mask_lab][obs_header]
    out = outputevents[mask_lab][obs_header]
    lab.to_csv()


### 下面的处理太费时，没办法只能依靠pandas来处理

In [129]:

def read_events_table_and_break_up_by_subject(mimic3_path, table, output_path,
                                              items_to_keep=None, subjects_to_keep=None):
    obs_header = ['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'CHARTTIME', 'ITEMID', 'VALUE', 'VALUEUOM']
    if items_to_keep is not None:
        items_to_keep = set([str(s) for s in items_to_keep])
    if subjects_to_keep is not None:
        subjects_to_keep = set([str(s) for s in subjects_to_keep])

    class DataStats(object):
        def __init__(self):
            self.curr_subject_id = ''
            self.curr_obs = []

    data_stats = DataStats()

    def write_current_observations():
        dn = os.path.join(output_path, str(data_stats.curr_subject_id))
        try:
            os.makedirs(dn)
        except:
            pass
        fn = os.path.join(dn, 'events.csv')
        if not os.path.exists(fn) or not os.path.isfile(fn):
            f = open(fn, 'w')
            f.write(','.join(obs_header) + '\n')
            f.close()
        w = csv.DictWriter(open(fn, 'a'), fieldnames=obs_header, quoting=csv.QUOTE_MINIMAL)
        w.writerows(data_stats.curr_obs)
        data_stats.curr_obs = []

    # nb_rows_dict是mimic iii的这三个表分别有多少行数据
    nb_rows_dict = {'chartevents': 330712484, 'labevents': 27854056, 'outputevents': 4349219}
    nb_rows = nb_rows_dict[table.lower()]

    for row, row_no, _ in tqdm(read_events_table_by_row(mimic3_path, table), total=nb_rows,
                                                        desc='Processing {} table'.format(table)):

        # 判断是我们要处理的病人、以及是我们要处理的测量值
        if (subjects_to_keep is not None) and (row['SUBJECT_ID'] not in subjects_to_keep):
            continue
        if (items_to_keep is not None) and (row['ITEMID'] not in items_to_keep):
            continue

        row_out = {'SUBJECT_ID': row['SUBJECT_ID'],
                   'HADM_ID': row['HADM_ID'],
                   'ICUSTAY_ID': '' if 'ICUSTAY_ID' not in row else row['ICUSTAY_ID'],
                   'CHARTTIME': row['CHARTTIME'],
                   'ITEMID': row['ITEMID'],
                   'VALUE': row['VALUE'],
                   'VALUEUOM': row['VALUEUOM']}
        """ 
        如果当前处理的病人和上一次处理的病人不是同一个人时: 
        说明是新开始处理另外一个病人了，需要把上一个病人的信息写入文件
        """
        if data_stats.curr_subject_id != '' and data_stats.curr_subject_id != row['SUBJECT_ID']:
            write_current_observations()
        # 否则就把数据直接添加到同一个病人的list里面
        data_stats.curr_obs.append(row_out)
        data_stats.curr_subject_id = row['SUBJECT_ID']

    # 这是处理最后一个数据的时候
    if data_stats.curr_subject_id != '':
        write_current_observations()
        


In [130]:
for table in event_tables:
    read_events_table_and_break_up_by_subject(mimic3_path, table, output_path, items_to_keep=items_to_keep,
                                              subjects_to_keep=subjects)

Processing CHARTEVENTS table:  16%|█▌        | 52515127/330712484 [1:20:30<7:06:28, 10872.02it/s]


KeyboardInterrupt: 

### 得到所有的itemid

In [78]:
pat_idx = np.random.choice(patients.shape[0], size=1000)
pats = patients.iloc[pat_idx]
sts = icustays.merge(pats[['SUBJECT_ID']], left_on='SUBJECT_ID', right_on='SUBJECT_ID')

event_tables=['CHARTEVENTS', 'LABEVENTS', 'OUTPUTEVENTS']
event_table = [event_tables[0]]

In [82]:
print('Using only', sts.shape[0], 'stays and only', event_tables[0], 'table')

Using only 898 stays and only CHARTEVENTS table


In [86]:
subjects = sts.SUBJECT_ID.unique() if subjects is None else subjects
nb_subjects = subjects.shape[0]