In [162]:
import os
import pandas as pd
import numpy as np

from pathlib import Path


In [2]:
mimic3_path = 'MIMIC-III Clinical Database 1.4'
output_path = 'data/root'

In [3]:
try:
    os.makedirs(output_path)
except:
    pass

## 处理患者的ICUSTAY、PATIENT、ADMISSION表，根据住院期间是否转换过ICU病房、年龄是否大于18等过滤一部分信息；
## 计算患者在ICU期间以及住院期间是否死亡

从数据库中取出来的数据，一般是object，先转成datetime格式

dt.date: the date part of Timestamps without timezone information

In [137]:
patients = pd.read_csv(os.path.join(mimic3_path, 'PATIENTS.csv'))
# DOB is the date of birth of the given patient。 年龄大于89的这个数据会变化
# DOD is the date of death for the given patient
patients = patients[['SUBJECT_ID', 'GENDER', 'DOB', 'DOD']]
print(patients.dtypes)
patients.DOB = pd.to_datetime(pats.DOB).dt.date
patients.DOD = pd.to_datetime(pats.DOD).dt.date
print(patients.head())

SUBJECT_ID     int64
GENDER        object
DOB           object
DOD           object
dtype: object
   SUBJECT_ID GENDER         DOB         DOD
0         249      F  2075-03-13         NaT
1         250      F  2164-12-27  2188-11-22
2         251      M  2090-03-15         NaT
3         252      M  2078-03-06         NaT
4         253      F  2089-11-26         NaT


In [138]:
admits = pd.read_csv(os.path.join(mimic3_path, 'ADMISSIONS.csv'))
admits = admits[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'ETHNICITY', 'DIAGNOSIS']]
#print(admits)
admits.ADMITTIME = pd.to_datetime(admits.ADMITTIME).dt.date
admits.DISCHTIME = pd.to_datetime(admits.DISCHTIME).dt.date
admits.DEATHTIME = pd.to_datetime(admits.DEATHTIME).dt.date
print(admits.head())

   SUBJECT_ID  HADM_ID   ADMITTIME   DISCHTIME DEATHTIME ETHNICITY  \
0          22   165315  2196-04-09  2196-04-10       NaT     WHITE   
1          23   152223  2153-09-03  2153-09-08       NaT     WHITE   
2          23   124321  2157-10-18  2157-10-25       NaT     WHITE   
3          24   161859  2139-06-06  2139-06-09       NaT     WHITE   
4          25   129635  2160-11-02  2160-11-05       NaT     WHITE   

                                           DIAGNOSIS  
0                            BENZODIAZEPINE OVERDOSE  
1  CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...  
2                                         BRAIN MASS  
3                     INTERIOR MYOCARDIAL INFARCTION  
4                            ACUTE CORONARY SYNDROME  


In [139]:
stays = pd.read_csv(os.path.join(mimic3_path, 'ICUSTAYS.csv'))
stays.INTIME = pd.to_datetime(stays.INTIME).dt.date
stays.OUTTIME = pd.to_datetime(stays.OUTTIME).dt.date
print(stays.head())

   ROW_ID  SUBJECT_ID  HADM_ID  ICUSTAY_ID DBSOURCE FIRST_CAREUNIT  \
0     365         268   110404      280836  carevue           MICU   
1     366         269   106296      206613  carevue           MICU   
2     367         270   188028      220345  carevue            CCU   
3     368         271   173727      249196  carevue           MICU   
4     369         272   164716      210407  carevue            CCU   

  LAST_CAREUNIT  FIRST_WARDID  LAST_WARDID      INTIME     OUTTIME     LOS  
0          MICU            52           52  2198-02-14  2198-02-18  3.2490  
1          MICU            52           52  2170-11-05  2170-11-08  3.2788  
2           CCU            57           57  2128-06-24  2128-06-27  2.8939  
3          SICU            52           23  2120-08-07  2120-08-10  2.0600  
4           CCU            57           57  2186-12-25  2186-12-27  1.6202  


### 统计最原始的数据个数

In [140]:
print('START:\n\tICUSTAY_IDs: {}\n\tHADM_IDs: {}\n\tSUBJECT_IDs: {}'.format(stays.ICUSTAY_ID.unique().shape[0],
          stays.HADM_ID.unique().shape[0], stays.SUBJECT_ID.unique().shape[0]))


START:
	ICUSTAY_IDs: 61532
	HADM_IDs: 57786
	SUBJECT_IDs: 46476


### remove icustays  with  transfers
去掉在ICU内发生过病房转移（从一个ICU病房转移到另一个ICU病房 或 ICU护理类型发生过变化的患者

这里的代码逻辑是只有没有发生过病房转移且ICU护理类型没有发生过变化的患者才可以保留下来

In [100]:
mask1 = stays.FIRST_WARDID == stays.LAST_WARDID
mask2 = stays.FIRST_CAREUNIT == stays.LAST_CAREUNIT
""" print(mask1)
print(mask2)
print(mask1 & mask2) """

' print(mask1)\nprint(mask2)\nprint(mask1 & mask2) '

In [51]:
s = pd.Series([1, 2, 3])
s1 = pd.Series([True, False, False])
s2 = pd.Series([True, False, True])
print(s1 & s2)
print(s[s1 & s2])
print(s[s1 | s2])

0     True
1    False
2    False
dtype: bool
0    1
dtype: int64
0    1
2    3
dtype: int64


In [141]:
stay_no_transfer = stays[(stays.FIRST_WARDID == stays.LAST_WARDID) & (stays.FIRST_CAREUNIT == stays.LAST_CAREUNIT)]
icustays = stay_no_transfer[['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'LAST_CAREUNIT', 'DBSOURCE', 'INTIME', 'OUTTIME', 'LOS']]


In [142]:
print('REMOVE ICU TRANSFERS:\n\tICUSTAY_IDs: {}\n\tHADM_IDs: {}\n\tSUBJECT_IDs: {}'.format(icustays.ICUSTAY_ID.unique().shape[0], icustays.HADM_ID.unique().shape[0], icustays.SUBJECT_ID.unique().shape[0]))

REMOVE ICU TRANSFERS:
	ICUSTAY_IDs: 55830
	HADM_IDs: 52834
	SUBJECT_IDs: 43277


### 合并表的操作

1 合并stays和admits

保留有入住icu记录的住院患者信息

2 合并stays和patients


In [143]:
icustays = icustays.merge(admits, how='inner', left_on=['SUBJECT_ID', 'HADM_ID'], right_on=['SUBJECT_ID', 'HADM_ID'])
print(icustays.columns)
print(icustays.head())


Index(['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'LAST_CAREUNIT', 'DBSOURCE',
       'INTIME', 'OUTTIME', 'LOS', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
       'ETHNICITY', 'DIAGNOSIS'],
      dtype='object')
   SUBJECT_ID  HADM_ID  ICUSTAY_ID LAST_CAREUNIT DBSOURCE      INTIME  \
0         268   110404      280836          MICU  carevue  2198-02-14   
1         269   106296      206613          MICU  carevue  2170-11-05   
2         270   188028      220345           CCU  carevue  2128-06-24   
3         272   164716      210407           CCU  carevue  2186-12-25   
4         273   158689      241507          MICU  carevue  2141-04-19   

      OUTTIME     LOS   ADMITTIME   DISCHTIME   DEATHTIME  \
0  2198-02-18  3.2490  2198-02-11  2198-02-18  2198-02-18   
1  2170-11-08  3.2788  2170-11-05  2170-11-27         NaT   
2  2128-06-27  2.8939  2128-06-23  2128-06-27         NaT   
3  2186-12-27  1.6202  2186-12-25  2187-01-02         NaT   
4  2141-04-20  1.4862  2141-04-19  2141-04-20      

In [144]:
icustays = icustays.merge(patients, how='inner', left_on=['SUBJECT_ID'], right_on=['SUBJECT_ID'])
print(icustays.columns)
print(icustays.head())

Index(['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'LAST_CAREUNIT', 'DBSOURCE',
       'INTIME', 'OUTTIME', 'LOS', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
       'ETHNICITY', 'DIAGNOSIS', 'GENDER', 'DOB', 'DOD'],
      dtype='object')
   SUBJECT_ID  HADM_ID  ICUSTAY_ID LAST_CAREUNIT DBSOURCE      INTIME  \
0         268   110404      280836          MICU  carevue  2198-02-14   
1         269   106296      206613          MICU  carevue  2170-11-05   
2         270   188028      220345           CCU  carevue  2128-06-24   
3         272   164716      210407           CCU  carevue  2186-12-25   
4         273   158689      241507          MICU  carevue  2141-04-19   

      OUTTIME     LOS   ADMITTIME   DISCHTIME   DEATHTIME  \
0  2198-02-18  3.2490  2198-02-11  2198-02-18  2198-02-18   
1  2170-11-08  3.2788  2170-11-05  2170-11-27         NaT   
2  2128-06-27  2.8939  2128-06-23  2128-06-27         NaT   
3  2186-12-27  1.6202  2186-12-25  2187-01-02         NaT   
4  2141-04-20  1.4862  2141

### 使用HADM进行分组(hadm相同的计为一组)
REMOVE MULTIPLE STAYS PER ADMIT:\n\tICUSTAY_IDs

In [145]:
# groupby('HADM_ID').count()是把hadm_id当做index，计算其余各列在该index下有多少个取值
to_keep = icustays.groupby('HADM_ID').count()
to_keep = to_keep[['ICUSTAY_ID']]
# 释放index，即把hadm_id重新放到列位置
to_keep = to_keep.reset_index()
mask_min = to_keep.ICUSTAY_ID >= 1
mask_max = to_keep.ICUSTAY_ID <= 1
# to_keep保存的是住院期间只去过一种类型的ICU
to_keep = to_keep[mask_min & mask_max][['HADM_ID']]

In [146]:
icustays = icustays.merge(to_keep, how='inner', on=['HADM_ID'])

In [147]:
print('REMOVE MULTIPLE STAYS PER ADMIT:\n\tICUSTAY_IDs: {}\n\tHADM_IDs: {}\n\tSUBJECT_IDs: {}'.format(icustays.ICUSTAY_ID.unique().shape[0], icustays.HADM_ID.unique().shape[0], icustays.SUBJECT_ID.unique().shape[0]))

REMOVE MULTIPLE STAYS PER ADMIT:
	ICUSTAY_IDs: 50186
	HADM_IDs: 50186
	SUBJECT_IDs: 41587


In [148]:
icustays

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,LAST_CAREUNIT,DBSOURCE,INTIME,OUTTIME,LOS,ADMITTIME,DISCHTIME,DEATHTIME,ETHNICITY,DIAGNOSIS,GENDER,DOB,DOD
0,268,110404,280836,MICU,carevue,2198-02-14,2198-02-18,3.2490,2198-02-11,2198-02-18,2198-02-18,HISPANIC OR LATINO,DYSPNEA,F,2132-02-21,2198-02-18
1,269,106296,206613,MICU,carevue,2170-11-05,2170-11-08,3.2788,2170-11-05,2170-11-27,NaT,WHITE,SEPSIS;PILONIDAL ABSCESS,M,2130-09-30,NaT
2,270,188028,220345,CCU,carevue,2128-06-24,2128-06-27,2.8939,2128-06-23,2128-06-27,NaT,UNKNOWN/NOT SPECIFIED,CAROTID STENOSIS\CAROTID ANGIOGRAM AND STENT,M,2048-05-26,NaT
3,272,164716,210407,CCU,carevue,2186-12-25,2186-12-27,1.6202,2186-12-25,2187-01-02,NaT,WHITE,PULMONARY EMBOLIS,M,2119-11-21,NaT
4,273,158689,241507,MICU,carevue,2141-04-19,2141-04-20,1.4862,2141-04-19,2141-04-20,NaT,BLACK/AFRICAN AMERICAN,POLYSUBSTANCE OVERDOSE,M,2107-08-10,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50181,94944,143774,201233,CSRU,metavision,2104-04-15,2104-04-17,2.1894,2104-04-11,2104-04-20,NaT,WHITE,CHEST PAIN;RULE OUT CORONARY ARTERY DISEASE\LE...,M,2027-03-02,NaT
50182,94950,123750,283653,CCU,metavision,2155-12-08,2155-12-10,2.4942,2155-12-07,2155-12-12,NaT,WHITE,SYNCOPE;TELEMETRY,F,1855-12-07,NaT
50183,94953,196881,241585,SICU,metavision,2160-03-03,2160-03-04,0.9259,2160-03-03,2160-03-04,NaT,WHITE,CEREBRAL ANEURYSM/SDA,F,2107-01-29,2162-01-05
50184,94954,118475,202802,CSRU,metavision,2183-03-25,2183-03-27,2.3346,2183-03-25,2183-04-01,NaT,PATIENT DECLINED TO ANSWER,AORTIC STENOSIS\AORTIC VALVE REPLACEMENT /SDA,F,2115-05-23,NaT


In [155]:
icustays.apply(lambda e: (e['INTIME'] - e['DOB']).days/365, axis=1)

0         66.027397
1         40.126027
2         80.131507
3         67.139726
4         33.715068
            ...    
50181     77.172603
50182    300.202740
50183     53.128767
50184     67.884932
50185     34.241096
Length: 50186, dtype: float64

### 计算病人入住ICU时的年龄

In [156]:
icustays['age'] = icustays.apply(lambda e: (e['INTIME'] - e['DOB']).days/365, axis=1)

#### 筛选出年龄大于等于18岁的病人

In [161]:
import numpy as np
icustays = icustays[(icustays.age >= 18) & (icustays.age <= np.inf)]

In [174]:
print('REMOVE PATIENTS AGE < 18:\n\tICUSTAY_IDs: {}\n\tHADM_IDs: {}\n\tSUBJECT_IDs: {}'.format(icustays.ICUSTAY_ID.unique().shape[0],
        icustays.HADM_ID.unique().shape[0], icustays.SUBJECT_ID.unique().shape[0]))


REMOVE PATIENTS AGE < 18:
	ICUSTAY_IDs: 42276
	HADM_IDs: 42276
	SUBJECT_IDs: 33798


## 添加在住院期间和在ICU期间内死亡的标志

In [166]:
def add_inhospital_mortality_to_icustays(stays):
    """   DOD是从patient表得到的患者死亡日期； DEATHTIME是从admission表得到的患者死亡日期；
        ADMITTIME和DISCHTIME分别为入院和出院时间；
        如果患者的任何一个死亡日期在入院期间内，视为患者在住院期间死亡
    """
    mortality_DOD = stays.DOD.notnull() & ((stays.ADMITTIME <= stays.DOD) & (stays.DISCHTIME >= stays.DOD))
    mortality_DEA = stays.DEATHTIME.notnull() & ((stays.ADMITTIME <= stays.DEATHTIME) & (stays.DISCHTIME >= stays.DEATHTIME))
    mortality_mask = mortality_DOD | mortality_DEA
    # 得到患者在住院期间是否死亡的标志
    stays['MORTALITY'] = mortality_mask.astype(int)
    stays['MORTALITY_INHOSPITAL'] = stays['MORTALITY']
    return stays


In [171]:
def add_inunit_mortality_to_icustays(stays):
    mortality_DOD = stays.DOD.notnull() & ((stays.INTIME <= stays.DOD) & (stays.OUTTIME >= stays.DOD))
    mortality_DEA = stays.DEATHTIME.notnull() & ((stays.INTIME <= stays.DEATHTIME) & (stays.OUTTIME >= stays.DEATHTIME))
    mortality_mask = mortality_DOD | mortality_DEA
    stays['MORTALITY_INUNIT'] = mortality_mask.astype(int)
    return stays

In [172]:
icustays = add_inhospital_mortality_to_icustays(icustays)
icustays = add_inunit_mortality_to_icustays(icustays)

### 把处理好的icustay、patient、admission表存入文件

In [175]:
icustays.to_csv(os.path.join(output_path, 'all_stays.csv'), index=False)

##  读取患者的ICD-9 diagnoses
D_ICD_DIAGNOSES.csv: Definition table for ICD diagnoses(诊断简介)

DIAGNOSES_ICD.csv: Contains ICD diagnoses for patients, most notably ICD-9 diagnoses.(包含患者诊断信息)

In [233]:

codes = pd.read_csv(os.path.join(mimic3_path, 'D_ICD_DIAGNOSES.csv'))
print(codes)


       ROW_ID ICD9_CODE               SHORT_TITLE  \
0         174     01166     TB pneumonia-oth test   
1         175     01170    TB pneumothorax-unspec   
2         176     01171   TB pneumothorax-no exam   
3         177     01172  TB pneumothorx-exam unkn   
4         178     01173  TB pneumothorax-micro dx   
...       ...       ...                       ...   
14562   14432     V7399     Scrn unspcf viral dis   
14563   14433      V740     Screening for cholera   
14564   14434      V741    Screening-pulmonary TB   
14565   14435      V742     Screening for leprosy   
14566   14436      V743  Screening for diphtheria   

                                              LONG_TITLE  
0      Tuberculous pneumonia [any form], tubercle bac...  
1                  Tuberculous pneumothorax, unspecified  
2      Tuberculous pneumothorax, bacteriological or h...  
3      Tuberculous pneumothorax, bacteriological or h...  
4      Tuberculous pneumothorax, tubercle bacilli fou...  
...      

In [199]:
diagnoses = pd.read_csv(os.path.join(mimic3_path, 'DIAGNOSES_ICD.csv'))
diagnoses.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE'], dtype='object')

In [200]:
diagnoses = diagnoses.merge(codes, how='inner', left_on='ICD9_CODE', right_on='ICD9_CODE')

In [191]:
diagnoses.dtypes

ROW_ID           int64
SUBJECT_ID       int64
HADM_ID          int64
SEQ_NUM        float64
ICD9_CODE       object
SHORT_TITLE     object
LONG_TITLE      object
dtype: object

In [201]:
diagnoses[['SUBJECT_ID', 'HADM_ID', 'SEQ_NUM']] = diagnoses[['SUBJECT_ID', 'HADM_ID', 'SEQ_NUM']].astype(int)

In [202]:
diagnoses.dtypes

ROW_ID          int64
SUBJECT_ID      int32
HADM_ID         int32
SEQ_NUM         int32
ICD9_CODE      object
SHORT_TITLE    object
LONG_TITLE     object
dtype: object

#### 把患者的icustay_id加入到diagnosis中

In [203]:
diagnoses = diagnoses.merge(icustays[['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID']].drop_duplicates(), how='inner',
                            on=['SUBJECT_ID', 'HADM_ID'])
print(diagnoses)

        ROW_ID  SUBJECT_ID  HADM_ID  SEQ_NUM ICD9_CODE  \
0         1297         109   172335        1     40301   
1         1298         109   172335        2       486   
2         1299         109   172335        3     58281   
3         1300         109   172335        4      5855   
4         1301         109   172335        5      4254   
...        ...         ...      ...      ...       ...   
477953  205925       18499   125788        1      1922   
477954  348882       30377   104237        1     80336   
477955  348883       30377   104237        2     E8852   
477956  290675       25951   115991        1      1885   
477957  523154       71275   157952        1     99561   

                     SHORT_TITLE  \
0         Mal hyp kid w cr kid V   
1        Pneumonia, organism NOS   
2       Chr nephritis in oth dis   
3       Chron kidney dis stage V   
4        Prim cardiomyopathy NEC   
...                          ...   
477953       Mal neo spinal cord   
477954  Cl skul

In [205]:
diagnoses.to_csv(os.path.join(output_path, 'all_diagnoses.csv'), index=False)

### 去掉ICD-9中信息完全一样的记录（一个也没去掉）

In [234]:
codes = codes.drop_duplicates()

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,174,01166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,175,01170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,176,01171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,177,01172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
4,178,01173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."
...,...,...,...,...
14562,14432,V7399,Scrn unspcf viral dis,Special screening examination for unspecified ...
14563,14433,V740,Screening for cholera,Screening examination for cholera
14564,14434,V741,Screening-pulmonary TB,Screening examination for pulmonary tuberculosis
14565,14435,V742,Screening for leprosy,Screening examination for leprosy (Hansen's di...


In [235]:
ICD_STAY_COUNT = diagnoses.groupby('ICD9_CODE')['ICUSTAY_ID'].count()
ICD_STAY_COUNT = ICD_STAY_COUNT.reset_index()
ICD_STAY_COUNT = ICD_STAY_COUNT.rename(columns={'ICUSTAY_ID':'COUNT'})
print(ICD_STAY_COUNT)

     ICD9_CODE  COUNT
0         0030      2
1         0038      1
2         0039      1
3         0041      3
4         0048      1
...        ...    ...
6164     V8821      3
6165     V9010      1
6166     V9039      1
6167     V9081      3
6168     V9089      1

[6169 rows x 2 columns]


In [249]:
codes = codes.merge(ICD_STAY_COUNT, how='left', on=['ICD9_CODE'])

In [251]:
mask_c = codes.COUNT.notnull()
codes = codes[mask_c]

In [261]:
codes.to_csv(os.path.join(output_path, 'diagnosis_counts.csv'), index =False)

In [260]:
codes.sort_values('COUNT', ascending=False)

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE,COUNT
4303,4304,4019,Hypertension NOS,Unspecified essential hypertension,17343.0
4472,4473,4280,CHF NOS,"Congestive heart failure, unspecified",10601.0
4373,4374,41401,Crnry athrscl natve vssl,Coronary atherosclerosis of native coronary ar...,10345.0
4461,4462,42731,Atrial fibrillation,Atrial fibrillation,10313.0
1588,1591,25000,DMII wo cmp nt st uncntr,Diabetes mellitus without mention of complicat...,7486.0
...,...,...,...,...,...
11526,11482,E8156,Mv coll w obj-ped cycl,Other motor vehicle traffic accident involving...,1.0
5600,5602,61800,Vaginal wall prolpse NOS,Unspecified prolapse of vaginal walls,1.0
5604,5606,61804,Rectocele,Rectocele,1.0
5608,5610,6182,Uterovag prolaps-incompl,"Uterovaginal prolapse, incomplete",1.0


In [269]:
os.path.dirname("__file__")

''

In [279]:
phenotype_definitions = os.path.join(os.path.dirname("__file__"), '../resources/hcup_ccs_2015_definitions.yaml')

import yaml
definitions = yaml.load(open(phenotype_definitions, 'r'))


In [None]:
def_map = {}
    for dx in definitions:
        for code in definitions[dx]['codes']:
            def_map[code] = (dx, definitions[dx]['use_in_benchmark'])
    diagnoses['HCUP_CCS_2015'] = diagnoses.ICD9_CODE.apply(lambda c: def_map[c][0] if c in def_map else None)
    diagnoses['USE_IN_BENCHMARK'] = diagnoses.ICD9_CODE.apply(lambda c: int(def_map[c][1]) if c in def_map else None)

In [282]:
def_map = {}
for dx in definitions:
    #print(dx)
    codes_list = definitions[dx]['codes']
    for code in codes_list:
        def_map[code] = (dx, definitions[dx]['use_in_benchmark'])
    

### 25种急性表型分类

In [286]:
diagnoses['HCUP_CCS_2015'] = diagnoses.ICD9_CODE.apply(lambda c: def_map[c][0] if c in def_map else None)
diagnoses['USE_IN_BENCHMARK'] = diagnoses.ICD9_CODE.apply(lambda c: int(def_map[c][1]) if c in def_map else None)

In [291]:
phenotypes = diagnoses
phenotypes = phenotypes[['ICUSTAY_ID', 'HCUP_CCS_2015']].loc[phenotypes.USE_IN_BENCHMARK > 0].drop_duplicates()
phenotypes['VALUE'] = 1

In [292]:
phenotypes = phenotypes.pivot(index='ICUSTAY_ID', columns='HCUP_CCS_2015', values='VALUE')
phenotypes = phenotypes.reindex(icustays.ICUSTAY_ID.sort_values())
phenotypes_ = phenotypes.fillna(0).astype(int).sort_index(axis=0).sort_index(axis=1)

In [296]:
phenotypes = phenotypes.reindex(icustays.ICUSTAY_ID.sort_values())
phenotypes_ = phenotypes.fillna(0).astype(int).sort_index(axis=0).sort_index(axis=1)

In [299]:
phenotypes_.to_csv(os.path.join(output_path, 'phenotype_labels.csv'),
                                                      index=False, quoting=csv.QUOTE_NONNUMERIC)

NameError: name 'csv' is not defined