In [11]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import pickle as pkl
import gzip
import os.path

# Base directory for every file this notebook reads or writes.
# The trailing slash matters: paths are built by plain f-string
# concatenation (f'{DATA_DIR}<file name>'), not with os.path.join.
DATA_DIR = '../data/'

In [2]:
# Load the diagnosis table, reading only the three columns used below.
# ICD9_CODE is pinned to text instead of being left to type inference:
# the column mixes purely numeric codes with alphanumeric ones (e.g. 'V5867'),
# and inferred parsing can produce mixed int/str values or strip leading
# zeros, both of which break the string slicing done later on.
# Missing codes are still read as NaN.
df_diag = pd.read_csv(
    f'{DATA_DIR}DIAGNOSES_ICD.csv.gz',
    usecols=['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE'],
    dtype={'ICD9_CODE': str},
)[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']]  # usecols ignores order; fix it here
print(df_diag.shape)
print(df_diag.head(2))

(651047, 3)
   SUBJECT_ID  HADM_ID ICD9_CODE
0         109   172335     40301
1         109   172335       486


In [3]:
# Build the list of admissions (HADM_ID) that have at least one diagnosis
# whose ICD-9 code starts with '250' (the cohort this notebook calls diabetes).
# The result is a Series named 'HADM_ID' holding the sorted, de-duplicated
# admission ids on a fresh 0..n-1 index. It is bound once, so the name never
# switches between a DataFrame and a Series.
df_diag_diabetes = (df_diag
                    .dropna()
                    # na=False keeps the mask strictly boolean even if a
                    # non-string value slipped into the column
                    .loc[lambda d: d['ICD9_CODE'].str.startswith('250', na=False), 'HADM_ID']
                    .drop_duplicates()
                    .sort_values()
                    .reset_index(drop=True))
print(df_diag_diabetes.head(2))

0    100001
1    100009
Name: HADM_ID, dtype: int64


In [4]:
# Keep every diagnosis row that belongs to one of the selected admissions.
# Rows without a code are removed before the join: the join reads from the
# unfiltered df_diag, and a missing code cannot be rolled up or sorted.
df_diag_diabetes_rel = pd.merge(df_diag.dropna(subset=['ICD9_CODE']),
                                df_diag_diabetes,
                                on=['HADM_ID'],
                                how='inner')

# Roll each code up to its first three characters (vectorised string slice).
# NOTE(review): ICD-9 E-codes use a four-character category (e.g. 'E878'),
# so a three-character prefix merges several of them - confirm this is intended.
df_diag_diabetes_rel['ICD9_CODE_ROLLED'] = df_diag_diabetes_rel['ICD9_CODE'].str[:3]

# Sorted vocabularies of full and rolled-up codes (NumPy arrays, sorted in place).
df_icd9_list = df_diag_diabetes_rel['ICD9_CODE'].unique()
df_icd9_list.sort()
df_icd9_rolled_list = df_diag_diabetes_rel['ICD9_CODE_ROLLED'].unique()
df_icd9_rolled_list.sort()

print(df_diag_diabetes_rel.shape)
print(df_diag_diabetes_rel.head(2))
print('Num. of unique ICD9 codes: ', len(df_icd9_list))
print('Num. of unique rolled-up ICD9 codes: ', len(df_icd9_rolled_list))

(199964, 4)
   SUBJECT_ID  HADM_ID ICD9_CODE ICD9_CODE_ROLLED
0         117   140784      5715              571
1         117   140784      7895              789
Num. of unique ICD9 codes:  4103
Num. of unique rolled-up ICD9 codes:  781


In [9]:
# Write unique ICD lists to file.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes, instead of being left to garbage collection.
with open(f'{DATA_DIR}diag_icd9_unique_list.p', "wb") as f:
    pkl.dump(df_icd9_list, f)
with open(f'{DATA_DIR}diag_icd9_rolled_unique_list.p', "wb") as f:
    pkl.dump(df_icd9_rolled_list, f)

In [5]:
# One row per admission: HADM_ID plus the array of distinct full ICD-9 codes
# recorded for that admission.
df_diag_icd9 = df_diag_diabetes_rel.groupby('HADM_ID')['ICD9_CODE'].unique().reset_index()

print(df_diag_icd9.shape)
print(df_diag_icd9.head(2))

(14222, 2)
   HADM_ID                                          ICD9_CODE
0   100001  [25013, 3371, 5849, 5780, V5867, 25063, 5363, ...
1   100009  [41401, 99604, 4142, 25000, 27800, V8535, 4148...


In [97]:
# One row per admission: HADM_ID plus the array of distinct rolled-up
# (three-character) codes recorded for that admission.
df_diag_icd9_rolled = df_diag_diabetes_rel.groupby('HADM_ID')['ICD9_CODE_ROLLED'].unique().reset_index()

print(df_diag_icd9_rolled.shape)
print(df_diag_icd9_rolled.head(2))

(14222, 2)
   HADM_ID                                   ICD9_CODE_ROLLED
0   100001  [250, 337, 584, 578, V58, 536, 458, 403, 585, ...
1   100009  [414, 996, 250, 278, V85, 411, V45, V15, 285, ...


In [160]:
# Multi-hot encode the per-admission code arrays. The column order of the
# encoding is fixed by passing the sorted code vocabulary as `classes`.
icd9_mlb = MultiLabelBinarizer(classes=df_icd9_list)
icd9_mlb.fit(df_diag_icd9['ICD9_CODE'])

# Store one indicator vector per admission (one matrix row per cell).
df_diag_icd9['ICD9_CODE_MLB'] = list(icd9_mlb.transform(df_diag_icd9['ICD9_CODE']))
print(df_diag_icd9.head(2))

   HADM_ID                                          ICD9_CODE  \
0   100001  [25013, 3371, 5849, 5780, V5867, 25063, 5363, ...   
1   100009  [41401, 99604, 4142, 25000, 27800, V8535, 4148...   

                                       ICD9_CODE_MLB  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  


In [161]:
# Multi-hot encode the per-admission rolled-up code arrays. The column order
# of the encoding is fixed by passing the sorted rolled-up vocabulary as `classes`.
icd9_rolled_mlb = MultiLabelBinarizer(classes=df_icd9_rolled_list)
icd9_rolled_mlb.fit(df_diag_icd9_rolled['ICD9_CODE_ROLLED'])

# Store one indicator vector per admission (one matrix row per cell).
df_diag_icd9_rolled['ICD9_CODE_ROLLED_MLB'] = list(
    icd9_rolled_mlb.transform(df_diag_icd9_rolled['ICD9_CODE_ROLLED']))
print(df_diag_icd9_rolled.head(2))

   HADM_ID                                   ICD9_CODE_ROLLED  \
0   100001  [250, 337, 584, 578, V58, 536, 458, 403, 585, ...   
1   100009  [414, 996, 250, 278, V85, 411, V45, V15, 285, ...   

                                ICD9_CODE_ROLLED_MLB  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  


In [10]:
# Reload the two per-admission frames from their pickle files, replacing
# whatever df_diag_icd9 / df_diag_icd9_rolled currently hold in memory.
# NOTE(review): no active cell visible in this notebook writes these .p files
# (the only to_pickle calls are commented out), so they must already exist on
# disk - confirm before relying on Restart & Run All.
# NOTE: unpickling can execute arbitrary code; only load files you produced.
with open(f'{DATA_DIR}diag_icd9.p', 'rb') as f:
    df_diag_icd9 = pkl.load(f)
print('df_diag_icd9.shape:                ', df_diag_icd9.shape)

with open(f'{DATA_DIR}diag_icd9_rolled.p', 'rb') as f:
    df_diag_icd9_rolled = pkl.load(f)
print('df_diag_icd9_rolled.shape:         ', df_diag_icd9_rolled.shape)

df_diag_icd9.shape:                 (14222, 3)
df_diag_icd9_rolled.shape:          (14222, 3)


In [13]:
def _load_or_cache_frame(frame, cache_path, label):
    """Return the object cached at ``cache_path``, creating the cache if absent.

    If ``cache_path`` exists, its gzip-compressed pickle is loaded and returned
    (the ``frame`` argument is ignored) and the loaded object's type and shape
    are printed. Otherwise ``frame`` is written to ``cache_path`` as a
    gzip-compressed pickle and returned unchanged.

    Parameters:
        frame: object to cache when no cache file exists yet.
        cache_path: location of the gzip-compressed pickle.
        label: variable name without its ``df_`` prefix, used only to build
            the progress messages.

    Returns:
        The loaded object if the cache existed, else ``frame``.

    NOTE: unpickling can execute arbitrary code; only load files you produced.
    """
    if os.path.exists(cache_path):
        print(f'reading from saved file {label}_file: ', cache_path)
        with gzip.open(cache_path, "rb") as f:
            frame = pkl.load(f)
        print(f'df_{label}.type: ', type(frame))
        print(f'df_{label}.shape: ', frame.shape)
    else:
        # save data and label to file
        with gzip.open(cache_path, "wb") as f:
            pkl.dump(frame, f)
        print(f'df_{label} saved')
    return frame


# Despite the .csv.gz suffix, these files hold gzip-compressed pickles.
# An existing cache file takes precedence over the frame computed above.
diag_icd9_file = f'{DATA_DIR}diag_icd9.csv.gz'
diag_icd9_rolled_file = f'{DATA_DIR}diag_icd9_rolled.csv.gz'

df_diag_icd9 = _load_or_cache_frame(df_diag_icd9, diag_icd9_file, 'diag_icd9')
df_diag_icd9_rolled = _load_or_cache_frame(df_diag_icd9_rolled, diag_icd9_rolled_file,
                                           'diag_icd9_rolled')

# write to file
# df_diag_icd9_rolled.to_pickle(f'{DATA_DIR}diag_icd9_rolled.p')
# df_diag_icd9.to_pickle(f'{DATA_DIR}diag_icd9.p')

reading from saved file diag_icd9_file:  ../data/diag_icd9.csv.gz
df_diag_icd9.type:  <class 'pandas.core.frame.DataFrame'>
df_diag_icd9.shape:  (14222, 3)
reading from saved file diag_icd9_rolled_file:  ../data/diag_icd9_rolled.csv.gz
df_diag_icd9_rolled.type:  <class 'pandas.core.frame.DataFrame'>
df_diag_icd9_rolled.shape:  (14222, 3)


In [164]:
# write to file

# df_diag_diabetes.to_pickle(f'{DATA_DIR}diag_diabetes_hadm_ids.p')