In [1]:
import ast
import datetime as dt
import warnings
from collections import defaultdict, Counter

import dill
import numpy as np
import pandas as pd
from tqdm import tqdm

# Disable all warnings
warnings.filterwarnings("ignore")

In [12]:
# Load the patient cohort table and report its shape next to the number of
# distinct HADM_ID and ICUSTAY_ID values, followed by the DIEINHOSPITAL counts.
df = pd.read_csv('output/III_patients.csv')
print(
    df.shape,
    df['HADM_ID'].nunique(dropna=False),
    df['ICUSTAY_ID'].nunique(dropna=False),
)
print('DIEINHOSPITAL:', df['DIEINHOSPITAL'].value_counts())

(32557, 25) 32557 32557
DIEINHOSPITAL: 0    28971
1     3586
Name: DIEINHOSPITAL, dtype: int64


In [13]:
# Convert the timestamp columns of the cohort table to pandas datetimes;
# later cells rely on the `.dt` accessor and datetime comparisons on them.
date_columns = ['ADMITTIME', 'DISCHTIME', 'INTIME', 'OUTTIME','DOD']
for date_col in date_columns:
    df[date_col] = pd.to_datetime(df[date_col])

## Diagnosis

In [26]:
# note: the ICD description table is taken from MIMIC-IV (d_icd_diagnoses).
# Columns are renamed positionally, which makes any prior case change moot.
D_ICD_DIAGNOSES = pd.read_csv('input/d_icd_diagnoses.csv.gz')
D_ICD_DIAGNOSES.columns = ['ICD_CODE','ICD_VERSION','ICD_TEXT']

# ICD code -> ROOT lookup. Codes listed more than once are removed entirely
# (keep=False); a missing ROOT falls back to the code itself.
diag_9_10 = pd.read_csv('input/0_diag_9_10.csv')
diag_9_10.columns = ['ICD_CODE', 'ROOT']
diag_9_10 = diag_9_10.drop_duplicates(subset=['ICD_CODE'], keep=False)
diag_9_10['ROOT'] = diag_9_10['ROOT'].fillna(diag_9_10['ICD_CODE'])
print(diag_9_10.shape, diag_9_10['ICD_CODE'].nunique(dropna=False))

# ROOT categories to keep as diagnosis features.
# NOTE(review): the Procedures section later reassigns `icds`; run the next
# diagnosis cell before that reassignment.
icds = [
    'Z85', 'J98', 'I25', 'F10', 'J69', 'J45', 'J18', 'E89', 'D69', 'I95',
    'I10', 'E66', 'E87', 'R78', 'A40', 'N17', 'F41', 'N18', 'G47', 'E11',
    'R09', 'I47', 'K70', '9971', 'Z95', 'I21', 'I12', 'F32', 'I34', 'T78',
    'K22', 'I50', 'J44', 'D64', 'R00', 'E78', 'N39', 'K92',
]
len(icds)

(18554, 2) 18554


38

In [27]:
# Build a per-admission 0/1 matrix of diagnosis ROOT categories from MIMIC-III.
# Depends on `df`, `diag_9_10`, `D_ICD_DIAGNOSES` and `icds` from earlier cells.
# NOTE(review): `icds` must still hold the diagnosis ROOT list here; the
# Procedures section reassigns the same name to procedure codes.
III_diagnoses_raw = pd.read_csv('input/DIAGNOSES_ICD.csv.gz')

# Tag every row as ICD version 9
III_diagnoses_raw['ICD_VERSION'] = 9

# Keep the needed columns, rename ICD9_CODE -> ICD_CODE, then print the shape
# and the number of distinct codes
III_diagnoses_raw = III_diagnoses_raw[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE', 'ICD_VERSION']]
III_diagnoses_raw.columns = ['SUBJECT_ID', 'HADM_ID', 'ICD_CODE', 'ICD_VERSION']
print(III_diagnoses_raw.shape, len(III_diagnoses_raw.ICD_CODE.unique()))

# Attach the ROOT category for each code (left join keeps unmapped codes)
III_diagnoses = pd.merge(
    III_diagnoses_raw,
    diag_9_10,
    on='ICD_CODE',
    how='left'
)

# Unmapped codes use the code itself as ROOT; then drop repeated rows per admission
III_diagnoses['ROOT'] = III_diagnoses['ROOT'].fillna(III_diagnoses['ICD_CODE'])
III_diagnoses = III_diagnoses.drop_duplicates(subset=['HADM_ID', 'ICD_CODE', 'ICD_VERSION', 'ROOT'], keep='first')

# Keep only the selected ROOT categories, then print shape, distinct codes and distinct ROOTs
III_diagnoses = III_diagnoses[III_diagnoses['ROOT'].isin(icds)]
print(III_diagnoses.shape, len(III_diagnoses.ICD_CODE.unique()), len(III_diagnoses.ROOT.unique()))
print(list(III_diagnoses.ROOT.unique()))

# Attach ICD-9 text descriptions; within this cell ICD_TEXT is only used by
# the missing-description check further down
III_diagnoses = pd.merge(
    III_diagnoses,
    D_ICD_DIAGNOSES[D_ICD_DIAGNOSES.ICD_VERSION == 9][['ICD_CODE', 'ICD_TEXT']],
    on='ICD_CODE',
    how='left'
)

# Collect the list of ICD codes for each (subject, admission, ROOT)
III_diagnoses_result = III_diagnoses.groupby(['SUBJECT_ID', 'HADM_ID', 'ROOT'])['ICD_CODE'].apply(list).reset_index()
# Pivot to one row per HADM_ID and one column per ROOT (cell value = code list or NaN)
III_diagnoses_result = III_diagnoses_result.pivot(index='HADM_ID', columns='ROOT', values='ICD_CODE')
# Turn HADM_ID back into a regular column
III_diagnoses_result = III_diagnoses_result.reset_index()
# Show the first few rows of the pivoted table
print(III_diagnoses_result.head())

# Count rows whose code has no text description
print(III_diagnoses[III_diagnoses.ICD_TEXT.isna()].shape)

# Replace each ROOT cell by 1 if a code list is present and 0 otherwise
III_diagnoses_result.loc[:, III_diagnoses_result.columns != 'HADM_ID'] = III_diagnoses_result.loc[:, III_diagnoses_result.columns != 'HADM_ID'].notna().astype(int)
print(III_diagnoses_result.shape,len(III_diagnoses_result.HADM_ID.unique()))

# Restrict to admissions present in the cohort table
III_diagnoses_result = III_diagnoses_result[III_diagnoses_result.HADM_ID.isin(df.HADM_ID)]
print(III_diagnoses_result.shape,len(III_diagnoses_result.HADM_ID.unique()))

# Prefix every feature column with 'Diag_' (identifier columns are left as-is)
III_diagnoses_result.columns = ['Diag_' + col if (col != 'HADM_ID')&(col != 'ICUSTAY_ID') else col for col in III_diagnoses_result.columns]
III_diagnoses_result.head()

(651047, 4) 6985
(314105, 5) 484 38
['I12', 'J18', 'N18', 'E87', 'I95', 'D69', 'D64', 'G47', 'R78', 'Z95', 'I21', 'I25', 'K22', 'E78', 'F10', 'I34', 'N17', 'R09', 'N39', 'I50', 'I10', 'Z85', 'K70', 'E11', 'J98', 'A40', 'T78', 'E89', 'J44', '9971', 'I47', 'J45', 'F32', 'F41', 'J69', 'R00', 'K92', 'E66']
ROOT  HADM_ID 9971  A40     D64  D69                                  E11  \
0      100001  NaN  NaN     NaN  NaN  [25013, 25063, 25043, 25053, 25083]   
1      100003  NaN  NaN  [2851]  NaN                                  NaN   
2      100006  NaN  NaN     NaN  NaN                                  NaN   
3      100007  NaN  NaN     NaN  NaN                                  NaN   
4      100009  NaN  NaN  [2859]  NaN                              [25000]   

ROOT      E66     E78     E87  E89  ...     K92     N17     N18  N39     R00  \
0         NaN     NaN     NaN  NaN  ...  [5780]  [5849]  [5853]  NaN     NaN   
1         NaN     NaN     NaN  NaN  ...     NaN     NaN     NaN  NaN     

Unnamed: 0,HADM_ID,Diag_9971,Diag_A40,Diag_D64,Diag_D69,Diag_E11,Diag_E66,Diag_E78,Diag_E87,Diag_E89,...,Diag_K92,Diag_N17,Diag_N18,Diag_N39,Diag_R00,Diag_R09,Diag_R78,Diag_T78,Diag_Z85,Diag_Z95
0,100001,0,0,0,0,1,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
1,100003,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,100006,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,100007,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,100009,0,0,1,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1


## Procedures

In [16]:
# note: the procedure description table is taken from MIMIC-IV (d_icd_procedures).
d_icd_procedures = pd.read_csv('input/d_icd_procedures.csv.gz')
d_icd_procedures.columns = ['ICD_CODE', 'ICD_VERSION', 'ICD_TEXT']
print(d_icd_procedures['ICD_VERSION'].value_counts())
d_icd_procedures.head(2)

# Procedure identifiers to keep as features. The next cell matches them
# against PROCEDURES_ICD codes first and uses the unmatched ones as
# PROCEDUREEVENTS_MV ITEMIDs.
# NOTE(review): this overwrites the diagnosis `icds` list defined earlier.
icds = [
    9604, 3722, 3723, 225966, 8856, 224270, 3995, 224385, 3615, 225402,
    4513, 8872, 9390, 9904, 3891, 3893, 225401, 3897, 9915, 224264,
    966, 9671, 9672, 221217, 3961, 221214, 225792, 225454, 225752, 227194,
    3324,
]
len(icds)

10    82535
9      3888
Name: ICD_VERSION, dtype: int64


31

In [17]:
# Build a per-ICU-stay 0/1 matrix of procedure features from two sources:
# PROCEDURES_ICD (keyed by HADM_ID) and PROCEDUREEVENTS_MV (keyed by ICUSTAY_ID).
# Depends on `df` and on `icds` holding the procedure identifier list.
III_p_raw = pd.read_csv('input/PROCEDURES_ICD.csv.gz')
III_p_raw = III_p_raw[['SUBJECT_ID', 'HADM_ID','ICD9_CODE']]
III_p_raw.columns = ['SUBJECT_ID', 'HADM_ID','ICD_CODE']
III_p_raw = III_p_raw.drop_duplicates(keep='first')
print('III_p_raw',III_p_raw.shape,len(III_p_raw.HADM_ID.unique()))

# Count selected codes per admission; absent (admission, code) pairs stay NaN
III_p_raw_result = III_p_raw[III_p_raw.ICD_CODE.isin(icds)][['HADM_ID','ICD_CODE']].pivot_table(index='HADM_ID', columns='ICD_CODE', aggfunc='size', fill_value=np.nan)
III_p_raw_result = III_p_raw_result.reset_index()

# Convert counts to presence flags: 1 if the code occurred, 0 otherwise
III_p_raw_result.loc[:, III_p_raw_result.columns != 'HADM_ID'] = III_p_raw_result.loc[:, III_p_raw_result.columns != 'HADM_ID'].notna().astype(int)
print('III_p_raw_result',III_p_raw_result.shape,len(III_p_raw_result.HADM_ID.unique()))

# Restrict to admissions present in the cohort table
III_p_raw_result = III_p_raw_result[III_p_raw_result.HADM_ID.isin(df.HADM_ID)]
print('III_p_raw_result',III_p_raw_result.shape,len(III_p_raw_result.HADM_ID.unique()))

# Identifiers from `icds` that did not become a PROCEDURES_ICD column are
# looked up as ITEMIDs in PROCEDUREEVENTS_MV instead
event_p = set(icds)-set(III_p_raw_result.columns)
print('Add procedure from event:',len(event_p))
proc = pd.read_csv('input/PROCEDUREEVENTS_MV.csv.gz')
print(proc.shape,len(proc.ICUSTAY_ID.unique()))
proc = proc[['ICUSTAY_ID', 'STARTTIME', 'ENDTIME','ITEMID', 'VALUE','ORDERCATEGORYNAME']]
# Keep events of cohort ICU stays only
proc = proc[proc['ICUSTAY_ID'].isin(df['ICUSTAY_ID'].unique())]
print(proc.shape,len(proc.ICUSTAY_ID.unique()))
# Keep the remaining selected ITEMIDs and drop events without a VALUE
proc = proc[proc['ITEMID'].isin(event_p)]
proc = proc.dropna(subset=['VALUE'])
print(proc.shape,len(proc.ICUSTAY_ID.unique()))

# Keep events whose STARTTIME lies between the ICU INTIME and OUTTIME (inclusive)
proc['STARTTIME'] = pd.to_datetime(proc['STARTTIME'])
proc['ENDTIME'] = pd.to_datetime(proc['ENDTIME'])
proc = pd.merge(proc,df[['ICUSTAY_ID', 'INTIME', 'OUTTIME']],on='ICUSTAY_ID',how='left')
within_criteria = proc['STARTTIME'].between(proc['INTIME'],proc['OUTTIME'])
within = proc[within_criteria]
print(within.shape)

# One row per (ICU stay, ITEMID)
within = within[['ICUSTAY_ID','ITEMID']]
within = within.drop_duplicates(keep='first')
print(within.shape)

# Pivot to one row per ICU stay and one column per ITEMID
within_result = within.pivot_table(index='ICUSTAY_ID', columns='ITEMID', aggfunc='size', fill_value=np.nan)
within_result = within_result.reset_index()

# Convert counts to presence flags: 1 if the item occurred, 0 otherwise
within_result.loc[:, within_result.columns != 'ICUSTAY_ID'] = within_result.loc[:, within_result.columns != 'ICUSTAY_ID'].notna().astype(int)
print('within_result',within_result.shape,len(within_result.ICUSTAY_ID.unique()))

# `proc` was already limited to cohort ICU stays above, so this filter is
# not expected to remove rows
within_result = within_result[within_result.ICUSTAY_ID.isin(df.ICUSTAY_ID)]
print('within_result',within_result.shape,len(within_result.ICUSTAY_ID.unique()))

# Combine both sources on the cohort's (HADM_ID, ICUSTAY_ID) pairs
final_p = df[['HADM_ID','ICUSTAY_ID']]
final_p = pd.merge(final_p,III_p_raw_result,on='HADM_ID',how='left')
final_p = pd.merge(final_p,within_result,on='ICUSTAY_ID',how='left')
print(final_p.shape,len(final_p.HADM_ID.unique()),len(final_p.ICUSTAY_ID.unique()))

# Drop stays with no procedure information from either source, then treat
# the remaining missing flags as 0
final_p = final_p[~final_p[final_p.columns.difference(['HADM_ID', 'ICUSTAY_ID'])].isna().all(axis=1)]
final_p = final_p.fillna(0)
print(final_p.shape,len(final_p.HADM_ID.unique()),len(final_p.ICUSTAY_ID.unique()))

# Column labels are integers at this point; cast to str before adding the 'Pro_' prefix
final_p.columns = final_p.columns.astype(str)
final_p.columns = ['Pro_' + col if (col != 'HADM_ID')&(col != 'ICUSTAY_ID') else col for col in final_p.columns]
# ICUSTAY_ID is the only key kept for the final merge
final_p = final_p.drop('HADM_ID', axis=1)
final_p.head()

III_p_raw (228679, 3) 52243
III_p_raw_result (38568, 20) 38568
III_p_raw_result (24525, 20) 24525
Add procedure from event: 12
(258066, 25) 23402
(175106, 6) 13485
(62110, 6) 11470
(61823, 8)
(44244, 2)
within_result (11463, 13) 11463
within_result (11463, 13) 11463
(32557, 33) 32557 32557
(27340, 33) 27340 27340


Unnamed: 0,ICUSTAY_ID,Pro_966,Pro_3324,Pro_3615,Pro_3722,Pro_3723,Pro_3891,Pro_3893,Pro_3897,Pro_3961,...,Pro_224264,Pro_224270,Pro_224385,Pro_225401,Pro_225402,Pro_225454,Pro_225752,Pro_225792,Pro_225966,Pro_227194
0,211552.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,294638.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,228232.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,220597.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,232669.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Drugs

In [18]:
# Drug metadata; a later cell reads its `name` and `moldb_smiles` columns.
druginfo = pd.read_csv('input/drugbank_drugs_info.csv')

# RXCUI -> ATC4 lookup, reduced to one row per RXCUI (first occurrence wins).
rxnorm2atc4 = (
    pd.read_csv('input/RXCUI2atc4.csv')
    .drop(columns=['YEAR', 'MONTH', 'NDC'])
    .drop_duplicates(subset=['RXCUI'])
)

# Mapping stored as a Python literal in a text file; a later cell uses it to
# map NDC values to RXCUI. Parse it with ast.literal_eval rather than eval so
# that the file's contents can never be executed as code.
with open('input/rxnorm2RXCUI.txt', 'r') as f:
    rxnorm2RXCUI = ast.literal_eval(f.read())

# ATC codes to keep as drug features
drug_ids = ['A02AC', 'A02BA', 'A02BC', 'A04AA', 'A06AB', 'A06AD', 'A07AA', 'B01AA', 'B01AC', 'C01BD', 'C03CA', 'C07AB', 'C09AA', 'J01MA', 'N02BE', 'N05BA', 'R01AX']
print(len(drug_ids))

17


In [19]:
def ATC3toDrug(med_pd):
    """Group drug names by ATC code.

    Parameters
    ----------
    med_pd : pandas.DataFrame
        Table with at least the columns ``ATC3`` and ``DRUG``.

    Returns
    -------
    dict
        Maps each ``ATC3`` value to the set of ``DRUG`` values seen with it.
    """
    drugs_by_code = {}
    for atc_code, drug_name in med_pd[['ATC3', 'DRUG']].values:
        # setdefault creates the set the first time a code is encountered.
        drugs_by_code.setdefault(atc_code, set()).add(drug_name)
    return drugs_by_code

def atc3toSMILES(ATC3toDrugDict, druginfo):
    """Map each ATC code to up to three SMILES strings of its drugs.

    Parameters
    ----------
    ATC3toDrugDict : dict
        Maps an ATC code to an iterable (typically a set) of drug names.
    druginfo : pandas.DataFrame
        Table with the columns ``name`` and ``moldb_smiles``. Rows whose
        ``moldb_smiles`` is not a string (e.g. NaN) are ignored.

    Returns
    -------
    dict
        Maps an ATC code to a list of at most three SMILES strings. Codes for
        which no drug name has a known SMILES are omitted.

    Notes
    -----
    Drug names are visited in sorted order, so the (at most three) SMILES
    selected per code are the same on every run. Iterating the set directly
    would make the selection depend on string-hash ordering.
    """
    # name -> SMILES; on repeated names the last row wins.
    drug2smiles = {
        drugname: smiles
        for drugname, smiles in druginfo[['name', 'moldb_smiles']].values
        if isinstance(smiles, str)
    }

    atc3tosmiles = {}
    for atc3, drugs in ATC3toDrugDict.items():
        # key=str keeps the sort well-defined even if a name is not a string.
        found = [
            drug2smiles[drugname]
            for drugname in sorted(drugs, key=str)
            if drugname in drug2smiles
        ]
        if found:
            atc3tosmiles[atc3] = found[:3]

    return atc3tosmiles

In [20]:
# Build a per-ICU-stay 0/1 matrix of ATC drug classes from MIMIC-III
# prescriptions that start between ICU discharge and hospital discharge.
# Depends on `df`, `rxnorm2RXCUI`, `rxnorm2atc4`, `drug_ids`, `druginfo`,
# `ATC3toDrug` and `atc3toSMILES` from earlier cells.

# Load prescriptions (NDC read as a categorical column) and keep cohort admissions
III_PRESCRIPTIONS = pd.read_csv('input/PRESCRIPTIONS.csv.gz', dtype={'NDC':'category'})
print(III_PRESCRIPTIONS.shape)
III_PRESCRIPTIONS = III_PRESCRIPTIONS[III_PRESCRIPTIONS.HADM_ID.isin(df.HADM_ID)]
print(III_PRESCRIPTIONS.shape)

III_PRESCRIPTIONS['STARTDATE'] = pd.to_datetime(III_PRESCRIPTIONS['STARTDATE'])
III_PRESCRIPTIONS['ENDDATE'] = pd.to_datetime(III_PRESCRIPTIONS['ENDDATE'])

# Rows that already carry an ICUSTAY_ID: keep those pointing at a cohort stay
III_PRESCRIPTIONS_notna = III_PRESCRIPTIONS[~III_PRESCRIPTIONS['ICUSTAY_ID'].isna()]
III_PRESCRIPTIONS_notna = III_PRESCRIPTIONS_notna[III_PRESCRIPTIONS_notna.ICUSTAY_ID.isin(df.ICUSTAY_ID)]
print(III_PRESCRIPTIONS_notna.shape)

# Rows without an ICUSTAY_ID: take the ICUSTAY_ID of the cohort row with the same HADM_ID
III_PRESCRIPTIONS_isna = III_PRESCRIPTIONS[III_PRESCRIPTIONS['ICUSTAY_ID'].isna()]
III_PRESCRIPTIONS_isna = III_PRESCRIPTIONS_isna.drop(['ICUSTAY_ID'], axis=1)
III_PRESCRIPTIONS_isna = pd.merge(III_PRESCRIPTIONS_isna, df[['HADM_ID','ICUSTAY_ID']], on='HADM_ID', how='left')
III_PRESCRIPTIONS_isna = III_PRESCRIPTIONS_isna[III_PRESCRIPTIONS_isna.ICUSTAY_ID.isin(df.ICUSTAY_ID)]
print(III_PRESCRIPTIONS_isna.shape)

III_PRESCRIPTIONS = pd.concat([III_PRESCRIPTIONS_notna,III_PRESCRIPTIONS_isna])
III_PRESCRIPTIONS = III_PRESCRIPTIONS[III_PRESCRIPTIONS.ICUSTAY_ID.isin(df.ICUSTAY_ID)]
print(III_PRESCRIPTIONS.shape)

# Bring in the stay/admission timestamps of each prescription's ICU stay
III_PRESCRIPTIONS = pd.merge(III_PRESCRIPTIONS, df.drop(['HADM_ID'], axis=1), on='ICUSTAY_ID', how='left')

# Day-level comparisons. The timestamps are truncated to midnight with
# .dt.normalize() so both sides stay datetime64. Comparing `.dt.date` objects
# with a datetime64 column only worked in pandas 1.x under a FutureWarning and
# raises TypeError in pandas 2.x.
startdate = III_PRESCRIPTIONS['STARTDATE']
intime_day = III_PRESCRIPTIONS['INTIME'].dt.normalize()
outtime_day = III_PRESCRIPTIONS['OUTTIME'].dt.normalize()
dischtime_day = III_PRESCRIPTIONS['DISCHTIME'].dt.normalize()
III_PRESCRIPTIONS['STARTDATE_IN_RANGE'] = (intime_day <= startdate) & (startdate <= outtime_day)
III_PRESCRIPTIONS['STARTDATE_afterICU'] = (outtime_day <= startdate) & (startdate <= dischtime_day)
del startdate, intime_day, outtime_day, dischtime_day

# Keep prescriptions started from the ICU-discharge day up to the hospital-discharge day
III_PRESCRIPTIONS = III_PRESCRIPTIONS[III_PRESCRIPTIONS['STARTDATE_afterICU']]
III_PRESCRIPTIONS = III_PRESCRIPTIONS[['HADM_ID','ICUSTAY_ID','DRUG','NDC']]
print(III_PRESCRIPTIONS.shape)

# Step 1: keep only drugs that have at least one non-missing NDC
# (after dropna the per-group filter is always true; it is kept as written)
drugs_with_non_empty_ndc = III_PRESCRIPTIONS.dropna(subset=['NDC']).groupby('DRUG').filter(lambda x: x['NDC'].notna().any())
III_PRESCRIPTIONS = III_PRESCRIPTIONS[III_PRESCRIPTIONS['DRUG'].isin(drugs_with_non_empty_ndc['DRUG'])]

# Step 2: most frequent NDC per drug name
most_common_ndc = III_PRESCRIPTIONS.dropna(subset=['NDC']).groupby('DRUG')['NDC'].agg(lambda x: x.mode()[0])

# Step 3: fill a missing NDC with the most frequent NDC of the same drug name
III_PRESCRIPTIONS['NDC'] = III_PRESCRIPTIONS.apply(lambda row: most_common_ndc[row['DRUG']] if pd.isna(row['NDC']) else row['NDC'], axis=1)

# Drop the placeholder NDC '0' and exact duplicate rows
III_PRESCRIPTIONS = III_PRESCRIPTIONS.reset_index(drop=True)
III_PRESCRIPTIONS = III_PRESCRIPTIONS[~(III_PRESCRIPTIONS.NDC=='0')]
III_PRESCRIPTIONS = III_PRESCRIPTIONS.drop_duplicates(keep='first')
III_PRESCRIPTIONS = III_PRESCRIPTIONS.reset_index(drop=True)
print(III_PRESCRIPTIONS.shape,len(III_PRESCRIPTIONS.HADM_ID.unique()),len(III_PRESCRIPTIONS.ICUSTAY_ID.unique()))

III_PRESCRIPTIONS = III_PRESCRIPTIONS[['ICUSTAY_ID','DRUG','NDC']]
III_PRESCRIPTIONS = III_PRESCRIPTIONS.drop_duplicates(keep='first')
print(III_PRESCRIPTIONS.shape)

# NDC -> RXCUI; rows with any missing value (including an unmapped NDC) are dropped
III_PRESCRIPTIONS['RXCUI'] = III_PRESCRIPTIONS['NDC'].map(rxnorm2RXCUI)
III_PRESCRIPTIONS = III_PRESCRIPTIONS.dropna()
print(III_PRESCRIPTIONS.shape)

# Discard empty RXCUI strings before casting to integer for the join below
III_PRESCRIPTIONS = III_PRESCRIPTIONS[~(III_PRESCRIPTIONS.RXCUI == '')]
III_PRESCRIPTIONS['RXCUI'] = III_PRESCRIPTIONS['RXCUI'].astype('int64')
III_PRESCRIPTIONS = III_PRESCRIPTIONS.reset_index(drop=True)
print(III_PRESCRIPTIONS.shape)

# RXCUI -> ATC4 (inner join); the ATC4 column is renamed to ATC3 for the helper functions
III_PRESCRIPTIONS = III_PRESCRIPTIONS.merge(rxnorm2atc4, on=['RXCUI'])
III_PRESCRIPTIONS = III_PRESCRIPTIONS.rename(columns={'ATC4': 'ATC3'})
print(III_PRESCRIPTIONS.shape)

# Keep the selected ATC codes only
III_PRESCRIPTIONS = III_PRESCRIPTIONS[III_PRESCRIPTIONS.ATC3.isin(drug_ids)]
print(III_PRESCRIPTIONS.shape,len(III_PRESCRIPTIONS.ATC3.unique()),len(III_PRESCRIPTIONS.NDC.unique()))

III_atc3toDrug = ATC3toDrug(III_PRESCRIPTIONS)
print(len(III_atc3toDrug))

III_SMIL = atc3toSMILES(III_atc3toDrug,druginfo)
print(len(III_SMIL))

# Keep ATC codes that have at least one SMILES and attach the tab-joined SMILES.
# .assign returns a new frame, so nothing is written into a filtered slice.
III_med_pd = III_PRESCRIPTIONS[III_PRESCRIPTIONS.ATC3.isin(III_SMIL.keys())]
print(III_med_pd.shape,len(III_med_pd.ATC3.unique()),len(III_med_pd.NDC.unique()))
III_med_pd = III_med_pd.assign(SMILES=III_med_pd['ATC3'].map(lambda x: '\t'.join(III_SMIL[x])))
print(III_med_pd.shape,len(III_med_pd.ATC3.unique()),len(III_med_pd.NDC.unique()))

# One row per (ICU stay, ATC code), pivoted to one column per ATC code
III_m = III_med_pd[['ICUSTAY_ID','ATC3']]
III_m = III_m.drop_duplicates(keep='first')

III_m = III_m.pivot_table(index='ICUSTAY_ID', columns='ATC3', aggfunc='size', fill_value=np.nan)
III_m = III_m.reset_index()

# Convert counts to presence flags: 1 if the ATC code occurred, 0 otherwise
III_m.loc[:, III_m.columns != 'ICUSTAY_ID'] = III_m.loc[:, III_m.columns != 'ICUSTAY_ID'].notna().astype(int)
print(III_m.shape)

# Prefix every feature column with 'Drug_' (identifier columns are left as-is)
III_m.columns = ['Drug_' + col if col not in ('HADM_ID', 'ICUSTAY_ID') else col for col in III_m.columns]
III_m.head()

(4156450, 19)
(2831184, 19)
(1805822, 19)
(909406, 19)
(2715228, 19)
(782597, 4)
(457345, 4) 28167 28167
(457345, 3)
(457345, 4)
(453549, 4)
(360734, 5)
(145875, 5) 17 302
17
17
(145875, 5) 17 302
(145875, 6) 17 302
(25420, 18)


Unnamed: 0,ICUSTAY_ID,Drug_A02AC,Drug_A02BA,Drug_A02BC,Drug_A04AA,Drug_A06AB,Drug_A06AD,Drug_A07AA,Drug_B01AA,Drug_B01AC,Drug_C01BD,Drug_C03CA,Drug_C07AB,Drug_C09AA,Drug_J01MA,Drug_N02BE,Drug_N05BA,Drug_R01AX
0,200003.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,200007.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,200009.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
3,200014.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
4,200021.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## Merge

In [21]:
# Show the column index of each feature table before merging:
# diagnoses, procedures, then drugs.
for feature_table in (III_diagnoses_result, final_p, III_m):
    print(feature_table.columns)

Index(['HADM_ID', 'Diag_9971', 'Diag_A40', 'Diag_D64', 'Diag_D69', 'Diag_E11',
       'Diag_E66', 'Diag_E78', 'Diag_E87', 'Diag_E89', 'Diag_F10', 'Diag_F32',
       'Diag_F41', 'Diag_G47', 'Diag_I10', 'Diag_I12', 'Diag_I21', 'Diag_I25',
       'Diag_I34', 'Diag_I47', 'Diag_I50', 'Diag_I95', 'Diag_J18', 'Diag_J44',
       'Diag_J45', 'Diag_J69', 'Diag_J98', 'Diag_K22', 'Diag_K70', 'Diag_K92',
       'Diag_N17', 'Diag_N18', 'Diag_N39', 'Diag_R00', 'Diag_R09', 'Diag_R78',
       'Diag_T78', 'Diag_Z85', 'Diag_Z95'],
      dtype='object')
Index(['ICUSTAY_ID', 'Pro_966', 'Pro_3324', 'Pro_3615', 'Pro_3722', 'Pro_3723',
       'Pro_3891', 'Pro_3893', 'Pro_3897', 'Pro_3961', 'Pro_3995', 'Pro_4513',
       'Pro_8856', 'Pro_8872', 'Pro_9390', 'Pro_9604', 'Pro_9671', 'Pro_9672',
       'Pro_9904', 'Pro_9915', 'Pro_221214', 'Pro_221217', 'Pro_224264',
       'Pro_224270', 'Pro_224385', 'Pro_225401', 'Pro_225402', 'Pro_225454',
       'Pro_225752', 'Pro_225792', 'Pro_225966', 'Pro_227194'],
      dt

In [22]:
# Left-join the three feature tables onto the cohort: diagnoses by HADM_ID,
# procedures and drugs by ICUSTAY_ID.
Final_III = (
    df.merge(III_diagnoses_result, on='HADM_ID', how='left')
      .merge(final_p, on='ICUSTAY_ID', how='left')
      .merge(III_m, on='ICUSTAY_ID', how='left')
)
print(Final_III.shape, len(Final_III['HADM_ID'].unique()), len(Final_III['ICUSTAY_ID'].unique()))

# Keep stays that have at least one non-missing feature value, then fill
# every remaining missing value with 0.
# NOTE(review): fillna(0) is applied to the whole frame, so missing values in
# the cohort columns (e.g. DOD) also become 0 - confirm this is intended.
feature_columns = Final_III.columns.difference(df.columns)
has_any_feature = Final_III[feature_columns].notna().any(axis=1)
Final_III = Final_III[has_any_feature].fillna(0)
print(Final_III.shape, len(Final_III['HADM_ID'].unique()), len(Final_III['ICUSTAY_ID'].unique()))
Final_III['DIEINHOSPITAL'].value_counts()

(32557, 111) 32557 32557
(32412, 111) 32412 32412


0    28826
1     3586
Name: DIEINHOSPITAL, dtype: int64

In [23]:
# Preview the first rows of the merged table.
Final_III.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,GENDER,ADMITTIME,INTIME,DISCHTIME,OUTTIME,DOB,DOD,...,Drug_B01AA,Drug_B01AC,Drug_C01BD,Drug_C03CA,Drug_C07AB,Drug_C09AA,Drug_J01MA,Drug_N02BE,Drug_N05BA,Drug_R01AX
0,3,145834,211552.0,M,2101-10-20 19:08:00,2101-10-20 19:10:11,2101-10-31 13:58:00,2101-10-26 20:43:09,2025-04-11,2102-06-14 00:00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,185777,294638.0,F,2191-03-16 00:28:00,2191-03-16 00:29:31,2191-03-23 18:41:00,2191-03-17 16:46:31,2143-05-12,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,107064,228232.0,F,2175-05-30 07:15:00,2175-05-30 21:30:54,2175-06-15 16:00:00,2175-06-03 13:39:54,2109-06-21,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,150750,220597.0,M,2149-11-09 13:06:00,2149-11-09 13:07:02,2149-11-14 10:15:00,2149-11-14 20:52:14,2108-01-26,2149-11-14 00:00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11,194540,229441.0,F,2178-04-16 06:18:00,2178-04-16 06:19:32,2178-05-11 19:00:00,2178-04-17 20:21:05,2128-02-22,2178-11-14 00:00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [24]:
# Write the merged table to CSV without the DataFrame index.
Final_III.to_csv('output/Final_III_0.csv',index=False)