In [1]:
import os 
import numpy as np
import pandas as pd
import sys
from tqdm import tqdm
from pathlib import Path
import os
import importlib
import warnings

# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.set_option('mode.chained_assignment',  None) 
# Hide every FutureWarning raised while the notebook runs.
warnings.simplefilter(action='ignore', category=FutureWarning) 

# Base directory: the loaders below read "<local>/cohort/..." and "<local>/features/...",
# and the tabularization cell writes per-stay folders under "<local>/csv/".
local = '/Users/DAHS/Desktop/circ_mimic_preprocessing_1day/data'
# Prefix that get_stay_id concatenates with "mimiciv/2.2/icu/icustays.csv.gz";
# it is joined with plain "+", so it has to keep its trailing "/".
root_dir = '/Users/DAHS/MIMIC-IV-Data-Pipeline/MIMIC_pipeline/'

In [3]:
def generate_adm():
    """Load the ICU cohort and derive each stay's length of stay in whole hours.

    Reads ``<local>/cohort/cohort_icu_mortality_0_.csv.gz``, parses ``intime`` and
    ``outtime`` as datetimes and stores ``floor((outtime - intime) / 1 hour)`` in
    ``los``. Stays whose ``los`` is not strictly positive are dropped and ``Age``
    is cast to ``int``.

    Returns:
        pandas.DataFrame: the cohort rows with ``los`` > 0.
    """
    data = pd.read_csv(local + "/cohort/cohort_icu_mortality_0_.csv.gz",
                       compression='gzip', header=0, index_col=None)
    data['intime'] = pd.to_datetime(data['intime'])
    data['outtime'] = pd.to_datetime(data['outtime'])

    # Floor-divide the timedelta by one hour instead of formatting it as text and
    # splitting on ' ' / ':'.  Same value (days*24 + hours), but it does not rely on
    # positional arguments to Series.str.split, which pandas 2.0 made keyword-only.
    data['los'] = (data['outtime'] - data['intime']) // pd.Timedelta(hours=1)

    # .copy() so the Age assignment below writes to an independent frame.
    data = data[data['los'] > 0].copy()
    data['Age'] = data['Age'].astype(int)

    return data

def generate_proc():
    """Load ICU procedure events for the cohort held in the global ``data``.

    Reads ``<local>/features/preproc_proc_icu.csv.gz``, keeps rows whose
    ``stay_id`` appears in ``data``, converts ``event_time_from_admit`` to whole
    hours (``start_time``), and drops events that start before hour 0 or at/after
    the stay's ``los``.

    Returns:
        pandas.DataFrame: filtered events with the added ``start_time`` and
        ``los`` columns (``los`` is left in place, as before).
    """
    proc = pd.read_csv(local + "/features/preproc_proc_icu.csv.gz",
                       compression='gzip', header=0, index_col=None)
    proc = proc[proc['stay_id'].isin(data['stay_id'])].copy()

    # Parse the offset as a timedelta and floor it to hours.  The previous
    # split-on-' '/':' parser passed n=-1 positionally to Series.str.split, which
    # raises TypeError on pandas >= 2.0.
    # assumes values look like "0 days 01:02:03" (what the old parser required) - TODO confirm
    proc['start_time'] = pd.to_timedelta(proc['event_time_from_admit']) // pd.Timedelta(hours=1)
    proc = proc[proc['start_time'] >= 0]

    # Remove events recorded at or after the end of the stay.
    proc = pd.merge(proc, data[['stay_id', 'los']], on='stay_id', how='left')
    proc = proc[proc['los'] - proc['start_time'] > 0]

    return proc


def generate_out():
    """Load ICU output events for the cohort held in the global ``data``.

    Reads ``<local>/features/preproc_out_icu.csv.gz``, keeps rows whose
    ``stay_id`` appears in ``data``, converts ``event_time_from_admit`` to whole
    hours (``start_time``), and drops events that start before hour 0 or at/after
    the stay's ``los``.

    Returns:
        pandas.DataFrame: filtered events with the added ``start_time`` and
        ``los`` columns (``los`` is left in place, as before).
    """
    out = pd.read_csv(local + "/features/preproc_out_icu.csv.gz",
                      compression='gzip', header=0, index_col=None)
    out = out[out['stay_id'].isin(data['stay_id'])].copy()

    # Timedelta floor division replaces the split-based parser, whose positional
    # n=-1 argument to Series.str.split is rejected by pandas >= 2.0.
    # assumes values look like "0 days 01:02:03" (what the old parser required) - TODO confirm
    out['start_time'] = pd.to_timedelta(out['event_time_from_admit']) // pd.Timedelta(hours=1)
    out = out[out['start_time'] >= 0]

    # Remove events recorded at or after the end of the stay.
    out = pd.merge(out, data[['stay_id', 'los']], on='stay_id', how='left')
    out = out[out['los'] - out['start_time'] > 0]

    return out
    
    
def generate_chart():
    """Load ICU chart events for the cohort in the global ``data``, chunk by chunk.

    Streams ``<local>/features/preproc_chart_icu.csv.gz`` in chunks of 5,000,000
    rows. Each chunk is restricted to cohort ``stay_id`` values,
    ``event_time_from_admit`` is converted to whole hours (``start_time``) and then
    dropped, and events before hour 0 or at/after the stay's ``los`` are removed.

    Returns:
        pandas.DataFrame: all surviving rows with a fresh 0..n-1 index (the index
        is now reset even when only one chunk contributes rows); an empty frame
        when nothing survives.
    """
    chunksize = 5000000
    reader = pd.read_csv(local + "/features/preproc_chart_icu.csv.gz", compression='gzip',
                         header=0, index_col=None, chunksize=chunksize)

    chunks = []
    for chart in tqdm(reader):
        chart = chart[chart['stay_id'].isin(data['stay_id'])].copy()
        # Timedelta floor division replaces the split-based parser, whose positional
        # n=-1 argument to Series.str.split is rejected by pandas >= 2.0.
        # assumes values look like "0 days 01:02:03" (what the old parser required) - TODO confirm
        chart['start_time'] = pd.to_timedelta(chart['event_time_from_admit']) // pd.Timedelta(hours=1)
        chart = chart.drop(columns=['event_time_from_admit'])
        chart = chart[chart['start_time'] >= 0]

        # Remove events recorded at or after the end of the stay.
        chart = pd.merge(chart, data[['stay_id', 'los']], on='stay_id', how='left')
        chart = chart[chart['los'] - chart['start_time'] > 0]
        chart = chart.drop(columns=['los'])

        chunks.append(chart)

    # DataFrame.append no longer exists in pandas 2.0; gather the chunks and
    # concatenate once, which also avoids re-copying the accumulated frame per chunk.
    kept = [chunk for chunk in chunks if not chunk.empty]
    if not kept:
        # Nothing survived: hand back the last (empty) chunk so column names are
        # preserved, or a bare frame when the file produced no chunks at all.
        return chunks[-1] if chunks else pd.DataFrame()
    return pd.concat(kept, ignore_index=True)
    

def generate_labs():
    """Load lab events for the cohort in the global ``data``, chunk by chunk.

    Streams ``<local>/features/preproc_labs.csv.gz`` in chunks of 10,000,000 rows.
    Each chunk is restricted to cohort ``hadm_id`` values, ``lab_time_from_admit``
    is converted to whole hours (``start_time``), and events before hour 0 or
    at/after ``los`` are removed.

    Returns:
        pandas.DataFrame: all surviving rows with a fresh 0..n-1 index (the index
        is now reset even when only one chunk contributes rows), still carrying
        ``lab_time_from_admit`` and ``los``; an empty frame when nothing survives.
    """
    chunksize = 10000000
    reader = pd.read_csv(local + "/features/preproc_labs.csv.gz", compression='gzip',
                         header=0, index_col=None, chunksize=chunksize)

    chunks = []
    for labs in tqdm(reader):
        labs = labs[labs['hadm_id'].isin(data['hadm_id'])].copy()
        # Timedelta floor division replaces the split-based parser, whose positional
        # n=-1 argument to Series.str.split is rejected by pandas >= 2.0.
        # assumes values look like "0 days 01:02:03" (what the old parser required) - TODO confirm
        labs['start_time'] = pd.to_timedelta(labs['lab_time_from_admit']) // pd.Timedelta(hours=1)
        labs = labs[labs['start_time'] >= 0]

        # Remove events recorded at or after the end of the stay.
        # NOTE(review): this merge is keyed on hadm_id; if `data` holds several stays
        # for one hadm_id, every lab row is repeated once per stay - confirm intended.
        labs = pd.merge(labs, data[['hadm_id', 'los']], on='hadm_id', how='left')
        labs = labs[labs['los'] - labs['start_time'] > 0]

        chunks.append(labs)

    # DataFrame.append no longer exists in pandas 2.0; gather the chunks and
    # concatenate once.
    kept = [chunk for chunk in chunks if not chunk.empty]
    if not kept:
        # Nothing survived: keep the column names when at least one chunk was read.
        return chunks[-1] if chunks else pd.DataFrame()
    return pd.concat(kept, ignore_index=True)
    
    
def generate_meds():
    """Load ICU medication events for the cohort held in the global ``data``.

    Reads ``<local>/features/preproc_med_icu.csv.gz`` and converts
    ``start_hours_from_admit`` / ``stop_hours_from_admit`` to whole hours
    (``start_time`` / ``stop_time``). A row is kept when it stops in a later hour
    than it starts, belongs to a cohort ``stay_id`` and starts before the stay's
    ``los``. ``stop_time`` is capped at ``los``; ``rate`` and ``amount`` are
    coerced to numbers (unparseable values become NaN).

    Returns:
        pandas.DataFrame: the filtered medication rows (``los`` is not included).
    """
    meds = pd.read_csv(local + "/features/preproc_med_icu.csv.gz",
                       compression='gzip', header=0, index_col=None)

    # Timedelta floor division replaces the split-based parser, whose positional
    # n=-1 argument to Series.str.split is rejected by pandas >= 2.0.
    # assumes values look like "0 days 01:02:03" (what the old parser required) - TODO confirm
    one_hour = pd.Timedelta(hours=1)
    meds['start_time'] = pd.to_timedelta(meds['start_hours_from_admit']) // one_hour
    meds['stop_time'] = pd.to_timedelta(meds['stop_hours_from_admit']) // one_hour

    # Sanity check: the administration must end in a later hour than it starts.
    meds = meds[meds['stop_time'] - meds['start_time'] > 0]

    # Keep only stays that are present in the cohort and attach their los.
    meds = meds[meds['stay_id'].isin(data['stay_id'])]
    meds = pd.merge(meds, data[['stay_id', 'los']], on='stay_id', how='left')

    # Remove rows whose start is at or after the end of the stay.
    meds = meds[meds['los'] - meds['start_time'] > 0].copy()

    # Any stop_time after the end of the stay is set to the end of the stay.
    runs_past_end = meds['stop_time'] > meds['los']
    meds.loc[runs_past_end, 'stop_time'] = meds.loc[runs_past_end, 'los']
    del meds['los']

    # Vectorized coercion instead of calling pd.to_numeric once per element.
    meds['rate'] = pd.to_numeric(meds['rate'], errors='coerce')
    meds['amount'] = pd.to_numeric(meds['amount'], errors='coerce')

    return meds
    
def generate_ing():
    """Load ICU ingredient events for the cohort held in the global ``data``.

    Reads ``<local>/features/preproc_ing_icu.csv.gz`` and converts
    ``start_hours_from_admit`` / ``stop_hours_from_admit`` to whole hours
    (``start_time`` / ``stop_time``). A row is kept when it stops in a later hour
    than it starts, belongs to a cohort ``stay_id`` and starts before the stay's
    ``los``. ``stop_time`` is capped at ``los``; ``rate`` and ``amount`` are
    coerced to numbers (unparseable values become NaN).

    Returns:
        pandas.DataFrame: the filtered ingredient rows (``los`` is not included).
    """
    ing = pd.read_csv(local + "/features/preproc_ing_icu.csv.gz",
                      compression='gzip', header=0, index_col=None)

    # Timedelta floor division replaces the split-based parser, whose positional
    # n=-1 argument to Series.str.split is rejected by pandas >= 2.0.
    # assumes values look like "0 days 01:02:03" (what the old parser required) - TODO confirm
    one_hour = pd.Timedelta(hours=1)
    ing['start_time'] = pd.to_timedelta(ing['start_hours_from_admit']) // one_hour
    ing['stop_time'] = pd.to_timedelta(ing['stop_hours_from_admit']) // one_hour

    # Sanity check: the event must end in a later hour than it starts.
    ing = ing[ing['stop_time'] - ing['start_time'] > 0]

    # Keep only stays that are present in the cohort and attach their los.
    ing = ing[ing['stay_id'].isin(data['stay_id'])]
    ing = pd.merge(ing, data[['stay_id', 'los']], on='stay_id', how='left')

    # Remove rows whose start is at or after the end of the stay.
    ing = ing[ing['los'] - ing['start_time'] > 0].copy()

    # Any stop_time after the end of the stay is set to the end of the stay.
    runs_past_end = ing['stop_time'] > ing['los']
    ing.loc[runs_past_end, 'stop_time'] = ing.loc[runs_past_end, 'los']
    del ing['los']

    # Vectorized coercion instead of calling pd.to_numeric once per element.
    ing['rate'] = pd.to_numeric(ing['rate'], errors='coerce')
    ing['amount'] = pd.to_numeric(ing['amount'], errors='coerce')

    return ing


def get_stay_id(labs, chart):
    """Tag every lab row with the ICU stay whose [intime, outtime] contains it.

    ICU stays are read from ``<root_dir>mimiciv/2.2/icu/icustays.csv.gz``. For each
    ``subject_id`` in ``labs`` the rows are sorted by ``charttime`` and each row
    whose ``charttime`` lies inside one of that subject's stays (bounds inclusive)
    receives that ``stay_id``. If a timestamp falls inside more than one stay, the
    stay listed later in the file wins.

    Side effects kept from the earlier version: ``labs['charttime']`` and
    ``chart['charttime']`` are converted to datetimes in place and ``labs`` gains
    a ``stay_id`` column initialised to NaN.

    Args:
        labs: lab events with at least ``subject_id`` and ``charttime``.
        chart: chart events; only its ``charttime`` column is touched.

    Returns:
        pandas.DataFrame: the lab rows that fall inside an ICU stay, each exactly
        once. The index restarts at 0 for every subject.
    """
    stay = pd.read_csv(root_dir+"mimiciv/2.2"+"/icu/icustays.csv.gz")
    # The former `stay = stay[stay.notna()]` only re-masked NaN with NaN, so it is gone.
    stay['intime'] = pd.to_datetime(stay['intime'])
    stay['outtime'] = pd.to_datetime(stay['outtime'])

    chart['charttime'] = pd.to_datetime(chart['charttime'])
    labs['charttime'] = pd.to_datetime(labs['charttime'])
    labs['stay_id'] = np.nan

    # Group once instead of scanning the whole frame for every subject.
    stays_by_subject = dict(tuple(stay.groupby('subject_id')))

    result = []
    for p_id, lab in tqdm(labs.groupby('subject_id', sort=False)):
        stay_interest = stays_by_subject.get(p_id)
        if stay_interest is None:
            # No ICU stay for this subject: none of its rows can be tagged.
            continue

        lab = lab.sort_values('charttime').reset_index(drop=True)
        times = lab['charttime'].values

        for stay_id, intime, outtime in zip(stay_interest['stay_id'].values,
                                            stay_interest['intime'].values,
                                            stay_interest['outtime'].values):
            in_stay = (times >= intime) & (times <= outtime)
            # Single .loc write; the chained `lab['stay_id'].loc[...] = ...` form
            # does not reach `lab` when pandas Copy-on-Write is active.
            lab.loc[in_stay, 'stay_id'] = stay_id

        # Append once per subject.  Appending inside the stay loop emitted the same
        # frame once per stay and duplicated every row of multi-stay patients.
        result.append(lab)

    if not result:
        # pd.concat([]) raises; return an empty frame with the same columns instead.
        return labs.iloc[0:0]

    result_df = pd.concat(result)
    labs = result_df[~(result_df['stay_id'].isnull())]
    return labs

# Build the module-level frames.  `data` has to exist first: every other generate_*
# function reads it as a global.  Only admissions and procedures are built here;
# the remaining loaders are commented out.
data = generate_adm()
# ing = generate_ing()
# chart = generate_chart()
# labs = generate_labs()
# labs = get_stay_id(labs, chart)
proc = generate_proc()
# out = generate_out()
# meds = generate_meds()

# Optional gzip checkpoints of the frames built above (all currently disabled).
# data.to_csv('check_point_data.csv', compression = 'gzip')
# ing.to_csv('check_point_ing.csv', compression = 'gzip')
# chart.to_csv('check_point_chart.csv', compression = 'gzip')
# labs.to_csv('check_point_labs.csv', compression = 'gzip')
# proc.to_csv('check_point_proc.csv', compression = 'gzip')
# # out.to_csv('check_point_out.csv', compression = 'gzip')
# meds.to_csv('check_point_meds.csv', compression = 'gzip')

In [40]:
# import pandas as pd
# import numpy as np
# from tqdm import tqdm

# data = pd.read_csv('check_point_data.csv', index_col = 0, compression = 'gzip')
# ing = pd.read_csv('check_point_ing.csv', index_col = 0, compression = 'gzip')
# chart = pd.read_csv('check_point_chart.csv', index_col = 0, compression = 'gzip')
# labs = pd.read_csv('check_point_labs.csv', index_col = 0, compression = 'gzip')
# proc = pd.read_csv('check_point_proc.csv', index_col = 0, compression = 'gzip')
# # out = pd.read_csv('check_point_out.csv', index_col = 0, compression = 'gzip')
# meds = pd.read_csv('check_point_meds.csv', index_col = 0, compression = 'gzip')

In [5]:
# Inclusive bounds, in hours, applied to `los` by cohort_restirction: 24 h to 1440 h.
include_start_time = 1*24
include_end_time =60*24

def cohort_restirction(include_start_time,include_end_time,data, proc):
    """Restrict the cohort to stays whose ``los`` lies in the inclusion window.

    Args:
        include_start_time: smallest ``los`` (inclusive) a stay may have.
        include_end_time: largest ``los`` (inclusive) a stay may have; procedure
            events starting after this value are dropped as well.
        data: admissions frame with ``los``, ``stay_id`` and ``subject_id``.
        proc: procedure events with ``stay_id`` and ``start_time``.

    Returns:
        tuple: ``(data, proc)`` after filtering. The window and the resulting
        patient / stay counts are printed.
    """
    print("include start time",include_start_time)
    print("include end time",include_end_time)

    # Both bounds are inclusive, as with the separate >= and <= filters.
    inside_window = data['los'].between(include_start_time, include_end_time)
    data = data[inside_window]

    stay_ids = data['stay_id'].unique()
    patient_ids = data.subject_id.unique()
    print('num of patient: ', len(patient_ids))
    print('num of stay: ', len(stay_ids))

    # Only procedures are restricted at the moment.  The disabled variants for
    # cond / meds / ing / out / chart / labs followed the same recipe: keep rows
    # whose stay_id is in `data`, keep start_time <= include_end_time and, for
    # meds / ing, cap stop_time at include_end_time.

    ###PROCS
    belongs_to_cohort = proc['stay_id'].isin(data['stay_id'])
    starts_in_window = proc['start_time'] <= include_end_time
    proc = proc[belongs_to_cohort & starts_in_window]

    return data, proc

# Apply the inclusion window; `data` and `proc` themselves are left unchanged.
data_new, proc_new = cohort_restirction(include_start_time,include_end_time ,data, proc)

include start time 24
include end time 1440
num of patient:  42253
num of stay:  57699


In [6]:
# Empty placeholders for the per-modality frames (final_out is disabled).
final_meds=pd.DataFrame()
final_ing=pd.DataFrame()
final_proc=pd.DataFrame()
# final_out=pd.DataFrame()
final_chart=pd.DataFrame()
final_labs=pd.DataFrame()


# Order events by start hour.  Note that this rebinds the global `proc` to the
# window-restricted, sorted procedures; the other modalities are disabled.
# meds=meds_new.sort_values(by=['start_time'])
# ing=ing_new.sort_values(by=['start_time'])
proc=proc_new.sort_values(by=['start_time'])
# out=out_new.sort_values(by=['start_time'])
# chart=chart_new.sort_values(by=['start_time'])
# labs=labs_new.sort_values(by=['start_time'])

# Distinct stay ids of the restricted cohort.
hids=data_new['stay_id'].unique()

In [6]:
# Stack the (stay_id, itemid) pairs seen in chart and lab events.
# NOTE(review): chart_new and labs_new are not assigned in any visible cell (only
# data_new and proc_new are), so this raises NameError on a fresh kernel - confirm.
sample_data = pd.concat([chart_new[['stay_id', 'itemid']], labs_new[['stay_id', 'itemid']]], axis = 0)

In [7]:
# Item ids that every retained stay must contain at least once.
required_item_ids = {220045, 225668, 50813, 220050, 220051}

# Count the distinct required items per stay and keep the stays that have all of them.
is_required_item = sample_data['itemid'].isin(required_item_ids)
required_items_per_stay = sample_data[is_required_item].groupby('stay_id')['itemid'].nunique()
has_every_item = required_items_per_stay == len(required_item_ids)
valid_stay_ids = required_items_per_stay[has_every_item].index

In [8]:
# Keep only the rows that belong to a stay in valid_stay_ids.
# NOTE(review): meds_new, ing_new, chart_new and labs_new are not assigned in any
# visible cell, so this cell cannot run on a fresh kernel as written - confirm.
meds_1 = meds_new[meds_new['stay_id'].isin(valid_stay_ids)]
ing_1 = ing_new[ing_new['stay_id'].isin(valid_stay_ids)]
proc_1 = proc_new[proc_new['stay_id'].isin(valid_stay_ids)]
# out_1 = out[out_new['stay_id'].isin(valid_stay_ids)]
chart_1 = chart_new[chart_new['stay_id'].isin(valid_stay_ids)]
labs_1 = labs_new[labs_new['stay_id'].isin(valid_stay_ids)]
data_1 = data_new[data_new['stay_id'].isin(valid_stay_ids)]

In [9]:
# Expected number of observations: total `los` hours summed over the retained stays.
data_1['los'].sum()

2998863

In [7]:
# Replace valid_stay_ids with the distinct stay ids found in the case-labeling
# output file; this overrides any value computed in earlier cells.
dataset = pd.read_csv('/Users/DAHS/MIMIC-IV-Data-Pipeline/MIMIC_pipeline/Case Labeling/mimic_df.csv.gz', index_col = 0, compression='gzip')
valid_stay_ids = dataset.stay_id.unique()

In [8]:
# Procedures restricted to the stays in valid_stay_ids.
proc_1 = proc_new[proc_new['stay_id'].isin(valid_stay_ids)]

In [11]:
import os

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('mode.chained_assignment',  None) 

# Source frames for each modality.  Only procedures are active in this run.
# final_meds = meds[(meds['start_time']>0)&(meds['stop_time']>0)].copy()
# final_ing= ing[(ing['start_time']>0)&(ing['stop_time']>0)].copy()
final_proc= proc.copy() 
# final_out= out.copy()
# final_chart= chart.copy()
# final_labs= labs.copy()

# Modality switches.  A switch may only be turned on once the matching final_*
# frame above holds real data (final_out is not even defined at this point).
feat_med = False
feat_ing = False
feat_proc = True
feat_out = False
feat_chart = False
impute = False
feat_lab = False


def _zero_block(los, feat, label):
    """Return a (los x len(feat)) frame of zeros with (label, itemid) columns."""
    block = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat)
    block.columns = pd.MultiIndex.from_product([[label], block.columns])
    return block


def _missing_hours(pivoted, los):
    """Return all-NaN rows for every hour in range(los) absent from `pivoted`."""
    add_indices = pd.Index(range(los)).difference(pivoted.index)
    return pd.DataFrame(index=add_indices, columns=pivoted.columns).fillna(np.nan)


def _finalize_block(block, feat, label, fill_zero=True):
    """Add the item columns this stay lacks, order columns as `feat`, label them."""
    feat_df = pd.DataFrame(columns=list(set(feat) - set(block.columns)))
    block = pd.concat([block, feat_df], axis=1)
    block = block[feat]
    if fill_zero:
        block = block.fillna(0)
    block.columns = pd.MultiIndex.from_product([[label], block.columns])
    return block


def _interval_block(events, los, feat, label):
    """MEDS / INGS: hourly `amount`, zeroed once the hour reaches the stop_time."""
    if events.shape[0] == 0:
        return _zero_block(los, feat, label)

    amount = events.pivot_table(index='start_time', columns='itemid', values='amount')
    stop = events.pivot_table(index='start_time', columns='itemid', values='stop_time')
    add_df = _missing_hours(stop, los)

    # Carry the last seen stop_time / amount forward over the hours without a row.
    stop = pd.concat([stop, add_df]).sort_index().ffill().fillna(0)
    amount = pd.concat([amount, add_df]).sort_index().ffill().fillna(0)

    # stop_time minus the row's hour is positive while the item is still running.
    stop.iloc[:, 0:] = stop.iloc[:, 0:].sub(stop.index, 0)
    stop[stop > 0] = 1
    stop[stop < 0] = 0

    amount.iloc[:, 0:] = stop.iloc[:, 0:] * amount.iloc[:, 0:]
    return _finalize_block(amount, feat, label)


def _proc_block(events, los, feat):
    """PROC: hourly 0/1 indicator per procedure item.

    NOTE(review): the stop_time table is reduced to 0/1 *before* the hour index is
    subtracted and is not forward-filled, so the mask can only stay positive on
    the hour-0 row.  MEDS / INGS subtract from the real stop_time instead.  The
    statement order is kept exactly as it was - confirm this is intended.
    """
    if events.shape[0] == 0:
        return _zero_block(los, feat, "PROC")

    events = events.assign(val=1)
    amount = events.pivot_table(index='start_time', columns='itemid', values='val')
    stop = events.pivot_table(index='start_time', columns='itemid', values='stop_time')
    add_df = _missing_hours(stop, los)

    stop = pd.concat([stop, add_df]).sort_index().fillna(0)
    stop[stop > 0] = 1

    amount = pd.concat([amount, add_df]).sort_index().ffill().fillna(0)

    stop.iloc[:, 0:] = stop.iloc[:, 0:].sub(stop.index, 0)
    stop[stop > 0] = 1
    stop[stop < 0] = 0

    amount.iloc[:, 0:] = stop.iloc[:, 0:] * amount.iloc[:, 0:]
    return _finalize_block(amount, feat, "PROC")


def _observation_block(events, los, feat, label, value_col, fill_zero, impute):
    """OUT / CHART / LAB: hourly mean of `value_col` per item.

    With `fill_zero` gaps become 0 (OUT).  Otherwise gaps stay NaN unless `impute`
    asks for a forward fill (CHART / LAB).  The 0/1 "observed" mask that used to be
    built alongside was never read afterwards, so only its index and columns are
    still used, to decide which hours have to be added.
    """
    if events.shape[0] == 0:
        return _zero_block(los, feat, label)

    val = events.pivot_table(index='start_time', columns='itemid', values=value_col)
    observed = events.assign(val=1).pivot_table(index='start_time', columns='itemid', values='val')
    add_df = _missing_hours(observed, los)

    val = pd.concat([val, add_df]).sort_index()
    if impute:
        val = val.ffill()
    if fill_zero:
        val = val.fillna(0)
    return _finalize_block(val, feat, label, fill_zero=fill_zero)


def _join_block(dyn_csv, block):
    """Append `block` column-wise; an empty accumulator is simply replaced."""
    if dyn_csv.empty:
        return block
    return pd.concat([dyn_csv, block], axis=1)


# The item vocabulary of a modality is the same for every stay: compute it once.
med_items = final_meds['itemid'].unique() if feat_med else None
ing_items = final_ing['itemid'].unique() if feat_ing else None
proc_items = final_proc['itemid'].unique() if feat_proc else None
out_items = final_out['itemid'].unique() if feat_out else None
chart_items = final_chart['itemid'].unique() if feat_chart else None
lab_items = final_labs['itemid'].unique() if feat_lab else None

# One lookup table instead of scanning `data` for every stay.  `data` is the full
# admissions frame here, not the window-restricted data_new.
los_by_stay = data.drop_duplicates(subset='stay_id').set_index('stay_id')['los']
unknown_stays = pd.Index(valid_stay_ids).difference(los_by_stay.index)
if len(unknown_stays):
    # Fail before any file is written rather than halfway through a long loop
    # (int() over an empty `los` selection used to raise only once it was reached).
    raise KeyError(f"{len(unknown_stays)} stay_id(s) in valid_stay_ids are missing from data, "
                   f"e.g. {list(unknown_stays[:5])}")

for hid in tqdm(valid_stay_ids, desc = 'Tabularize EHR for total stay 10,268'):
    # Scalar lookup; int() on a one-element array is deprecated in recent NumPy.
    los = int(los_by_stay.loc[hid])

    os.makedirs(local+"/csv/"+str(hid), exist_ok=True)

    dyn_csv=pd.DataFrame()

    ###MEDS
    if(feat_med):
        dyn_csv = _join_block(dyn_csv, _interval_block(
            final_meds[final_meds['stay_id']==hid], los, med_items, "MEDS"))

    ###INGS
    if(feat_ing):
        dyn_csv = _join_block(dyn_csv, _interval_block(
            final_ing[final_ing['stay_id']==hid], los, ing_items, "INGS"))

    ###PROCS
    if(feat_proc):
        dyn_csv = _join_block(dyn_csv, _proc_block(
            final_proc[final_proc['stay_id']==hid], los, proc_items))

    ###OUT
    if(feat_out):
        dyn_csv = _join_block(dyn_csv, _observation_block(
            final_out[final_out['stay_id']==hid], los, out_items, "OUT", 'value',
            fill_zero=True, impute=False))

    ###CHART
    if(feat_chart):
        dyn_csv = _join_block(dyn_csv, _observation_block(
            final_chart[final_chart['stay_id']==hid], los, chart_items, "CHART", 'valuenum',
            fill_zero=False, impute=impute))

    ###LABS
    if(feat_lab):
        dyn_csv = _join_block(dyn_csv, _observation_block(
            final_labs[final_labs['stay_id']==hid], los, lab_items, "LAB", 'valuenum',
            fill_zero=False, impute=impute))

    #Save temporal data to csv
    dyn_csv.to_csv(local+'/csv/'+str(hid)+'/dynamic_proc.csv',index=False)

Tabularize EHR for total stay 10,268:  50%|█████     | 10336/20549 [06:34<06:29, 26.20it/s]


KeyboardInterrupt: 

In [12]:
# Bare expression; it is not the last statement of the cell.
valid_stay_ids

# Write the retained admissions to <local>/demo.csv.
# NOTE(review): data_1 is assigned only in the cell that also needs meds_new, ing_new,
# chart_new and labs_new, none of which are defined in the visible cells - confirm
# data_1 exists (and is the intended cohort) before relying on this export.
data_1.to_csv(local+'/demo.csv')