# Handling Irregular Time Interval

* Firuz Juraev (PhD student)

In [13]:
import pandas as pd 
from datetime import datetime
import seaborn as sns
import glob
from os import listdir 
import os.path
from os import path
import numpy as np 
import warnings
warnings.filterwarnings('ignore')

In [2]:
def side_by_side(*objs, **kwds):
    from pandas.io.formats.printing import adjoin
    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    print (adjoin(space, *reprs))
    print()
    return

In [11]:
def list_files(directory):
    files = []
    counter = 0
    for f in listdir(directory):
        if f.endswith('.' + "csv"):
            files.append(f)
            counter = counter + 1
    print ("CSV Files: " + str(counter))
    return files

### Defining functions 

In [7]:
## Filling NaN columns with Forward and Backward filling 

def fillNaN(df, filename): 
    if pd.isnull(df['VALUENUM'].iloc[0]) and df['row'].iloc[0] == 1: 
        df['VALUENUM'].fillna(method='bfill', inplace=True)
    else: 
        df['VALUENUM'].fillna(method='ffill', inplace=True)
    
    ## dropping row column 
    df.drop(['row'], axis=1, inplace=True)
    
    if path.exists(filename): 
        df.to_csv(filename, mode='a', header=False, index=False)
    else: 
        df.to_csv(filename, index=False, header=True)

        
## Filling NaN columns with Mean 
def fillNaN_Avg(df, filename):
    df['VALUENUM'].fillna(df['VALUENUM'].mean(), inplace=True)
    
    ## dropping row column 
    df.drop(['row'], axis=1, inplace=True)
    
    if path.exists(filename): 
        df.to_csv(filename, mode='a', header=False, index=False)
    else: 
        df.to_csv(filename, index=False, header=True)

### handleIrregularInterval function 

In [40]:
# Case 1: Forward Filling 
# Case 2: Mean Filling 
# Case 3: NaN 

def handleIrregularInterval(df, filename, case): 
    count = len(df) 
    extra, shortage = 0, 0  
    
    df['CHARTTIME'] = df['CHARTTIME'].astype('datetime64[ns]')
    
    df.drop(['ADMITTIME'], axis=1, inplace=True)
    
    
    if count > 24: 
        extra = count - 24 
    else: 
        shortage = 24 - count 
    
    
    ## blancing if rows are less than standard number of rows 
    if shortage > 0:   
        if case != 3: 
            df.drop(['row'], axis=1, inplace=True)
        else: 
            df.drop(['Unnamed: 0'], axis=1, inplace=True)
            
        dates = pd.date_range(start=df['CHARTTIME'].iloc[0], end=df['CHARTTIME'].iloc[count-1], periods=(shortage+2)).tolist()
        
        for i in range(0, shortage):
            df.loc[count+i] = [df['subject_id'].iloc[0], df['HADM_ID'].iloc[0], dates[i+1], np.nan] 
        
        ## Sorting Values by time 
        df.sort_values(by=['CHARTTIME'], inplace=True)
        
        df['row'] = df.groupby(['HADM_ID']).cumcount()+1
          
        
        # Balanced Data
        
        if case == 1:                    ## filling NaN with forward and backward filling 
            fillNaN(df, filename)
            
        elif case == 2:                  ## filling NaN with Mean 
            fillNaN_Avg(df, filename)
            
        else:                            ## don't fill 
            df.drop(['row'], axis=1, inplace=True) 
            
            if path.exists(filename): 
                df.to_csv(filename, mode='a', header=False, index=False)
            else: 
                df.to_csv(filename, index=False, header=True)

        return 1  
        
        
    #--------------------------------------------------------------------------------------------------    
    ## balancing if there are extra rows (standard number of rows = 24)
    if extra > 0:
        if case == 3: 
            df.drop(['Unnamed: 0'], axis=1, inplace=True) 
            ## Sort dataframe 
            df.sort_values(by=['HADM_ID', 'CHARTTIME'], inplace=True)  

            ## Giving row number to every row 
            df['row'] = df.groupby(['HADM_ID']).cumcount()+1
        
        
        df['DIFF'] = 0
        
        for i in range(1, count): 
            df['DIFF'].iloc[i] = abs(df['VALUENUM'].iloc[i] - df['VALUENUM'].iloc[i-1])
            
        
        small_diffs = df['DIFF'].nsmallest(extra+1).to_list() 
        
        small_diffs.pop(0)
        
        ## Counter for limiting deletion 
        counter = 0 
        
        ## Reindexing dataframe for droping some rows 
        df.reindex(df['row'].tolist())
        
        
        drop_list = []  
        
        ## Drop first Nan rows =================================
        nan_count = len(df[df.VALUENUM.isnull() == True]) 
        
        if nan_count > 0:  
            if (nan_count <= extra):
                drop_list = df.index[df.VALUENUM.isnull() ==True].tolist()
                
                ## drop the rows with nan values 
                for j in drop_list: 
                    df.drop(j, axis=0, inplace=True)

                extra = extra - nan_count 

            else: 
                drop_list = df.index[df.VALUENUM.isnull() ==True].tolist()
                
                ## drop the rows with nan values 
                for j in range(0, extra): 
                    df.drop(drop_list[j], axis=0, inplace=True)

                extra = 0 
        ## End of Nan removal =======================================    
                      
        if extra > 0:
            counter = 0 
            df.reindex(df['row'].tolist()) 
            drop_list = []
            
            for j in range(1, count): 
                if df['DIFF'].iloc[j] in small_diffs:
                    if counter == extra: 
                        break 

                    df['VALUENUM'].iloc[j] = round((df['VALUENUM'].iloc[j] + df['VALUENUM'].iloc[j-1])/2)

                    ## taking index of removing rows  
                    i = df[df['row'] == df['row'].iloc[j-1]].index                             

                    # Appending indeces to drop list 
                    drop_list.append(i) 

                    counter = counter + 1 


            ## droping rows 
            for i in range(0, len(drop_list)): 
                df.drop(drop_list[i], axis=0, inplace=True) 
        
        ## dropping DIFF column 
        df.drop(['DIFF', 'row'], axis=1, inplace=True)
        
        if path.exists(filename): 
            df.to_csv(filename, mode='a', header=False, index=False)
        else: 
            df.to_csv(filename, index=False, header=True)
        
        
        return 2
    
    #--------------------------------------------------------------------------------------------------
    if count == 24: 
        if case != 3: 
            df.drop(['row'], axis=1, inplace=True) 
        else: 
            df.drop(['Unnamed: 0'], axis=1, inplace=True) 
            
        
        if path.exists(filename): 
            df.to_csv(filename, mode='a', header=False, index=False)
        else: 
            df.to_csv(filename, index=False, header=True)
        
        return 3        

### Define constants 

In [35]:
directory_FB      = "Data/Data_FB/"
directory_Mean    = "Data/Data_Mean/" 
directory_NaN     = "Data/Data_NaN/" 

directory_FB_C    = "Data/Balanced/Clean_Data_FB/"
directory_Mean_C  = "Data/Balanced/Clean_Data_Mean/" 
directory_NaN_C   = "Data/Balanced/Clean_Data_NaN/" 

In [42]:
def handle(filename): 
    print ("===========================================")
    print ("File name: " + filename)  
    
    fb_df   = pd.read_csv(directory_FB + filename) 
    mean_df = pd.read_csv(directory_Mean + filename) 
    nan_df  = pd.read_csv(directory_NaN + filename) 
    
    sh_counter, ex_counter, ba_counter = 0, 0, 0 

    ## Sorting Data 
    fb_df.sort_values(by=['HADM_ID', 'CHARTTIME'], inplace=True) 
    mean_df.sort_values(by=['HADM_ID', 'CHARTTIME'], inplace=True) 
    nan_df.sort_values(by=['HADM_ID', 'CHARTTIME'], inplace=True) 

    ## Finding only unique values 
    fb_unique_hadm_ids   = fb_df['HADM_ID'].unique()
    mean_unique_hadm_ids = mean_df['HADM_ID'].unique()
    nan_unique_hadm_ids  = nan_df['HADM_ID'].unique()
    
    
    ## Handling Data that were filled with Forward & Backward Filling 
    for i in range(0, len(fb_unique_hadm_ids)): 
        r = handleIrregularInterval(fb_df.loc[fb_df['HADM_ID'] == fb_unique_hadm_ids[i]], directory_FB_C + filename, 1)
        if r == 1: sh_counter+= 1
        elif r == 2: ex_counter+= 1
        else: ba_counter+= 1 

    ## Checking 
    fb_check_df = pd.read_csv(directory_FB_C + filename)

    print ("===== FORWARD & BACKWARD FILLING DATA =====")
    print ("Total unbalanced group (<24): " + str(sh_counter))
    print ("Total unbalanced group (>24): " + str(ex_counter))
    print ("Total balanced group   (=24): " + str(ba_counter))

    print ("---------------------------------")
    side_by_side(fb_check_df.isnull().sum(), fb_check_df.count())
    
    
    
    ## Handling Data that were filled with Mean Filling 
    sh_counter, ex_counter, ba_counter = 0, 0, 0 
    
    for i in range(0, len(mean_unique_hadm_ids)): 
        r = handleIrregularInterval(mean_df.loc[fb_df['HADM_ID'] == mean_unique_hadm_ids[i]], directory_Mean_C + filename, 2)
        if r == 1: sh_counter+= 1
        elif r == 2: ex_counter+= 1
        else: ba_counter+= 1 

    ## Checking 
    mean_check_df = pd.read_csv(directory_Mean_C + filename)
    
    print ("=========== MEAN FILLING DATA ============")
    print ("Total unbalanced group (<24): " + str(sh_counter))
    print ("Total unbalanced group (>24): " + str(ex_counter))
    print ("Total balanced group   (=24): " + str(ba_counter))

    print ("---------------------------------")
    side_by_side(mean_check_df.isnull().sum(), mean_check_df.count())
    
    
    
    ## Handling Data that were NOT filled  
    sh_counter, ex_counter, ba_counter = 0, 0, 0 
    
    for i in range(0, len(nan_unique_hadm_ids)): 
        r = handleIrregularInterval(nan_df.loc[nan_df['HADM_ID'] == nan_unique_hadm_ids[i]], directory_NaN_C + filename, 3)
        if r == 1: sh_counter+= 1
        elif r == 2: ex_counter+= 1
        else: ba_counter+= 1 

    ## Checking 
    nan_check_df = pd.read_csv(directory_NaN_C + filename)
    
    print ("=========== NAN FILLING DATA ============")
    print ("Total unbalanced group (<24): " + str(sh_counter))
    print ("Total unbalanced group (>24): " + str(ex_counter))
    print ("Total balanced group   (=24): " + str(ba_counter))

    print ("---------------------------------")
    side_by_side(nan_check_df.isnull().sum(), nan_check_df.count())

### Handle 

In [37]:
files = list_files(directory_FB)
files 

CSV Files: 13


['bpCuffDiastolic.csv',
 'bpCuffMean.csv',
 'bpCuffSystolic.csv',
 'glucometer.csv',
 'heartRate.csv',
 'hrAlarmHigh.csv',
 'hrAlarmLow.csv',
 'respRate.csv',
 'sao2.csv',
 'sao2AlarmHigh.csv',
 'sao2AlarmLow.csv',
 'skinTemperature.csv',
 'temperature.csv']

In [43]:
for f in files: 
    handle(f) 

File name: bpCuffDiastolic.csv
===== FORWARD & BACKWARD FILLING DATA =====
Total unbalanced group (<24): 3063
Total unbalanced group (>24): 71
Total balanced group   (=24): 10
---------------------------------
subject_id     0    subject_id    75440
HADM_ID        0    HADM_ID       75440
CHARTTIME      0    CHARTTIME     75440
VALUENUM      15    VALUENUM      75425
dtype: int64        dtype: int64       

Total unbalanced group (<24): 3063
Total unbalanced group (>24): 71
Total balanced group   (=24): 10
---------------------------------
subject_id     0    subject_id    75440
HADM_ID        0    HADM_ID       75440
CHARTTIME      0    CHARTTIME     75440
VALUENUM      15    VALUENUM      75425
dtype: int64        dtype: int64       

Total unbalanced group (<24): 3063
Total unbalanced group (>24): 71
Total balanced group   (=24): 10
---------------------------------
subject_id        0    subject_id    75456
HADM_ID           0    HADM_ID       75456
CHARTTIME         0    CHARTTIME

===== FORWARD & BACKWARD FILLING DATA =====
Total unbalanced group (<24): 622
Total unbalanced group (>24): 2174
Total balanced group   (=24): 348
---------------------------------
subject_id     0    subject_id    75456
HADM_ID        0    HADM_ID       75456
CHARTTIME      0    CHARTTIME     75456
VALUENUM      24    VALUENUM      75432
dtype: int64        dtype: int64       

Total unbalanced group (<24): 622
Total unbalanced group (>24): 2174
Total balanced group   (=24): 348
---------------------------------
subject_id     0    subject_id    75456
HADM_ID        0    HADM_ID       75456
CHARTTIME      0    CHARTTIME     75456
VALUENUM      24    VALUENUM      75432
dtype: int64        dtype: int64       

Total unbalanced group (<24): 622
Total unbalanced group (>24): 2174
Total balanced group   (=24): 348
---------------------------------
subject_id       0    subject_id    75456
HADM_ID          0    HADM_ID       75456
CHARTTIME        0    CHARTTIME     75456
VALUENUM      599