# Data Imputation 

* by Firuz Juraev (PhD student)

In [26]:
import pandas as pd
from datetime import datetime
import seaborn as sns
import glob
from os import listdir 
import os.path
from os import path 

import warnings
warnings.filterwarnings('ignore')

In [2]:
def side_by_side(*objs, **kwds):
    """Print the reprs of several objects next to each other.

    Each positional argument is rendered with ``repr`` and split into lines;
    the resulting text columns are joined horizontally by pandas' ``adjoin``
    and printed, followed by one empty line.

    Keyword Args:
        space: Number of blank characters between neighbouring columns
            (defaults to 4).

    Returns:
        None. The result is only printed.
    """
    from pandas.io.formats.printing import adjoin

    gap = kwds.get('space', 4)
    columns = []
    for item in objs:
        columns.append(repr(item).split('\n'))

    print(adjoin(gap, *columns))
    print()

## Defining Functions 

In [23]:
def fillNaN(df, filename):
    """Fill missing ``VALUENUM`` readings directionally and append the frame to a CSV.

    When the first reading is missing and the frame starts at ``row == 1``,
    the gaps are back-filled from the next known reading; a forward fill
    follows so that a gap at the end of the frame is not left behind.
    Otherwise the gaps are forward-filled from the previous known reading.
    If every reading is missing there is nothing to fill from and the NaNs
    remain.

    Args:
        df: Non-empty DataFrame with at least the columns ``VALUENUM`` and
            ``row``. Its ``VALUENUM`` column is overwritten with the filled
            values.
        filename: Target CSV path. The header is written only when the file
            does not exist yet; otherwise the rows are appended.

    Raises:
        IndexError: If ``df`` has no rows.
    """
    values = df['VALUENUM']

    if pd.isnull(values.iloc[0]) and df['row'].iloc[0] == 1:
        # Leading gap: pull the first known reading backwards, then sweep
        # forwards so trailing gaps are filled as well.
        filled = values.bfill().ffill()
    else:
        filled = values.ffill()

    # Assign the result back explicitly: an in-place fillna on the selected
    # column is a chained assignment that is not guaranteed to reach ``df``.
    df['VALUENUM'] = filled

    # Append mode creates the file when missing; only that first write
    # carries the header.
    df.to_csv(filename, mode='a', header=not path.exists(filename), index=False)

In [24]:
def fillNaN_Avg(df, filename):
    """Fill missing ``VALUENUM`` readings with the frame's mean and append it to a CSV.

    The mean is taken over the non-missing ``VALUENUM`` values of ``df``
    itself. If every reading is missing the mean is NaN and the gaps remain.

    Args:
        df: DataFrame with at least the column ``VALUENUM``. That column is
            overwritten with the filled values.
        filename: Target CSV path. The header is written only when the file
            does not exist yet; otherwise the rows are appended.
    """
    values = df['VALUENUM']

    # Assign the result back explicitly: an in-place fillna on the selected
    # column is a chained assignment that is not guaranteed to reach ``df``.
    df['VALUENUM'] = values.fillna(values.mean())

    # Append mode creates the file when missing; only that first write
    # carries the header.
    df.to_csv(filename, mode='a', header=not path.exists(filename), index=False)

In [37]:
# Input: the CSV files that data_imputation reads.
main_directory = "Data/OutlierFreeData/"

# Outputs, one directory per variant of the same file:
directory_NaN = "Data/Data_NaN/"    # saved without any filling
directory_Mean = "Data/Data_Mean/"  # gaps filled by fillNaN_Avg
directory_FB = "Data/Data_FB/"      # gaps filled by fillNaN

In [38]:
def list_files(directory):
    """Return the names of the ``.csv`` entries in ``directory``, sorted by name.

    ``os.listdir`` yields entries in arbitrary order, so the names are sorted
    to make the result (and any positional indexing into it) reproducible.
    The number of matches is printed as ``CSV Files: <n>``.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        List of entry names (not full paths) ending in ``.csv``.
    """
    files = sorted(name for name in listdir(directory) if name.endswith('.csv'))
    print("CSV Files: " + str(len(files)))
    return files

In [39]:
# Preview the CSV inputs found in main_directory; the returned list is the cell's displayed output.
list_files(main_directory)

CSV Files: 13


['bpCuffDiastolic.csv',
 'bpCuffMean.csv',
 'bpCuffSystolic.csv',
 'glucometer.csv',
 'heartRate.csv',
 'hrAlarmHigh.csv',
 'hrAlarmLow.csv',
 'respRate.csv',
 'sao2.csv',
 'sao2AlarmHigh.csv',
 'sao2AlarmLow.csv',
 'skinTemperature.csv',
 'temperature.csv']

In [16]:
# NOTE(review): this cell lists "Data_3145_TS_24/" rather than main_directory and strips a
# '_24.csv' suffix that none of the current input names carry — presumably a leftover from
# an earlier naming scheme; confirm whether it is still needed.
list_files("Data_3145_TS_24/")[0].replace('_24.csv', '')

CSV Files: 13


'bp_cuff_diastolic'

### Data Imputation Function 

In [41]:
def data_imputation(filename, excluded_hadm_ids=(146110,)):
    """Write the un-imputed, mean-filled and directionally-filled variants of one CSV.

    Steps:
      1. Read ``main_directory + filename`` and drop the excluded admissions.
      2. Save that frame unchanged to ``directory_NaN``.
      3. Sort by ``HADM_ID`` and ``CHARTTIME`` and number the rows of each
         admission from 1 in a new ``row`` column.
      4. For every admission, call ``fillNaN_Avg`` (output in
         ``directory_Mean``) and ``fillNaN`` (output in ``directory_FB``).
      5. Read both outputs back and print the number of unique admissions
         per variant plus the null/non-null counts of the FB output.

    Args:
        filename: Name of the CSV inside ``main_directory``; the same name is
            used in all three output directories.
        excluded_hadm_ids: ``HADM_ID`` values whose rows are removed before
            anything is written. Defaults to the single admission the
            notebook has always excluded (reason not recorded here —
            TODO confirm).
    """
    df = pd.read_csv(main_directory + filename)

    # Dropping the excluded admissions; copy so later column assignment works
    # on an independent frame.
    df = df[~df['HADM_ID'].isin(excluded_hadm_ids)].copy()

    # Not filling, just saving
    df.to_csv(directory_NaN + filename, index=False)

    # Sorting data, then giving a 1-based row id within each admission
    df = df.sort_values(by=['HADM_ID', 'CHARTTIME'])
    df['row'] = df.groupby(['HADM_ID']).cumcount() + 1

    mean_path = directory_Mean + filename
    fb_path = directory_FB + filename

    # The fill helpers append when the target already exists, so output left
    # over from a previous run would otherwise be duplicated.
    for stale in (mean_path, fb_path):
        if path.exists(stale):
            os.remove(stale)

    # One pass over the admissions (in sorted order) instead of one boolean
    # scan of the whole frame per admission. Each helper receives its own
    # copy so one fill cannot leak into the other.
    for _, admission in df.groupby('HADM_ID', sort=False):
        fillNaN_Avg(admission.copy(), mean_path)  # filling with the mean
        fillNaN(admission.copy(), fb_path)        # filling forward/backward

    print(filename)

    # Checking
    check_mean_df = pd.read_csv(mean_path)
    check_fb_df = pd.read_csv(fb_path)

    print("Unique Neonates NaN: " + str(df['HADM_ID'].nunique()))
    print("Unique Neonates Mean: " + str(check_mean_df['HADM_ID'].nunique()))
    print("Unique Neonates FB: " + str(check_fb_df['HADM_ID'].nunique()))

    side_by_side(check_fb_df.isnull().sum(), check_fb_df.count())
    

### Filling Data 

In [42]:
# Run the imputation for every CSV found in main_directory; each call prints its own summary.
files = list_files(main_directory) 

for f in files: 
    data_imputation(f) 

CSV Files: 13
bpCuffDiastolic.csv
Unique Neonates NaN: 3144
Unique Neonates Mean: 3144
Unique Neonates FB: 3144
subject_id    0    subject_id    24509
HADM_ID       0    HADM_ID       24509
CHARTTIME     0    CHARTTIME     24509
VALUENUM      0    VALUENUM      24509
ADMITTIME     0    ADMITTIME     24509
row           0    row           24509
dtype: int64       dtype: int64       

bpCuffMean.csv
Unique Neonates NaN: 3144
Unique Neonates Mean: 3144
Unique Neonates FB: 3144
subject_id    0    subject_id    24612
HADM_ID       0    HADM_ID       24612
CHARTTIME     0    CHARTTIME     24612
VALUENUM      0    VALUENUM      24612
ADMITTIME     0    ADMITTIME     24612
row           0    row           24612
dtype: int64       dtype: int64       

bpCuffSystolic.csv
Unique Neonates NaN: 3144
Unique Neonates Mean: 3144
Unique Neonates FB: 3144
subject_id    0    subject_id    24619
HADM_ID       0    HADM_ID       24619
CHARTTIME     0    CHARTTIME     24619
VALUENUM      0    VALUENUM      