# Merge Time Series Features 

* by Firuz Juraev (Combined Master/PhD student)

In [1]:
import pandas as pd 
from datetime import datetime
import seaborn as sns
import glob
from os import listdir 
import os.path
from os import path
import numpy as np 
import warnings
warnings.filterwarnings('ignore')

In [2]:
def side_by_side(*objs, **kwds):
    """Print the reprs of several objects next to each other, column by column.

    Parameters
    ----------
    *objs
        Objects to show; each one is rendered with ``repr`` and split into lines.
    **kwds
        Only ``space`` is read: the gap handed to pandas' ``adjoin`` between
        neighbouring columns (defaults to 4).

    Returns
    -------
    None
        The joined text is printed, followed by one empty line.
    """
    # Imported locally: this pandas formatting helper is only needed here.
    from pandas.io.formats.printing import adjoin

    gap = kwds.get('space', 4)

    columns = []
    for item in objs:
        columns.append(repr(item).split('\n'))

    joined = adjoin(gap, *columns)
    print(joined)
    print()

In [3]:
def list_files(directory):
    """Return the names of the ``.csv`` files located directly inside ``directory``.

    Parameters
    ----------
    directory : str
        Folder to scan (not recursive).

    Returns
    -------
    list of str
        File names (without the folder part) ending in ``.csv``, sorted
        alphabetically. The number of matches is also printed.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and anything built from it, e.g. column order) reproducible between runs.
    csv_files = sorted(name for name in listdir(directory) if name.endswith('.csv'))
    print ("CSV Files: " + str(len(csv_files)))
    return csv_files

### Define variables 

In [8]:
# Root folder; the merged CSV outputs are written directly beneath it.
Main_directory = "FinalData/"

# Input folders scanned for per-feature CSV files, one folder per data variant.
# NOTE(review): the FB / Mean / NaN suffixes presumably name the missing-value
# handling applied upstream — confirm against the cleaning notebook.
directory_NaN = "FinalData/TimeSeries/Clean_Data_NaN/"
directory_Mean = "FinalData/TimeSeries/Clean_Data_Mean/"
directory_FB = "FinalData/TimeSeries/Clean_Data_FB/"

### Functions 

In [23]:
def mergeTS(df_list, df_name_list, filename, los_df):
    """Combine per-feature time-series frames column-wise, attach ``los_df`` and write a CSV.

    The first frame keeps its ``HADM_ID``, ``subject_id`` and ``CHARTTIME``
    columns; those three columns are dropped from every other frame. Each
    frame's ``VALUENUM`` column is renamed to the matching entry of
    ``df_name_list``. The frames are then concatenated along the column axis
    (aligned on their index, so they are expected to hold the same rows in the
    same order). ``CHARTTIME`` is replaced by a 0-based running row counter
    within each ``subject_id`` and the result is inner-joined with ``los_df``
    on ``HADM_ID``.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        One frame per feature. The inputs are not modified.
    df_name_list : list of str
        Column name to give each frame's ``VALUENUM``; same order as ``df_list``.
    filename : str
        Target CSV path. If the file already exists the rows are APPENDED
        without a header, otherwise a new file with a header is created.
    los_df : pandas.DataFrame
        Frame with an ``HADM_ID`` column to merge onto the combined data.

    Raises
    ------
    ValueError
        If ``df_list`` is empty or ``df_name_list`` has fewer entries than ``df_list``.
    KeyError
        If a frame after the first lacks one of the key columns being dropped.
    """
    if len(df_list) == 0:
        raise ValueError("df_list must contain at least one DataFrame")
    if len(df_name_list) < len(df_list):
        raise ValueError("df_name_list must provide a name for every DataFrame in df_list")

    key_columns = ['HADM_ID', 'subject_id', 'CHARTTIME']

    # Build renamed copies instead of editing the inputs in place, so the
    # caller's frames stay intact and the function can be called again on them.
    frames = [df_list[0].rename(columns={'VALUENUM': df_name_list[0]})]
    for i in range(1, len(df_list)):
        frames.append(
            df_list[i]
            .drop(key_columns, axis=1)
            .rename(columns={'VALUENUM': df_name_list[i]})
        )

    # Concatenate once over the whole list rather than growing the frame pair by pair.
    df = pd.concat(frames, axis=1)

    # Replace the original CHARTTIME values with a per-subject step index (0, 1, 2, ...).
    df['CHARTTIME'] = df.groupby(['subject_id'])['CHARTTIME'].cumcount()

    # Quick sanity report: null count and non-null count per column.
    side_by_side(df.isnull().sum(), df.count())

    df = pd.merge(df, los_df, on='HADM_ID', how='inner')

    # NOTE: an existing file is appended to (no header), so calling this twice
    # with the same target duplicates the rows in that file.
    if path.exists(filename):
        df.to_csv(filename, mode='a', header=False, index=False)
    else:
        df.to_csv(filename, index=False, header=True)

    print ("200OK - Success")

In [24]:
def merging_data_in_folder(directory, los_df, filename):
    """Read every CSV in ``directory`` and merge them into one file via ``mergeTS``.

    Parameters
    ----------
    directory : str
        Folder holding one CSV per feature; a trailing slash is optional.
    los_df : pandas.DataFrame
        Passed through to ``mergeTS`` to be joined onto the combined data.
    filename : str
        Name of the output CSV, created inside ``Main_directory``.
    """
    # Fixed ordering keeps the feature columns in the same position on every
    # run, which matters because mergeTS may append to an existing CSV without
    # writing a header.
    files = sorted(list_files(directory))

    # Feature name = file name without its extension (only the final suffix is removed).
    df_name_list = [path.splitext(file)[0] for file in files]

    # path.join works whether or not `directory` ends with a separator.
    df_list = [pd.read_csv(path.join(directory, file)) for file in files]

    mergeTS(df_list, df_name_list, path.join(Main_directory, filename), los_df)

### Run 

In [16]:
# Load the single-value neonate table; later cells reduce it to HADM_ID and LOS.
neonates = pd.read_csv("FinalData/NeonatesSingleValues.csv") 

# Preview the first two rows.
neonates.head(2)

Unnamed: 0,subject_id,HADM_ID,GENDER,ETHNICITY,DATEEVENTS,DW10_COUNT,D10W_SUM,D10W_MEAN,URINE_COUNT,URINE_SUM,...,HEAD_CIRC,BANDS,MONOs,EOSINOPHILS,NEUTS,LYMPHS,PLATELET,DEAD,POD,LOS
0,258,189406,F,1.0,0.0,22.0,116.400001,5.290909,5.0,108.0,...,30.5,1.0,7.0,4.0,23.0,65.0,159.0,0,0,3.495139
1,260,190363,F,23.0,0.0,23.0,233.400002,10.147826,6.0,122.0,...,34.0,0.0,4.0,0.0,26.0,70.0,340.0,0,0,7.025694


In [17]:
# List the available columns before choosing which ones to keep.
neonates.columns

Index(['subject_id', 'HADM_ID', 'GENDER', 'ETHNICITY', 'DATEEVENTS',
       'DW10_COUNT', 'D10W_SUM', 'D10W_MEAN', 'URINE_COUNT', 'URINE_SUM',
       'URINE_AVG', 'MICROBIOLOGY_TEST', 'NEGATIVE_RESULT', 'PRESCRIPTIONS',
       'BIRTH_WEIGHT', 'HEAD_CIRC', 'BANDS', 'MONOs', 'EOSINOPHILS', 'NEUTS',
       'LYMPHS', 'PLATELET', 'DEAD', 'POD', 'LOS'],
      dtype='object')

In [18]:
# Keep only the admission key and the LOS column; everything else is discarded.
# Selecting the wanted columns (rather than dropping the others in place) makes
# this cell safe to re-run: a second in-place drop would raise KeyError because
# the columns are already gone.
neonates = neonates[['HADM_ID', 'LOS']].copy()

=======================================================================================

## Merging Time Series Features [Run]

In [21]:
# Confirm that only HADM_ID and LOS remain.
neonates.head(1)

Unnamed: 0,HADM_ID,LOS
0,189406,3.495139


In [25]:
# Output file names, one per input folder.
filename_FB = "NeonatesTimeSeries_FB.csv"
filename_Mean = "NeonatesTimeSeries_Mean.csv"
filename_NaN = "NeonatesTimeSeries_NaN.csv"

# Merge each folder into its own CSV.
# NOTE: mergeTS appends when the target file already exists, so re-running this
# cell without deleting the outputs duplicates their rows.
merge_jobs = (
    (directory_FB, filename_FB),
    (directory_Mean, filename_Mean),
    (directory_NaN, filename_NaN),
)
for source_dir, target_name in merge_jobs:
    merging_data_in_folder(source_dir, neonates, target_name)

CSV Files: 13
subject_id         0    subject_id         75240
HADM_ID            0    HADM_ID            75240
CHARTTIME          0    CHARTTIME          75240
bpCuffDiastolic    0    bpCuffDiastolic    75240
bpCuffMean         0    bpCuffMean         75240
bpCuffSystolic     0    bpCuffSystolic     75240
glucometer         0    glucometer         75240
heartRate          0    heartRate          75240
hrAlarmHigh        0    hrAlarmHigh        75240
hrAlarmLow         0    hrAlarmLow         75240
respRate           0    respRate           75240
sao2               0    sao2               75240
sao2AlarmHigh      0    sao2AlarmHigh      75240
sao2AlarmLow       0    sao2AlarmLow       75240
skinTemperature    0    skinTemperature    75240
temperature        0    temperature        75240
dtype: int64            dtype: int64            

200OK - Success
CSV Files: 13
subject_id         0    subject_id         75240
HADM_ID            0    HADM_ID            75240
CHARTTIME          0    

In [26]:
# Read back the merged FB file to inspect the result.
df = pd.read_csv(Main_directory + filename_FB) 

# Preview the first five rows.
df.head()

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,bpCuffDiastolic,bpCuffMean,bpCuffSystolic,glucometer,heartRate,hrAlarmHigh,hrAlarmLow,respRate,sao2,sao2AlarmHigh,sao2AlarmLow,skinTemperature,temperature,LOS
0,14509,100029,0,43.0,49.0,59.0,61.0,154.0,200.0,80.0,56.0,95.0,100.0,90.0,36.5,36.599998,15.006944
1,14509,100029,1,43.0,49.0,59.0,61.0,140.0,200.0,80.0,44.0,99.0,100.0,90.0,36.599998,36.599998,15.006944
2,14509,100029,2,43.0,49.0,59.0,61.0,148.0,200.0,80.0,40.0,98.0,100.0,90.0,36.799999,36.599998,15.006944
3,14509,100029,3,39.0,45.0,55.0,61.0,136.0,200.0,80.0,40.0,100.0,100.0,90.0,36.799999,36.599998,15.006944
4,14509,100029,4,39.0,45.0,55.0,61.0,132.0,200.0,80.0,56.0,100.0,100.0,90.0,36.700001,36.599998,15.006944
