# Merging Single Value Features

* by Firuz Juraev (Combined Master/PhD student)

In [21]:
import pandas as pd 
from datetime import datetime
import seaborn as sns
import glob
from os import listdir 
import os.path
from os import path
import numpy as np 

from sklearn.preprocessing import OrdinalEncoder

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Relative folder that holds the per-feature CSV files read by the cells below.
directory = "FinalData/SingleValue/" 

### Functions 

In [2]:
def side_by_side(*objs, **kwds):
    """Print the ``repr`` of each object in adjacent text columns.

    Parameters
    ----------
    *objs
        Objects to show; each object's ``repr`` is split into lines and
        becomes one column.
    **kwds
        Only ``space`` is read: the padding added between columns
        (default 4). Any other keyword is ignored.

    Returns
    -------
    None
        The joined text is printed, followed by one empty line.
    """
    # Kept as a function-local import; note that this module is part of
    # pandas' formatting internals rather than its top-level namespace.
    from pandas.io.formats.printing import adjoin

    gap = kwds.get('space', 4)
    columns = []
    for obj in objs:
        columns.append(repr(obj).split('\n'))
    print(adjoin(gap, *columns))
    print()

In [4]:
def list_files(directory, extension="csv"):
    """Return the sorted names of files in ``directory`` with a given extension.

    Parameters
    ----------
    directory : str
        Folder to scan (not recursive).
    extension : str, optional
        File extension without the leading dot. Defaults to ``"csv"``, which
        matches the original behaviour. Matching is case-sensitive.

    Returns
    -------
    list of str
        Bare file names (no directory prefix) ending in ``"." + extension``,
        in sorted order.

    Notes
    -----
    Prints a one-line count such as ``CSV Files: 4`` as a side effect.
    """
    suffix = "." + extension
    # listdir() returns entries in arbitrary order, so sort to make the result
    # reproducible across machines and runs.
    files = sorted(name for name in listdir(directory) if name.endswith(suffix))
    print(extension.upper() + " Files: " + str(len(files)))
    return files

In [5]:
# Show which CSV files are present in the input folder before loading them.
list_files(directory)

CSV Files: 4


['birth_weight.csv',
 'blood_test_features.csv',
 'head_circ.csv',
 'patients_single_value_data.csv']

### Load Patients' Data 

In [8]:
# Base patient table; the feature files loaded further down are merged into it.
patients = pd.read_csv(f"{directory}patients_single_value_data.csv")

In [9]:
# Preview the first two rows of the loaded patient table.
patients.head(2)

Unnamed: 0,SUBJECT_ID,HADM_ID,DOB,GENDER,ETHNICITY,ADMITTIME,DISCHTIME,DISCHARGE_LOCATION,EXPIRE_FLAG,POD,...,DW10_COUNT,D10W_SUM,D10W_MEAN,URINE_COUNT,URINE_SUM,URINE_AVG,MICROBIOLOGY_TEST,NEGATIVE_RESULT,PRESCRIPTIONS,LOS
0,258,189406,2124-09-19 00:00:00,F,ASIAN,2124-09-19 03:59:00,2124-09-22 15:52:00,HOME,0,0,...,22.0,116.400001,5.290909,5.0,108.0,21.6,1.0,0.0,0.0,3.495139
1,260,190363,2105-03-23 00:00:00,F,WHITE,2105-03-23 10:23:00,2105-03-30 11:00:00,HOME,0,0,...,23.0,233.400002,10.147826,6.0,122.0,20.333333,1.0,0.0,4.0,7.025694


### Birth Weight 

In [10]:
# Read the birth-weight file and preview its first two rows.
birth_weight = pd.read_csv(f"{directory}birth_weight.csv")

birth_weight.head(2)

Unnamed: 0,HADM_ID,VALUENUM
0,184167,1.385
1,106266,2.775


In [11]:
# Give the generic VALUENUM column a feature-specific name; the head-circumference
# file uses the same VALUENUM header, so the two would otherwise clash after merging.
birth_weight = birth_weight.rename(columns={'VALUENUM': 'BIRTH_WEIGHT'})

In [14]:
# Inner join on HADM_ID: rows without a matching birth-weight record are dropped.
# NOTE(review): repeated HADM_ID values in birth_weight would duplicate patient
# rows here — confirm the key is unique in that file.
patients = patients.merge(birth_weight, how='inner', on='HADM_ID')

patients.head(2)

Unnamed: 0,SUBJECT_ID,HADM_ID,DOB,GENDER,ETHNICITY,ADMITTIME,DISCHTIME,DISCHARGE_LOCATION,EXPIRE_FLAG,POD,...,D10W_SUM,D10W_MEAN,URINE_COUNT,URINE_SUM,URINE_AVG,MICROBIOLOGY_TEST,NEGATIVE_RESULT,PRESCRIPTIONS,LOS,BIRTH_WEIGHT
0,258,189406,2124-09-19 00:00:00,F,ASIAN,2124-09-19 03:59:00,2124-09-22 15:52:00,HOME,0,0,...,116.400001,5.290909,5.0,108.0,21.6,1.0,0.0,0.0,3.495139,2.065
1,260,190363,2105-03-23 00:00:00,F,WHITE,2105-03-23 10:23:00,2105-03-30 11:00:00,HOME,0,0,...,233.400002,10.147826,6.0,122.0,20.333333,1.0,0.0,4.0,7.025694,3.0


### Head Circumference

In [15]:
# Read the head-circumference file and preview its first two rows.
head_circ = pd.read_csv(f"{directory}head_circ.csv")

head_circ.head(2)

Unnamed: 0,HADM_ID,VALUENUM
0,184167,27.5
1,106266,34.0


In [16]:
# Give the generic VALUENUM column a feature-specific name before merging.
head_circ = head_circ.rename(columns={'VALUENUM': 'HEAD_CIRC'})

In [17]:
# Inner join on HADM_ID: rows without a matching head-circumference record are dropped.
# NOTE(review): repeated HADM_ID values in head_circ would duplicate patient
# rows here — confirm the key is unique in that file.
patients = patients.merge(head_circ, how='inner', on='HADM_ID')

patients.head(2)

Unnamed: 0,SUBJECT_ID,HADM_ID,DOB,GENDER,ETHNICITY,ADMITTIME,DISCHTIME,DISCHARGE_LOCATION,EXPIRE_FLAG,POD,...,D10W_MEAN,URINE_COUNT,URINE_SUM,URINE_AVG,MICROBIOLOGY_TEST,NEGATIVE_RESULT,PRESCRIPTIONS,LOS,BIRTH_WEIGHT,HEAD_CIRC
0,258,189406,2124-09-19 00:00:00,F,ASIAN,2124-09-19 03:59:00,2124-09-22 15:52:00,HOME,0,0,...,5.290909,5.0,108.0,21.6,1.0,0.0,0.0,3.495139,2.065,30.5
1,260,190363,2105-03-23 00:00:00,F,WHITE,2105-03-23 10:23:00,2105-03-30 11:00:00,HOME,0,0,...,10.147826,6.0,122.0,20.333333,1.0,0.0,4.0,7.025694,3.0,34.0


### Blood Test 

In [18]:
# Read the blood-test feature file and preview its first two rows.
blood_test = pd.read_csv(f"{directory}blood_test_features.csv")

blood_test.head(2)

Unnamed: 0,HADM_ID,BANDS,MONOs,EOSINOPHILS,NEUTS,LYMPHS,PLATELET
0,184167,0.0,11.0,3.0,27.0,58.0,414.0
1,106266,0.0,8.0,0.0,29.0,62.0,347.0


In [19]:
# Inner join on HADM_ID: rows without a matching blood-test record are dropped.
# NOTE(review): repeated HADM_ID values in blood_test would duplicate patient
# rows here — confirm the key is unique in that file.
patients = patients.merge(blood_test, how='inner', on='HADM_ID')

patients.head(2)

Unnamed: 0,SUBJECT_ID,HADM_ID,DOB,GENDER,ETHNICITY,ADMITTIME,DISCHTIME,DISCHARGE_LOCATION,EXPIRE_FLAG,POD,...,PRESCRIPTIONS,LOS,BIRTH_WEIGHT,HEAD_CIRC,BANDS,MONOs,EOSINOPHILS,NEUTS,LYMPHS,PLATELET
0,258,189406,2124-09-19 00:00:00,F,ASIAN,2124-09-19 03:59:00,2124-09-22 15:52:00,HOME,0,0,...,0.0,3.495139,2.065,30.5,1.0,7.0,4.0,23.0,65.0,159.0
1,260,190363,2105-03-23 00:00:00,F,WHITE,2105-03-23 10:23:00,2105-03-30 11:00:00,HOME,0,0,...,4.0,7.025694,3.0,34.0,0.0,4.0,0.0,26.0,70.0,340.0


## Filtering, encoding, and column reordering

In [22]:
# Remove the DISCHARGE_LOCATION column from the merged table.
patients = patients.drop(columns=['DISCHARGE_LOCATION'])

In [25]:
# Rename the EXPIRE_FLAG column to DEAD.
# NOTE(review): the execution counts show this cell last ran after the encoding
# cell below, which is why that cell's saved output still lists EXPIRE_FLAG.
patients = patients.rename(columns={'EXPIRE_FLAG': 'DEAD'})

In [23]:
# Encoder instance; the next cell fits it on the ETHNICITY column.
ord_encoder = OrdinalEncoder()

In [24]:
# Replace the ETHNICITY labels with the numeric codes assigned by OrdinalEncoder.
# The stored codes are floats (e.g. 1.0, 23.0 in the preview), not ints.
# NOTE(review): GENDER is left as a string by this cell — confirm that is intended.
ethnicity_frame = patients[["ETHNICITY"]]
patients["ETHNICITY"] = ord_encoder.fit_transform(ethnicity_frame)

patients.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,DOB,GENDER,ETHNICITY,ADMITTIME,DISCHTIME,EXPIRE_FLAG,POD,DATEEVENTS,...,PRESCRIPTIONS,LOS,BIRTH_WEIGHT,HEAD_CIRC,BANDS,MONOs,EOSINOPHILS,NEUTS,LYMPHS,PLATELET
0,258,189406,2124-09-19 00:00:00,F,1.0,2124-09-19 03:59:00,2124-09-22 15:52:00,0,0,0.0,...,0.0,3.495139,2.065,30.5,1.0,7.0,4.0,23.0,65.0,159.0
1,260,190363,2105-03-23 00:00:00,F,23.0,2105-03-23 10:23:00,2105-03-30 11:00:00,0,0,0.0,...,4.0,7.025694,3.0,34.0,0.0,4.0,0.0,26.0,70.0,340.0
2,685,103870,2126-07-29 00:00:00,F,23.0,2126-08-03 04:48:00,2126-08-29 18:40:00,0,0,0.0,...,21.0,26.577778,1.86,31.0,0.0,7.0,11.0,30.0,51.0,93.0
3,686,169424,2193-10-05 00:00:00,F,1.0,2193-10-05 03:19:00,2193-10-17 18:14:00,0,0,0.0,...,0.0,12.621528,2.035,29.5,3.0,5.0,2.0,48.0,40.0,266.0
4,692,123022,2118-11-29 00:00:00,F,23.0,2118-11-29 09:28:00,2118-12-09 17:20:00,0,0,0.0,...,0.0,10.327778,1.5,29.0,0.0,3.0,2.0,8.0,73.0,419.0


In [26]:
# Display the current column order before it is rearranged.
patients.columns

Index(['SUBJECT_ID', 'HADM_ID', 'DOB', 'GENDER', 'ETHNICITY', 'ADMITTIME',
       'DISCHTIME', 'DEAD', 'POD', 'DATEEVENTS', 'DW10_COUNT', 'D10W_SUM',
       'D10W_MEAN', 'URINE_COUNT', 'URINE_SUM', 'URINE_AVG',
       'MICROBIOLOGY_TEST', 'NEGATIVE_RESULT', 'PRESCRIPTIONS', 'LOS',
       'BIRTH_WEIGHT', 'HEAD_CIRC', 'BANDS', 'MONOs', 'EOSINOPHILS', 'NEUTS',
       'LYMPHS', 'PLATELET'],
      dtype='object')

In [28]:
# Reorder the columns so that DEAD, POD and LOS come last. Compared with the
# column list displayed above, nothing is added or removed.
leading_cols = [
    'SUBJECT_ID', 'HADM_ID', 'DOB', 'GENDER', 'ETHNICITY', 'ADMITTIME', 'DISCHTIME',
    'DATEEVENTS', 'DW10_COUNT', 'D10W_SUM', 'D10W_MEAN',
    'URINE_COUNT', 'URINE_SUM', 'URINE_AVG',
    'MICROBIOLOGY_TEST', 'NEGATIVE_RESULT', 'PRESCRIPTIONS',
    'BIRTH_WEIGHT', 'HEAD_CIRC',
    'BANDS', 'MONOs', 'EOSINOPHILS', 'NEUTS', 'LYMPHS', 'PLATELET',
]
trailing_cols = ['DEAD', 'POD', 'LOS']
patients = patients[leading_cols + trailing_cols]

patients.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,DOB,GENDER,ETHNICITY,ADMITTIME,DISCHTIME,DATEEVENTS,DW10_COUNT,D10W_SUM,...,HEAD_CIRC,BANDS,MONOs,EOSINOPHILS,NEUTS,LYMPHS,PLATELET,DEAD,POD,LOS
0,258,189406,2124-09-19 00:00:00,F,1.0,2124-09-19 03:59:00,2124-09-22 15:52:00,0.0,22.0,116.400001,...,30.5,1.0,7.0,4.0,23.0,65.0,159.0,0,0,3.495139
1,260,190363,2105-03-23 00:00:00,F,23.0,2105-03-23 10:23:00,2105-03-30 11:00:00,0.0,23.0,233.400002,...,34.0,0.0,4.0,0.0,26.0,70.0,340.0,0,0,7.025694
2,685,103870,2126-07-29 00:00:00,F,23.0,2126-08-03 04:48:00,2126-08-29 18:40:00,0.0,22.0,202.099999,...,31.0,0.0,7.0,11.0,30.0,51.0,93.0,0,0,26.577778
3,686,169424,2193-10-05 00:00:00,F,1.0,2193-10-05 03:19:00,2193-10-17 18:14:00,0.0,0.0,0.0,...,29.5,3.0,5.0,2.0,48.0,40.0,266.0,0,0,12.621528
4,692,123022,2118-11-29 00:00:00,F,23.0,2118-11-29 09:28:00,2118-12-09 17:20:00,0.0,26.0,82.5,...,29.0,0.0,3.0,2.0,8.0,73.0,419.0,0,0,10.327778


In [29]:
# Save the table while it still has DOB, ADMITTIME and DISCHTIME; the next
# cell drops those columns.
statistics_path = "FinalData/NeonatesSingleValues_FOR_STATISTICS.csv"
patients.to_csv(statistics_path, index=False)

In [30]:
# Remove the three date/time columns, then preview the result.
patients = patients.drop(columns=['ADMITTIME', 'DOB', 'DISCHTIME'])

patients.head(2)

Unnamed: 0,SUBJECT_ID,HADM_ID,GENDER,ETHNICITY,DATEEVENTS,DW10_COUNT,D10W_SUM,D10W_MEAN,URINE_COUNT,URINE_SUM,...,HEAD_CIRC,BANDS,MONOs,EOSINOPHILS,NEUTS,LYMPHS,PLATELET,DEAD,POD,LOS
0,258,189406,F,1.0,0.0,22.0,116.400001,5.290909,5.0,108.0,...,30.5,1.0,7.0,4.0,23.0,65.0,159.0,0,0,3.495139
1,260,190363,F,23.0,0.0,23.0,233.400002,10.147826,6.0,122.0,...,34.0,0.0,4.0,0.0,26.0,70.0,340.0,0,0,7.025694


In [31]:
# Lower-case the SUBJECT_ID header only; every other column keeps its name.
# NOTE(review): presumably a downstream consumer expects `subject_id` — confirm.
patients = patients.rename(columns={'SUBJECT_ID': 'subject_id'})

### Export

In [32]:
# Write the final table (without the index column) to the output CSV.
output_path = "FinalData/NeonatesSingleValues.csv"
patients.to_csv(output_path, index=False)

THE END