# Single Value Data from Chartevents
* by Firuz Juraev (PhD student)

In [1]:
import pandas as pd 
from datetime import datetime
import seaborn as sns
import glob
from os import listdir 
import os.path
from os import path
import numpy as np 
import warnings
warnings.filterwarnings('ignore')

In [2]:
def side_by_side(*objs, **kwds):
    """Print the reprs of ``objs`` next to each other, one column per object.

    The ``repr`` of every object is split into lines, the resulting columns
    are joined horizontally with pandas' ``adjoin`` helper and printed,
    followed by one empty line.

    Keyword arguments:
        space: padding passed to ``adjoin`` between columns (default 4).
    """
    from pandas.io.formats.printing import adjoin

    columns = []
    for obj in objs:
        columns.append(repr(obj).split('\n'))

    gap = kwds.get('space', 4)
    print(adjoin(gap, *columns))
    print()

### Variables 

In [3]:
# Folder holding the CSV extracts that the cells below read with pd.read_csv.
directory = "Data_3145_SV_24/" 

In [94]:
# HADM_ID values whose rows drop_defected_neonates_rows() removes from a table.
bad_neonates = [100044, 100062, 100081, 159338, 155826, 187822, 145130, 146110, 120546, 157469]

### Functions 

In [4]:
def list_files(directory):
    """Return the sorted names of the ``.csv`` entries inside ``directory``.

    Only the directory itself is scanned (no recursion) and the match on
    the ``.csv`` suffix is case-sensitive.  The number of matches is
    printed as ``CSV Files: <n>``.

    Parameters
    ----------
    directory : str
        Path of the folder to scan.

    Returns
    -------
    list of str
        Matching entry names (not full paths), in sorted order.
    """
    # listdir() yields entries in arbitrary, platform-dependent order;
    # sorting keeps the result reproducible between runs and machines.
    files = sorted(f for f in listdir(directory) if f.endswith('.csv'))
    print("CSV Files: " + str(len(files)))
    return files

In [16]:
def drop_defected_neonates_rows(df, bad_ids=None):
    """Remove every row whose ``HADM_ID`` belongs to a defective neonate.

    The frame is modified in place and the same object is returned, so
    ``df = drop_defected_neonates_rows(df)`` keeps working as before.
    After the rows are removed, the per-column null counts and non-null
    counts are printed side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that has an ``HADM_ID`` column.
    bad_ids : iterable, optional
        Admission ids to remove.  When omitted, the module-level
        ``bad_neonates`` list is used.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the matching rows dropped.
    """
    if bad_ids is None:
        bad_ids = bad_neonates

    # A single vectorised membership test replaces the per-id loop, which
    # built a one-element list and dropped rows once per id.
    matching = df.index[df['HADM_ID'].isin(list(bad_ids))]
    df.drop(matching, axis=0, inplace=True)

    side_by_side(df.isnull().sum(), df.count())

    return df

In [48]:
def merge_blood_test(neonates_df, df, test_type):
    """Attach the per-admission mean of one blood test to ``neonates_df``.

    ``df`` is grouped by ``HADM_ID`` and its ``VALUENUM`` column is averaged
    within each group.  The averaged column is renamed to ``test_type`` and
    inner-joined onto ``neonates_df`` on ``HADM_ID``, so admissions that
    have no row in ``df`` do not appear in the result.  A short report
    (number of unique admissions, null and non-null counts per column) is
    printed.

    Parameters
    ----------
    neonates_df : pandas.DataFrame
        Feature table built so far; must contain ``HADM_ID``.
    df : pandas.DataFrame
        Measurements with at least ``HADM_ID`` and ``VALUENUM`` columns.
        It is not modified.
    test_type : str
        Name given to the new feature column.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``neonates_df`` plus the ``test_type`` column.
    """
    # Average only the measurement column.  Averaging the whole frame
    # raises TypeError on pandas >= 2.0 because of the text timestamp
    # columns, and it also forced a 'subject_id' column to exist just so
    # that it could be dropped again.
    mean_df = df.groupby("HADM_ID")["VALUENUM"].mean().reset_index()
    mean_df = mean_df.rename(columns={'VALUENUM': test_type})

    # Merge neonates with blood test
    neonates_df = pd.merge(neonates_df, mean_df, on='HADM_ID', how='inner')

    print(test_type + " is added!")
    print("Unique Neonates: " + str(neonates_df['HADM_ID'].nunique()))
    side_by_side(neonates_df.isnull().sum(), neonates_df.count())

    return neonates_df

### Run 

In [5]:
list_files(directory)

CSV Files: 8


['bands_24.csv',
 'birth_weight_24.csv',
 'eosinophils_24.csv',
 'head_circ_24.csv',
 'lymphs_24.csv',
 'monos_24.csv',
 'neuts_24.csv',
 'platelet_24.csv']

###  1.1 Birth Weight 

In [102]:
birth_weight = pd.read_csv(directory + "birth_weight_24.csv")

birth_weight.head(2)

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
0,10,184167,2103-06-28 10:00:00,1.385,2103-06-28 11:36:00
1,39,106266,2114-11-29 22:00:00,2.775,2114-11-29 21:04:00


In [103]:
birth_weight.shape 

(3145, 5)

In [104]:
birth_weight = drop_defected_neonates_rows(birth_weight) 

subject_id    0    subject_id    3135
HADM_ID       0    HADM_ID       3135
CHARTTIME     0    CHARTTIME     3135
VALUENUM      0    VALUENUM      3135
ADMITTIME     0    ADMITTIME     3135
dtype: int64       dtype: int64      



In [20]:
birth_weight.describe()

Unnamed: 0,subject_id,HADM_ID,VALUENUM
count,3137.0,3137.0,3137.0
mean,15404.719158,150236.069812,2.143438
std,9147.207164,28969.007466,0.841929
min,10.0,100029.0,0.364
25%,7506.0,125184.0,1.535
50%,15249.0,149876.0,2.065
75%,23069.0,175757.0,2.665
max,32806.0,199918.0,6.115


In [105]:
birth_weight.head(2)

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
0,10,184167,2103-06-28 10:00:00,1.385,2103-06-28 11:36:00
1,39,106266,2114-11-29 22:00:00,2.775,2114-11-29 21:04:00


In [106]:
birth_weight.columns 

Index(['subject_id', 'HADM_ID', 'CHARTTIME', 'VALUENUM', 'ADMITTIME'], dtype='object')

In [107]:
birth_weight.drop(['subject_id', 'CHARTTIME', 'ADMITTIME'], axis=1, inplace=True)

In [108]:
birth_weight.head()

Unnamed: 0,HADM_ID,VALUENUM
0,184167,1.385
1,106266,2.775
2,104518,3.115
3,190201,1.835
4,156857,2.685


In [109]:
birth_weight.shape 

(3135, 2)

#### Save to File 

In [110]:
# Write the HADM_ID / VALUENUM table; index=False leaves the row index out of the file.
birth_weight.to_csv("FinalData/SingleValue/birth_weight.csv", index=False)

### 1.2 Head Circumference

In [100]:
head_circ = pd.read_csv(directory + "head_circ_24.csv")

head_circ.head(2) 

Unnamed: 0,HADM_ID,CHARTTIME,VALUENUM,subject_id,ADMITTIME
0,184167,2103-06-28 10:00:00,27.5,10,2103-06-28 11:36:00
1,106266,2114-11-29 22:00:00,34.0,39,2114-11-29 21:04:00


In [101]:
head_circ = drop_defected_neonates_rows(head_circ) 

HADM_ID       0    HADM_ID       3135
CHARTTIME     0    CHARTTIME     3135
VALUENUM      0    VALUENUM      3135
subject_id    0    subject_id    3135
ADMITTIME     0    ADMITTIME     3135
dtype: int64       dtype: int64      



In [24]:
head_circ.describe()

Unnamed: 0,HADM_ID,VALUENUM,subject_id
count,3137.0,3137.0,3137.0
mean,150236.069812,30.586959,15404.719158
std,28969.007466,3.602091,9147.207164
min,100029.0,0.0,10.0
25%,125184.0,28.5,7506.0
50%,149876.0,31.0,15249.0
75%,175757.0,33.0,23069.0
max,199918.0,49.0,32806.0


In [111]:
head_circ.head(2)

Unnamed: 0,HADM_ID,CHARTTIME,VALUENUM,subject_id,ADMITTIME
0,184167,2103-06-28 10:00:00,27.5,10,2103-06-28 11:36:00
1,106266,2114-11-29 22:00:00,34.0,39,2114-11-29 21:04:00


In [112]:
head_circ.shape 

(3135, 5)

In [113]:
head_circ.drop(['subject_id', 'CHARTTIME', 'ADMITTIME'], axis=1, inplace=True)

In [114]:
head_circ.head(2)

Unnamed: 0,HADM_ID,VALUENUM
0,184167,27.5
1,106266,34.0


#### Save to File 

In [115]:
# Write the HADM_ID / VALUENUM table; index=False leaves the row index out of the file.
head_circ.to_csv("FinalData/SingleValue/head_circ.csv", index=False)

## Blood Test 

* bands
* monos 
* eosinophils
* lymphs 
* neuts 
* platelet

In [49]:
# Seed the blood-test feature table with the HADM_ID column of head_circ;
# each merge_blood_test() call below adds one feature column to it.
neonates = (head_circ['HADM_ID']).to_frame()

In [50]:
neonates.shape 

(3137, 1)

### 2.1 Bands 

In [26]:
bands = pd.read_csv(directory + "bands_24.csv") 

bands.head(2)

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
0,10824,143406,2126-06-12 22:29:00,0.0,2126-06-12 22:08:00
1,10826,127342,2182-07-17 03:20:00,0.0,2182-07-17 03:35:00


In [27]:
bands = drop_defected_neonates_rows(bands) 

subject_id    0    subject_id    3407
HADM_ID       0    HADM_ID       3407
CHARTTIME     0    CHARTTIME     3407
VALUENUM      0    VALUENUM      3407
ADMITTIME     0    ADMITTIME     3407
dtype: int64       dtype: int64      



In [28]:
bands['HADM_ID'].nunique()

3137

In [51]:
neonates = merge_blood_test(neonates, bands, "BANDS") 

BANDS is added!
Unique Neonates: 3137
HADM_ID    0    HADM_ID    3137
BANDS      0    BANDS      3137
dtype: int64    dtype: int64   



### 2.2 MONOs

In [29]:
monos = pd.read_csv(directory + "monos_24.csv") 

monos.head(2)

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
0,10824,143406,2126-06-12 22:29:00,3.0,2126-06-12 22:08:00
1,10826,127342,2182-07-17 03:20:00,11.0,2182-07-17 03:35:00


In [30]:
monos = drop_defected_neonates_rows(monos) 

subject_id    0    subject_id    3411
HADM_ID       0    HADM_ID       3411
CHARTTIME     0    CHARTTIME     3411
VALUENUM      1    VALUENUM      3410
ADMITTIME     0    ADMITTIME     3411
dtype: int64       dtype: int64      



In [31]:
monos['HADM_ID'].nunique()

3137

In [32]:
# Show the rows of monos whose VALUENUM is null.
monos[monos['VALUENUM'].isnull()]

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
1454,23923,121852,2156-04-18 11:20:00,,2156-04-18 12:07:00


In [33]:
monos[monos['HADM_ID'] == 121852]

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
1453,23923,121852,2156-04-18 11:45:00,7.0,2156-04-18 12:07:00
1454,23923,121852,2156-04-18 11:20:00,,2156-04-18 12:07:00


In [36]:
# Remove only the rows that lack a measurement; a NaN in any other column
# must not discard an otherwise valid VALUENUM.
monos.dropna(subset=['VALUENUM'], inplace=True)

In [37]:
monos[monos['HADM_ID'] == 121852]

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
1453,23923,121852,2156-04-18 11:45:00,7.0,2156-04-18 12:07:00


In [52]:
neonates = merge_blood_test(neonates, monos, "MONOs") 

MONOs is added!
Unique Neonates: 3137
HADM_ID    0    HADM_ID    3137
BANDS      0    BANDS      3137
MONOs      0    MONOs      3137
dtype: int64    dtype: int64   



### 2.3 Eosinophils

In [38]:
eosinophils = pd.read_csv(directory + "eosinophils_24.csv") 

eosinophils.head(2)

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
0,10824,143406,2126-06-12 22:29:00,2.0,2126-06-12 22:08:00
1,10826,127342,2182-07-17 03:20:00,2.0,2182-07-17 03:35:00


In [53]:
eosinophils = drop_defected_neonates_rows(eosinophils) 

subject_id    0    subject_id    3411
HADM_ID       0    HADM_ID       3411
CHARTTIME     0    CHARTTIME     3411
VALUENUM      1    VALUENUM      3410
ADMITTIME     0    ADMITTIME     3411
dtype: int64       dtype: int64      



In [54]:
# Show the rows of eosinophils whose VALUENUM is null.
eosinophils[eosinophils['VALUENUM'].isnull()]

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
1454,23923,121852,2156-04-18 11:20:00,,2156-04-18 12:07:00


In [55]:
eosinophils[eosinophils.HADM_ID == 121852]

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
1453,23923,121852,2156-04-18 11:45:00,3.0,2156-04-18 12:07:00
1454,23923,121852,2156-04-18 11:20:00,,2156-04-18 12:07:00


In [56]:
# Remove only the rows that lack a measurement; a NaN in any other column
# must not discard an otherwise valid VALUENUM.
eosinophils.dropna(subset=['VALUENUM'], inplace=True)

In [57]:
neonates = merge_blood_test(neonates, eosinophils, "EOSINOPHILS") 

EOSINOPHILS is added!
Unique Neonates: 3137
HADM_ID        0    HADM_ID        3137
BANDS          0    BANDS          3137
MONOs          0    MONOs          3137
EOSINOPHILS    0    EOSINOPHILS    3137
dtype: int64        dtype: int64       



### 2.4 NEUTS 

In [58]:
neuts = pd.read_csv(directory + "neuts_24.csv") 

neuts.head(2)

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
0,10824,143406,2126-06-12 22:29:00,15.0,2126-06-12 22:08:00
1,10826,127342,2182-07-17 03:20:00,13.0,2182-07-17 03:35:00


In [59]:
neuts = drop_defected_neonates_rows(neuts) 

subject_id    0    subject_id    3411
HADM_ID       0    HADM_ID       3411
CHARTTIME     0    CHARTTIME     3411
VALUENUM      1    VALUENUM      3410
ADMITTIME     0    ADMITTIME     3411
dtype: int64       dtype: int64      



In [60]:
# Show the rows of neuts whose VALUENUM is null.
neuts[neuts['VALUENUM'].isnull()]

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
1454,23923,121852,2156-04-18 11:20:00,,2156-04-18 12:07:00


In [61]:
neuts[neuts.HADM_ID == 121852]

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
1453,23923,121852,2156-04-18 11:45:00,21.0,2156-04-18 12:07:00
1454,23923,121852,2156-04-18 11:20:00,,2156-04-18 12:07:00


In [62]:
# Remove only the rows that lack a measurement; a NaN in any other column
# must not discard an otherwise valid VALUENUM.
neuts.dropna(subset=['VALUENUM'], inplace=True)

In [63]:
neonates = merge_blood_test(neonates, neuts, "NEUTS") 

NEUTS is added!
Unique Neonates: 3137
HADM_ID        0    HADM_ID        3137
BANDS          0    BANDS          3137
MONOs          0    MONOs          3137
EOSINOPHILS    0    EOSINOPHILS    3137
NEUTS          0    NEUTS          3137
dtype: int64        dtype: int64       



### 2.5 Lymphs

In [64]:
lymphs = pd.read_csv(directory + "lymphs_24.csv") 

lymphs.head(2)

Unnamed: 0.1,Unnamed: 0,ROW_ID,subject_id,HADM_ID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,ADMITTIME
0,33345594,90068485,10824,143406,2126-06-12 22:29:00,80,80.0,,2126-06-12 22:08:00
1,33337240,90086386,10826,127342,2182-07-17 03:20:00,72,72.0,,2182-07-17 03:35:00


In [66]:
lymphs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3422 entries, 0 to 3421
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  3422 non-null   int64  
 1   ROW_ID      3422 non-null   int64  
 2   subject_id  3422 non-null   int64  
 3   HADM_ID     3422 non-null   int64  
 4   CHARTTIME   3422 non-null   object 
 5   VALUE       3422 non-null   object 
 6   VALUENUM    3421 non-null   float64
 7   VALUEUOM    0 non-null      float64
 8   ADMITTIME   3422 non-null   object 
dtypes: float64(2), int64(4), object(3)
memory usage: 240.7+ KB


In [67]:
lymphs = drop_defected_neonates_rows(lymphs) 

Unnamed: 0       0    Unnamed: 0    3411
ROW_ID           0    ROW_ID        3411
subject_id       0    subject_id    3411
HADM_ID          0    HADM_ID       3411
CHARTTIME        0    CHARTTIME     3411
VALUE            0    VALUE         3411
VALUENUM         1    VALUENUM      3410
VALUEUOM      3411    VALUEUOM         0
ADMITTIME        0    ADMITTIME     3411
dtype: int64          dtype: int64      



In [65]:
lymphs['HADM_ID'].nunique()

3145

In [69]:
# Show the rows of lymphs whose VALUENUM is null.
lymphs[lymphs['VALUENUM'].isnull()]

Unnamed: 0.1,Unnamed: 0,ROW_ID,subject_id,HADM_ID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,ADMITTIME
1454,33340782,154685036,23923,121852,2156-04-18 11:20:00,ERROR,,,2156-04-18 12:07:00


In [70]:
lymphs[lymphs.HADM_ID == 121852]

Unnamed: 0.1,Unnamed: 0,ROW_ID,subject_id,HADM_ID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,ADMITTIME
1453,33340781,154685073,23923,121852,2156-04-18 11:45:00,69,69.0,,2156-04-18 12:07:00
1454,33340782,154685036,23923,121852,2156-04-18 11:20:00,ERROR,,,2156-04-18 12:07:00


In [71]:
lymphs.drop(['Unnamed: 0', 'ROW_ID', 'VALUE', 'VALUEUOM'], axis=1, inplace=True)

lymphs.head(2)

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
0,10824,143406,2126-06-12 22:29:00,80.0,2126-06-12 22:08:00
1,10826,127342,2182-07-17 03:20:00,72.0,2182-07-17 03:35:00


In [72]:
# Remove only the rows that lack a measurement; a NaN in any other column
# must not discard an otherwise valid VALUENUM.
lymphs.dropna(subset=['VALUENUM'], inplace=True)

In [73]:
# Re-check after the dropna above: list any rows of lymphs that still have a null VALUENUM.
lymphs[lymphs['VALUENUM'].isnull()]

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME


In [74]:
lymphs['HADM_ID'].nunique()

3137

In [75]:
neonates = merge_blood_test(neonates, lymphs, "LYMPHS") 

LYMPHS is added!
Unique Neonates: 3137
HADM_ID        0    HADM_ID        3137
BANDS          0    BANDS          3137
MONOs          0    MONOs          3137
EOSINOPHILS    0    EOSINOPHILS    3137
NEUTS          0    NEUTS          3137
LYMPHS         0    LYMPHS         3137
dtype: int64        dtype: int64       



### 2.6 PLATELET

In [83]:
platelet = pd.read_csv(directory + "platelet_24.csv") 

platelet.head(2)

Unnamed: 0.1,Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
0,33389086,10824,143406,2126-06-12 22:29:00,309.0,2126-06-12 22:08:00
1,33378634,10826,127342,2182-07-17 03:20:00,340.0,2182-07-17 03:35:00


In [84]:
platelet.drop(['Unnamed: 0'], axis=1, inplace=True)

In [85]:
platelet.head(2)

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
0,10824,143406,2126-06-12 22:29:00,309.0,2126-06-12 22:08:00
1,10826,127342,2182-07-17 03:20:00,340.0,2182-07-17 03:35:00


In [86]:
platelet = drop_defected_neonates_rows(platelet) 

subject_id    0    subject_id    3578
HADM_ID       0    HADM_ID       3578
CHARTTIME     0    CHARTTIME     3578
VALUENUM      5    VALUENUM      3573
ADMITTIME     0    ADMITTIME     3578
dtype: int64       dtype: int64      



In [81]:
platelet = drop_defected_neonates_rows(platelet) 

subject_id    0    subject_id    3573
HADM_ID       0    HADM_ID       3573
CHARTTIME     0    CHARTTIME     3573
VALUENUM      0    VALUENUM      3573
ADMITTIME     0    ADMITTIME     3573
dtype: int64       dtype: int64      



In [89]:
platelet['HADM_ID'].nunique()

3137

In [87]:
# Show the rows of platelet whose VALUENUM is null.
platelet[platelet['VALUENUM'].isnull()]

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
380,13894,120546,2198-02-12 22:05:00,,2198-02-12 21:39:00
1532,23923,121852,2156-04-18 11:20:00,,2156-04-18 12:07:00
2200,31538,114445,2101-01-13 11:00:00,,2101-01-13 10:37:00
2246,32148,171619,2111-07-06 10:40:00,,2111-07-06 08:58:00
3224,7579,157469,2174-12-01 02:00:00,,2174-12-01 01:34:00


In [93]:
platelet[platelet.HADM_ID == 157469]

Unnamed: 0,subject_id,HADM_ID,CHARTTIME,VALUENUM,ADMITTIME
3224,7579,157469,2174-12-01 02:00:00,,2174-12-01 01:34:00


In [95]:
# Remove only the rows that lack a measurement; a NaN in any other column
# must not discard an otherwise valid VALUENUM.
platelet.dropna(subset=['VALUENUM'], inplace=True)

In [96]:
platelet['HADM_ID'].nunique()

3135

### Fix: drop defective neonates from the feature table before merging PLATELET

In [97]:
neonates = drop_defected_neonates_rows(neonates) 

HADM_ID        0    HADM_ID        3135
BANDS          0    BANDS          3135
MONOs          0    MONOs          3135
EOSINOPHILS    0    EOSINOPHILS    3135
NEUTS          0    NEUTS          3135
LYMPHS         0    LYMPHS         3135
dtype: int64        dtype: int64       



In [98]:
neonates = merge_blood_test(neonates, platelet, "PLATELET") 

PLATELET is added!
Unique Neonates: 3135
HADM_ID        0    HADM_ID        3135
BANDS          0    BANDS          3135
MONOs          0    MONOs          3135
EOSINOPHILS    0    EOSINOPHILS    3135
NEUTS          0    NEUTS          3135
LYMPHS         0    LYMPHS         3135
PLATELET       0    PLATELET       3135
dtype: int64        dtype: int64       



In [99]:
neonates.head()

Unnamed: 0,HADM_ID,BANDS,MONOs,EOSINOPHILS,NEUTS,LYMPHS,PLATELET
0,184167,0.0,11.0,3.0,27.0,58.0,414.0
1,106266,0.0,8.0,0.0,29.0,62.0,347.0
2,104518,3.0,3.0,1.0,23.0,67.0,143.0
3,190201,1.0,8.0,1.0,33.0,57.0,188.0
4,156857,3.0,5.0,3.0,36.5,51.0,176.5


In [116]:
neonates.shape 

(3135, 7)

In [117]:
# Write the merged blood-test feature table; index=False leaves the row index out of the file.
neonates.to_csv("FinalData/SingleValue/blood_test_features.csv", index=False)