In [3]:
import numpy as np
import pandas as pd
from pre_processing.db_tools import connection

In [4]:
import sqlalchemy as db
from sqlalchemy import create_engine

sepsis_admissions = pd.read_sql("select * from mimic_db.mimiciii.d_labitems", connection)

In [5]:
columnNames = ["row_id", "subject_id", "hadm_id", "itemid", "charttime", "value", "valuenum",
              "valueuom", "flag"]
rawData = pd.read_csv("mimic_db_mimiciii_sepsis_lab_events.csv", names = columnNames)

In [6]:
rawData = rawData.astype({'value': str, 'valueuom': str, 'flag': str})

In [7]:
rawData['charttime'] = pd.to_datetime(rawData['charttime'])

In [8]:
rawData

Unnamed: 0,row_id,subject_id,hadm_id,itemid,charttime,value,valuenum,valueuom,flag
0,23565441,66745,192825,51237,2145-11-16 15:00:00,1.0,1.0,,
1,23565442,66745,192825,51248,2145-11-16 15:00:00,31.4,31.4,pg,
2,23565443,66745,192825,51249,2145-11-16 15:00:00,35.5,35.5,%,abnormal
3,23565444,66745,192825,51250,2145-11-16 15:00:00,88,88.0,fL,
4,23565445,66745,192825,51265,2145-11-16 15:00:00,55,55.0,K/uL,abnormal
...,...,...,...,...,...,...,...,...,...
3466296,23565436,66745,192825,50986,2145-11-16 15:00:00,LESS THAN 2.0,,ng/mL,
3466297,23565437,66745,192825,51006,2145-11-16 15:00:00,19,19.0,mg/dL,
3466298,23565438,66745,192825,51214,2145-11-16 15:00:00,302,302.0,mg/dL,
3466299,23565439,66745,192825,51221,2145-11-16 15:00:00,33.9,33.9,%,abnormal


In [9]:
len(rawData['hadm_id'].unique())

4134

In [102]:
# Lookup table to convert lab itemid's to a specific feature
# Metavision data only - 2077 cases
featureIds = [
    51221, # Hematocrit
    51301, # White Blood Cell Blood
    51516, # WBC Urine
    51222, #Hemoglobin
    51250, #MCV
    51249, #MCHC
    51277, #RDW
    51265, #Platelet Count
    51248, #MCH
    51279, #Red Blood Cell Blood
    51493, # RBC Urine
    50971, #Potassium
    50983, #Sodium
    50902, #Chloride
    50882, #Bicarbonate
    50868, #Anion Gap
    51006, #Urea Nitrogen
    50912, #Creatinine
    50893, #Calcium, Total
    51146, #Basophils
    51256, #Neutrophils
    51244, #Lymphocytes
    51254, #Monocytes
    51200, #Eosinophils
    50970, #Phosphate
    50931, #Glucose Blood
    51478, #Glucose Urine
    50960, #Magnesium
    51237, #INR(PT)
    51274, #PT
    51275, #PTT
    50885, #Bilirubin Blood
    51464, #Bilirubin Urine
    50813, #Lactate
    50863, #Alkaline Phosphatase
    50878, #Asparate Aminotransferase (AST)
    50861, #Alanine Aminotransferase (ALT)
    51491, #pH Urine
    50820, #pH Blood
    51498, #Specific Gravity
    50821, #pO2
    50804, #Calculated Total CO2
    50802, #Base Excess
    50818, #pC02
    50862, #Albumin
    51492, #Protein
    51519, #Yeast
    51476, #Epithelial Cells
    51144, #Bands
    50954, #Lactate Dehydrogenase (LD)
    51514, #Urobilinogen
    51484, #Ketone
    50910, #Creatine Kinase (CK)
    51506, #Urine Appearance
    51486 #Leukocytes
]



def normalize_rbc_wbc_urine(val):
    special_cases = '-><'
    try:
        if any(ch in val for ch in special_cases):
            if '-' in val:
                lower_bound = float(val.split('-')[0])
                upper_bound = float(val.split('-')[1])
                return (upper_bound + lower_bound) / 2
            if '<' in val:
                return float(val.split('<')[1]) - 1
            if '>' in val:
                return float(val.split('>')[1]) + 1
    except:
        return float(val)
    
def normalize_other_urine(val):
    try:
        if val is 'NEG' or val is 'TR':
            return 0.0
        else:
            if '>' in val:
                return float(val.split('>')) + 1
            return float(val)
    except:
        return 0.0
    
def normalize_yeast(val):
    if val is 'NONE':
        return 0.0
    if val is 'RARE':
        return 1.0
    if val is 'FEW':
        return 2.0
    if val is 'OCC' or val is 'MOD':
        return 3.0
    if val is 'MANY':
        return 4.0
    return 0.0

def normalize_epithelial_cells(val):
    try:
        return normalize_rbc_wbc_urine(val)
    except:
        return 0.0

def normalize_leukocytes(val):
    if val is 'NEG':
        return 0.0
    if val is 'TR':
        return 1.0
    if val is 'SM':
        return 2.0
    if val is 'MOD':
        return 3.0
    if val is 'LG':
        return 4.0
    return 0.0

def normalize_urine_appearance(val):
    return [is_urine_clear(val), 
            is_urine_hazy(val),
            is_urine_cloudy(val)]
    
def is_urine_clear(val):
    desc = {'Clear', 'CLEAR', 'CL'}
    if val in desc:
        return 1.0
    else:
        return 0.0

def is_urine_hazy(val):
    desc = {'Hazy', 'SlHazy', 'HAZY', 'SLHAZY'}
    if val in desc:
        return 1.0
    else:
        return 0.0
    
def is_urine_cloudy(val):
    desc = {'Cloudy', 'SlCldy', 'SlCloudy', 'Slcldy', 'CLOUDY', 'CLO'}
    if val in desc:
        return 1.0
    else:
        return 0.0

customExtraction = {
    51516: normalize_rbc_wbc_urine,
    51493: normalize_rbc_wbc_urine,
    51478: normalize_other_urine,
    51492: normalize_other_urine,
    51519: normalize_yeast,
    51476: normalize_epithelial_cells,
    51514: normalize_other_urine,
    51484: normalize_other_urine,
    51506: normalize_urine_appearance,
    51486: normalize_leukocytes
}

def extractFeature(df):
    x = []
    df = df.set_index('itemid')
    for i, feature in enumerate(featureIds):
        val = 0
        float_val = 0.0
        
        try:
            val = df.at[feature, 'value']
        except:
            val = 0
        
        #qResult = df.loc[df['itemid'] == feature]
        #if not qResult.empty:
         #   val = qResult.iloc[0]['value']
        try:
            if feature in customExtraction.keys():
                float_val = customExtraction[feature](val)
                if feature == 51506:
                    for one_hot in float_val:
                        x.append(one_hot)
                else:
                    x.append(float_val)
            else:
                x.append(float(val))
        except:
            x.append(0.0)
    return np.asarray(x)

In [103]:
import pickle

xTrain = {}
for patient in rawData['hadm_id'].unique():
    patientRows = rawData.loc[rawData['hadm_id'] == patient].sort_values(by=['charttime'])
    startTime = patientRows.iloc[0]['charttime']
    endTime = patientRows.iloc[len(patientRows)-1]['charttime']
    
    # build 4 hour incremental feature vectors between start and end time
    bucketedTimes = {}
    currentTime = startTime;
    index = 0
    while(currentTime < endTime):
        startInterval = currentTime;
        endInterval = currentTime + np.timedelta64(4, 'h')
        bucketInterval = patientRows.loc[patientRows['charttime'] >= startInterval].loc[patientRows['charttime'] < endInterval]
        bucketedTimes[index] = extractFeature(bucketInterval)
        currentTime += np.timedelta64(4, 'h')
        index += 1
    xTrain[patient] = bucketedTimes
    
# TODO normalize timesteps into t0, t1, t2, etc.
# TODO fix data into floats numpy float array
# TODO quantify the non-number data using either one hot or scale
# Use a pickle file to store the training set\
# with open('fname', 'wb') as f:
#    pickle.dump(xTrain, f)

with open('training_lab_data.pkl', 'wb') as f:
    pickle.dump(xTrain, f)


In [100]:
import pickle
with open('test.pkl', 'wb') as f:
    pickle.dump(xTrain, f)

{0: array([30.5, 8.0, 1.0, 10.7, 87.0, 35.0, 18.6, 77.0, 30.6, 3.49, 35.5,
        4.4, 144.0, 115.0, 19.0, 14.0, 29.0, 2.0, 8.3, 0.0, 0.0, 0.0, 0.0,
        0.0, 4.1, 115.0, 0.0, 1.8, 1.5, 16.5, 42.1, 5.8, 0.0, 2.7, 64.0,
        50.0, 25.0, 5.0, 7.31, 1.018, 97.0, 22.0, -5.0, 41.0, 2.6, 0.0,
        0.0, None, 0.0, 202.0, 0.0, 0.0, 69.0, 1.0, 0.0, 0.0, 0.0],
       dtype=object),
 1: array([ 27.4 ,   6.1 ,   0.  ,   9.5 ,  87.  ,  34.9 ,  18.3 ,  70.  ,
         30.4 ,   3.14,   0.  ,   4.3 , 144.  , 118.  ,  18.  ,  12.  ,
         27.  ,   1.8 ,   7.5 ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          3.8 , 113.  ,   0.  ,   2.6 ,   1.6 ,  18.2 ,  45.  ,   5.1 ,
          0.  ,   2.7 ,   0.  ,   0.  ,   0.  ,   0.  ,   7.32,   0.  ,
         95.  ,  21.  ,  -5.  ,  39.  ,   2.4 ,   0.  ,   0.  ,   0.  ,
          0.  , 185.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  ]),
 2: array([29.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  

In [59]:
patientRows.loc[patientRows['charttime'] >= start].loc[patientRows['charttime'] < end]

Unnamed: 0,row_id,subject_id,hadm_id,itemid,charttime,value,valuenum,valueuom,flag
87231,23561502,66745,192825,50970,2145-09-01 21:35:00,4.1,4.1,mg/dL,delta
87217,23561488,66745,192825,50863,2145-09-01 21:35:00,64,64.0,IU/L,
87218,23561489,66745,192825,50868,2145-09-01 21:35:00,14,14.0,mEq/L,
87219,23561490,66745,192825,50878,2145-09-01 21:35:00,50,50.0,IU/L,abnormal
87220,23561491,66745,192825,50882,2145-09-01 21:35:00,19,19.0,mEq/L,abnormal
...,...,...,...,...,...,...,...,...,...
89801,23561547,66745,192825,50821,2145-09-01 22:09:00,97,97.0,mm Hg,
89795,23561541,66745,192825,50804,2145-09-01 22:09:00,22,22.0,mEq/L,
89802,23561548,66745,192825,51082,2145-09-02 01:30:00,115,115.0,mg/dL,
89803,23561549,66745,192825,51087,2145-09-02 01:30:00,RANDOM,,,


In [11]:
patientRows['itemid'].value_counts()

50800    176
50971    175
51221    175
50983    172
50868    170
        ... 
51216      1
51208      1
51076      1
51000      1
51438      1
Name: itemid, Length: 182, dtype: int64

In [1]:
with open('training_lab_data.pkl', 'rb') as f:
    xTrain = pickle.load(f)

xTrain_array = {}
i = 0
for hadm_id in xTrain:
    try:
        xTrain_array[hadm_id] = np.stack(xTrain[hadm_id].values(), axis=0)
    except:
        s = {}
        j = 0
        for val, i in enumerate(xTrain[hadm_id].keys()):
            if len(xTrain[hadm_id][i]) == 57:
                s[i] = xTrain[hadm_id][i]
            else:
                j+=1
        if len(s) != 0:
            xTrain_array[hadm_id] = np.stack(s.values(), axis=0)
        print(str(hadm_id) + " dropped " + str(j) + " rows of " + str(len(xTrain[hadm_id].keys())))

with open('lab_event_training_set.pkl', 'wb') as f:
    pickle.dump(xTrain_array, f)

FileNotFoundError: [Errno 2] No such file or directory: 'training_lab_data.pkl'