In [1]:
import json
import os
import re
import sqlite3
import sys

import numpy as np
import pandas as pd

In [2]:
# Location of the mimic3.db SQLite file. The default is the original local
# absolute path; set the MIMIC3_DB_PATH environment variable to point the
# notebook at a copy stored elsewhere without editing this cell.
db_path = os.environ.get('MIMIC3_DB_PATH', 'D:/mimic-iii-clinical-database-1.4/mimic3.db')
# connection object to db
sqliteConnection = sqlite3.connect(db_path)
# cursor/pointer used to run queries against the db
mimiciii = sqliteConnection.cursor()

In [3]:
#Helper functions from Kolton
def get_col_names(cursor, table_name):
    '''
    Retrieves the column names for a table in a sqlite3 db.

    The names are extracted from the CREATE statement stored in
    sqlite_master, so only identifiers written in double quotes in that
    statement are returned.
    ------
    cursor: sqliteConnection cursor object
    table_name: table_name to get column names for

    returns: list of column names (str), in the order they appear in the
             stored CREATE statement
    raises: IndexError if no sqlite_master entry is named table_name
    '''
    # Bind the table name as a parameter instead of formatting it into the SQL.
    cursor.execute(
        "SELECT sql FROM sqlite_master WHERE name=?;",
        (table_name,),
    )

    # Read from the cursor that was passed in, not from a module-level global.
    res = cursor.fetchall()
    cols = re.findall(r'\"\w+\"', res[0][0])
    # Strip the surrounding double quotes from each match.
    return [x[1:-1] for x in cols]

def get_df_from_table_from_db(cursor, table_name, num_rows='*', skip_cols=[]):
    '''
    Retrieves a table from a sqlite3 db as a pandas DataFrame.
    ------
    cursor: sqliteConnection cursor object
    table_name: name of the table to read through the cursor
    num_rows: number of rows to retrieve (or '*' for all rows)
    skip_cols: list of columns to leave out of the result

    returns: DataFrame whose columns are the table's columns minus skip_cols
    '''
    selected = [name for name in get_col_names(cursor, table_name) if name not in skip_cols]
    column_list = ', '.join(selected)

    # Only add a LIMIT clause when a row count was requested.
    limit_clause = '' if num_rows == '*' else f' limit {num_rows}'
    cursor.execute(f'''select {column_list} from {table_name}{limit_clause};''')

    return pd.DataFrame(cursor.fetchall(), columns=selected)

def get_tables_list_from_db(cursor):
    '''
    Lists the names of all tables in a sqlite3 db.
    ------
    cursor: sqliteConnection cursor object

    returns: list of table names (str)
    '''
    # The schema table is called sqlite_master (underscore); a hyphen is
    # parsed as a minus sign and makes the statement invalid.
    cursor.execute("""
    select name from sqlite_master where type='table';
    """)
    table_names = [table[0] for table in cursor.fetchall()]
    return table_names

def to_int(x):
    '''
    Converts x to an int, mapping falsy values and NaN to 0.
    ------
    x: value to convert
    '''
    # Falsy values short-circuit, so np.isnan is never called on them.
    is_missing = (not x) or np.isnan(x)
    return 0 if is_missing else int(x)

In [4]:
# Load the admissions table and keep only the two identifiers and the
# diagnosis text. Selecting the columns first avoids stringifying and
# upper-casing every other column just to throw it away; .copy() makes the
# result an independent frame rather than a slice of the full table.
admission = get_df_from_table_from_db(mimiciii, 'admissions')
admission = admission[['HADM_ID', 'SUBJECT_ID', 'DIAGNOSIS']].copy()

admission['HADM_ID'] = admission['HADM_ID'].astype('int64')
admission['SUBJECT_ID'] = admission['SUBJECT_ID'].astype('int64')
# NOTE: astype(str) also stringifies missing values, so a missing diagnosis
# ends up as a literal string (e.g. 'NONE' / 'NAN') rather than a null.
admission['DIAGNOSIS'] = admission['DIAGNOSIS'].astype(str).str.upper()
admission.head()

Unnamed: 0,HADM_ID,SUBJECT_ID,DIAGNOSIS
0,165315,22,BENZODIAZEPINE OVERDOSE
1,152223,23,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...
2,124321,23,BRAIN MASS
3,161859,24,INTERIOR MYOCARDIAL INFARCTION
4,129635,25,ACUTE CORONARY SYNDROME


In [5]:
# #Finding other potential majors
# temp_adm = admission.groupby('DIAGNOSIS').count()
# np.set_printoptions(threshold=sys.maxsize)
# other_major = np.array(temp_adm[temp_adm['HADM_ID']>1000].index)
# other_major

In [6]:
# Read the ids stored in arf_hadm_ids.json, then show the admissions whose
# HADM_ID is among them.
arf_data = []
with open('arf_hadm_ids.json') as fh:
    arf_data = json.load(fh)
admission.loc[admission['HADM_ID'].isin([int(hadm) for hadm in arf_data])]

Unnamed: 0,HADM_ID,SUBJECT_ID,DIAGNOSIS
106,175533,101,RESPIRATORY FAILURE
356,190462,501,ASTHMA-COPD EXACERBATION
361,103194,505,HYPOTENSION
407,166989,404,PNEUMONIA
575,111199,281,"PNEUMONIA,HYPONATREMIA"
...,...,...,...
58022,172731,96581,PNEUMONIA
58051,112107,91024,ASTHMA;CHRONIC OBST PULM DISEASE
58428,177517,99613,RESPIRATORY FAILURE;PNEUMONIA;CHRONIC OBST PUL...
58623,119334,97772,RESPIRATORY FAILURE


In [7]:
# admission[admission['DIAGNOSIS'].isin(other_major)]

In [8]:
# Label each admission: "ARF" when its HADM_ID is listed in arf_data,
# "Other" otherwise. (No rows are removed here.)
admission['WR_DIAGNOSIS'] = np.where(
    admission['HADM_ID'].isin([int(hadm) for hadm in arf_data]),
    "ARF",
    "Other",
)
# Disabled alternative that kept the frequent diagnoses as their own groups:
#admission.loc[admission['DIAGNOSIS'].isin(other_major),'WR_DIAGNOSIS'] = admission['DIAGNOSIS'] 
#admission = admission[admission['DIAGNOSIS'].isin(other_major)]
admission

Unnamed: 0,HADM_ID,SUBJECT_ID,DIAGNOSIS,WR_DIAGNOSIS
0,165315,22,BENZODIAZEPINE OVERDOSE,Other
1,152223,23,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,Other
2,124321,23,BRAIN MASS,Other
3,161859,24,INTERIOR MYOCARDIAL INFARCTION,Other
4,129635,25,ACUTE CORONARY SYNDROME,Other
...,...,...,...,...
58971,191113,98800,TRAUMA,Other
58972,101071,98802,SAH,Other
58973,122631,98805,RENAL CANCER/SDA,Other
58974,170407,98813,S/P FALL,Other


In [9]:
# labevents = get_df_from_table_from_db(mimiciii, 'labevents')
# labevents = labevents[~np.isnan(labevents['HADM_ID'])]
# labevents.to_csv("temp_labevent.csv",index=False)
# labevents.head()

In [10]:
# temp = temp[temp['HADM_ID'].isin(admission['HADM_ID'])]
# temp

In [11]:
# temp.to_csv("labevent.csv",index=False)

In [12]:
# Load the lab events from temp_labevent.csv (the file the commented-out
# export cell above writes) and make HADM_ID an integer key.
labevents = pd.read_csv('temp_labevent.csv')
labevents['HADM_ID'] = labevents['HADM_ID'].astype('int64')
labevents

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG
0,441,3,145834,50868,2101-10-20 16:40:00,17,17.0,mEq/L,
1,442,3,145834,50882,2101-10-20 16:40:00,25,25.0,mEq/L,
2,443,3,145834,50893,2101-10-20 16:40:00,8.2,8.2,mg/dL,abnormal
3,444,3,145834,50902,2101-10-20 16:40:00,99,99.0,mEq/L,abnormal
4,445,3,145834,50910,2101-10-20 16:40:00,48,48.0,IU/L,
...,...,...,...,...,...,...,...,...,...
22245029,27428435,96443,103219,50882,2109-12-30 01:40:00,26,26.0,mEq/L,
22245030,27428436,96443,103219,50885,2109-12-30 01:40:00,2.1,2.1,mg/dL,abnormal
22245031,27428437,96443,103219,50902,2109-12-30 01:40:00,97,97.0,mEq/L,
22245032,27428438,96443,103219,50911,2109-12-30 01:40:00,2,2.0,ng/mL,


In [13]:
# Load the d_labitems table; its ITEMID and LABEL columns are used below to
# attach a readable name to each lab test.
test_names = get_df_from_table_from_db(mimiciii, 'd_labitems')
test_names

Unnamed: 0,ROW_ID,ITEMID,LABEL,FLUID,CATEGORY,LOINC_CODE
0,546,51346,Blasts,Cerebrospinal Fluid (CSF),Hematology,26447-3
1,547,51347,Eosinophils,Cerebrospinal Fluid (CSF),Hematology,26451-5
2,548,51348,"Hematocrit, CSF",Cerebrospinal Fluid (CSF),Hematology,30398-2
3,549,51349,Hypersegmented Neutrophils,Cerebrospinal Fluid (CSF),Hematology,26506-6
4,550,51350,Immunophenotyping,Cerebrospinal Fluid (CSF),Hematology,
...,...,...,...,...,...,...
748,749,51551,VOIDED SPECIMEN,OTHER BODY FLUID,HEMATOLOGY,
749,750,51552,VOIDED SPECIMEN,STOOL,CHEMISTRY,
750,751,51553,VOIDED SPECIMEN,URINE,CHEMISTRY,
751,752,51554,VOIDED SPECIMEN,JOINT FLUID,HEMATOLOGY,


In [14]:
# Count lab events per ITEMID, keep the ten most frequent, and attach each
# test's LABEL from test_names.
most_common_tests = (
    labevents.groupby('ITEMID')['HADM_ID'].count()
    .sort_values(ascending=False)
    .head(10)
    .rename('COUNT')
    .reset_index()
    .merge(test_names, on='ITEMID', how='left')
    [['ITEMID', 'LABEL']]
)
most_common_tests

Unnamed: 0,ITEMID,LABEL
0,51221,Hematocrit
1,50971,Potassium
2,50983,Sodium
3,50902,Chloride
4,50912,Creatinine
5,50882,Bicarbonate
6,51006,Urea Nitrogen
7,50868,Anion Gap
8,51265,Platelet Count
9,50931,Glucose


In [15]:
# Keep only the lab events that belong to one of the most common tests.
is_common_test = labevents['ITEMID'].isin(most_common_tests['ITEMID'])
labevents = labevents[is_common_test]
labevents

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG
0,441,3,145834,50868,2101-10-20 16:40:00,17,17.0,mEq/L,
1,442,3,145834,50882,2101-10-20 16:40:00,25,25.0,mEq/L,
3,444,3,145834,50902,2101-10-20 16:40:00,99,99.0,mEq/L,abnormal
6,447,3,145834,50912,2101-10-20 16:40:00,3.2,3.2,mg/dL,abnormal
7,448,3,145834,50931,2101-10-20 16:40:00,91,91.0,mg/dL,
...,...,...,...,...,...,...,...,...,...
22244959,27427228,96442,120151,51221,2115-06-29 04:45:00,29.0,29.0,%,abnormal
22245027,27428433,96443,103219,50868,2109-12-30 01:40:00,15,15.0,mEq/L,
22245029,27428435,96443,103219,50882,2109-12-30 01:40:00,26,26.0,mEq/L,
22245031,27428437,96443,103219,50902,2109-12-30 01:40:00,97,97.0,mEq/L,


In [16]:
# Keep lab events whose admission is known, then keep admissions that still
# have at least one lab event; the two printed counts should match.
labevents = labevents[labevents['HADM_ID'].isin(admission['HADM_ID'])]
admission = admission[admission['HADM_ID'].isin(labevents['HADM_ID'])]
print(labevents['HADM_ID'].nunique())
print(admission['HADM_ID'].nunique())

57219
57219


In [17]:
import warnings

# Silences every warning for the rest of the session (presumably to hide the
# pandas warnings raised when columns are added to the filtered frame below).
warnings.filterwarnings("ignore")

# Add two placeholder columns per test: "<label>" for the value and
# "<label>_ab" for the abnormal flag.
for label in most_common_tests['LABEL']:
    admission.loc[:, label] = ""
    admission.loc[:, label + "_ab"] = False
admission

Unnamed: 0,HADM_ID,SUBJECT_ID,DIAGNOSIS,WR_DIAGNOSIS,Hematocrit,Hematocrit_ab,Potassium,Potassium_ab,Sodium,Sodium_ab,...,Bicarbonate,Bicarbonate_ab,Urea Nitrogen,Urea Nitrogen_ab,Anion Gap,Anion Gap_ab,Platelet Count,Platelet Count_ab,Glucose,Glucose_ab
0,165315,22,BENZODIAZEPINE OVERDOSE,Other,,False,,False,,False,...,,False,,False,,False,,False,,False
1,152223,23,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,Other,,False,,False,,False,...,,False,,False,,False,,False,,False
2,124321,23,BRAIN MASS,Other,,False,,False,,False,...,,False,,False,,False,,False,,False
3,161859,24,INTERIOR MYOCARDIAL INFARCTION,Other,,False,,False,,False,...,,False,,False,,False,,False,,False
4,129635,25,ACUTE CORONARY SYNDROME,Other,,False,,False,,False,...,,False,,False,,False,,False,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58971,191113,98800,TRAUMA,Other,,False,,False,,False,...,,False,,False,,False,,False,,False
58972,101071,98802,SAH,Other,,False,,False,,False,...,,False,,False,,False,,False,,False
58973,122631,98805,RENAL CANCER/SDA,Other,,False,,False,,False,...,,False,,False,,False,,False,,False
58974,170407,98813,S/P FALL,Other,,False,,False,,False,...,,False,,False,,False,,False,,False


In [18]:
# For every admission, record the first result (in labevents row order) of
# each of the most common tests: "<label>" gets its VALUE and "<label>_ab"
# gets whether its FLAG equals "abnormal". Admissions with no result for a
# test get NaN in both columns.
#
# This is done with one de-duplication plus one lookup per test instead of
# re-scanning labevents for every admission.
first_results = labevents.drop_duplicates(subset=['HADM_ID', 'ITEMID'], keep='first')

lab_columns = {}
for item, label in zip(most_common_tests['ITEMID'], most_common_tests['LABEL']):
    # One row per admission for this test, indexed by HADM_ID for the lookup.
    per_admission = first_results[first_results['ITEMID'] == item].set_index('HADM_ID')
    # map() leaves NaN where an admission has no result for this test.
    lab_columns[label] = admission['HADM_ID'].map(per_admission['VALUE'])
    # A missing FLAG compares unequal to "abnormal", i.e. False.
    lab_columns[label + '_ab'] = admission['HADM_ID'].map(per_admission['FLAG'] == "abnormal")

# assign() overwrites existing columns in place and appends any that are new,
# returning a fresh frame so re-running the cell gives the same result.
admission = admission.assign(**lab_columns)

In [19]:
# Display the admissions frame now that the per-test value and "_ab" columns
# have been filled in.
admission

Unnamed: 0,HADM_ID,SUBJECT_ID,DIAGNOSIS,WR_DIAGNOSIS,Hematocrit,Hematocrit_ab,Potassium,Potassium_ab,Sodium,Sodium_ab,...,Bicarbonate,Bicarbonate_ab,Urea Nitrogen,Urea Nitrogen_ab,Anion Gap,Anion Gap_ab,Platelet Count,Platelet Count_ab,Glucose,Glucose_ab
0,165315,22,BENZODIAZEPINE OVERDOSE,Other,35.9,True,4.4,False,140,False,...,28,False,17,False,13,False,259,False,102,False
1,152223,23,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,Other,21.9,True,3.9,False,143,False,...,23,False,14,False,12,False,95,True,128,True
2,124321,23,BRAIN MASS,Other,38.9,True,4.2,False,140,False,...,27,False,16,False,13,False,216,False,141,True
3,161859,24,INTERIOR MYOCARDIAL INFARCTION,Other,40.4,False,4.1,False,139,False,...,24,False,13,False,14,False,215,False,100,False
4,129635,25,ACUTE CORONARY SYNDROME,Other,37.9,True,3.3,False,134,False,...,23,False,50,True,19,False,269,False,378,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58971,191113,98800,TRAUMA,Other,43.8,False,3.3,False,142,False,...,15,True,9,False,6,True,241,False,64,True
58972,101071,98802,SAH,Other,42.2,False,,,,,...,,,20,False,,,271,False,,
58973,122631,98805,RENAL CANCER/SDA,Other,40.5,False,5.6,True,137,False,...,23,False,21,True,15,False,273,False,131,True
58974,170407,98813,S/P FALL,Other,29.3,True,3.9,False,144,False,...,23,False,5,True,14,False,272,False,148,True
