In [6]:
import json
import os
import re
import sqlite3

import pandas as pd

In [7]:
def get_col_names(cursor, table_name):
    '''
    Return the column names of a table, parsed from its stored CREATE statement.

    cursor: sqlite3 cursor object for the database to inspect
    table_name: name of the schema entry to look up in sqlite_master

    Returns a list of the double-quoted identifiers found in the stored SQL,
    in order of appearance, with the surrounding quotes stripped. Names that
    are not double-quoted in the CREATE statement are not detected.

    Raises IndexError if sqlite_master has no entry named table_name.
    '''
    # Bind the name as a parameter instead of formatting it into the SQL text,
    # so quote characters in table_name cannot break or alter the query.
    cursor.execute("SELECT sql FROM sqlite_master WHERE name = ?;", (table_name,))

    # Read from the cursor that was passed in, not from a notebook-level global.
    res = cursor.fetchall()
    cols = re.findall(r'\"\w+\"', res[0][0])
    return [x[1:-1] for x in cols]

def get_df_from_table_from_db(cursor, table_name, num_rows='*', skip_cols=None):
    '''
    Load a table (or only its first rows) into a DataFrame of strings.

    cursor: sqliteConnection cursor object
    table_name: name of table to get from cursor db
    num_rows: number of rows to retrieve (or '*' for all rows)
    skip_cols: optional list of columns to skip in the retrieval
               (None, the default, skips nothing)

    Returns a pandas DataFrame built with dtype=str whose columns are the
    retrieved column names, in the order get_col_names reports them.

    Raises ValueError/TypeError if num_rows is neither '*' nor convertible
    to an integer.
    '''
    # None stands in for "skip nothing" so no mutable list is shared as a default.
    skipped = [] if skip_cols is None else skip_cols

    col_names = get_col_names(cursor, table_name)
    use_cols = [col for col in col_names if col not in skipped]

    # Identifiers cannot be bound as SQL parameters, so the column list and the
    # table name are still formatted into the statement; table_name is expected
    # to come from trusted notebook code.
    query = f'''select {', '.join(use_cols)} from {table_name}'''

    if num_rows == '*':
        cursor.execute(query + ';')
    else:
        # Validate the row limit and bind it rather than pasting it into the SQL.
        cursor.execute(query + ' limit ?;', (int(num_rows),))
    rows = cursor.fetchall()

    return pd.DataFrame(rows, columns=use_cols, dtype=str)

In [8]:
# Read the ARF admission ids from their JSON file.
with open('../Data/arf_hadm_ids.json', 'r') as j_file:
    arf_hadm_ids = json.loads(j_file.read())

# Read the remaining ("other") admission ids the same way.
with open('../Data/other_hadm_ids.json', 'r') as j_file:
    other_hadm_ids = json.loads(j_file.read())

In [9]:
# Size of each id collection, one count per line (ARF first, then other).
print(len(arf_hadm_ids), len(other_hadm_ids), sep='\n')

751
58501


In [10]:
# Lookup from admission id to label. The 'OTHER' entries are added second, so
# an id present in both collections ends up labelled 'OTHER'.
diagnosis_map = dict.fromkeys(arf_hadm_ids, 'ARF')
diagnosis_map.update(dict.fromkeys(other_hadm_ids, 'OTHER'))

In [11]:
# Location of the SQLite database. Set the MIMIC3_DB_PATH environment variable
# to point at a different copy; otherwise the original hard-coded path is used.
db_path = os.environ.get('MIMIC3_DB_PATH', '/mnt/f/mimic-iii-clinical-database-1.4/mimic3.db')

# sqlite3.connect() silently creates a new, empty database when the file does
# not exist, which would only surface later as a confusing lookup failure.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f'SQLite database not found: {db_path}')

sqliteConnection = sqlite3.connect(db_path)
mimiciii = sqliteConnection.cursor()

In [12]:
# Load the full admissions table and label every admission through
# diagnosis_map; ids missing from the map get a missing value.
admissions = get_df_from_table_from_db(mimiciii, 'admissions').assign(
    CUSTOM_DIAGNOSIS=lambda frame: frame['HADM_ID'].map(diagnosis_map)
)
admissions.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,CUSTOM_DIAGNOSIS
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1,OTHER
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1,OTHER
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1,OTHER
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1,OTHER
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1,OTHER


In [16]:
# Read the lab results CSV with every column as str, so the key columns have
# the same dtype as the admissions frame (also built with dtype=str).
labs = pd.read_csv('../Data/ED_Wrangling_Result.csv', dtype=str)
labs.head()

Unnamed: 0,HADM_ID,SUBJECT_ID,DIAGNOSIS,WR_DIAGNOSIS,Hematocrit,Hematocrit_ab,Potassium,Potassium_ab,Sodium,Sodium_ab,...,Bicarbonate,Bicarbonate_ab,Urea Nitrogen,Urea Nitrogen_ab,Anion Gap,Anion Gap_ab,Platelet Count,Platelet Count_ab,Glucose,Glucose_ab
0,165315,22,BENZODIAZEPINE OVERDOSE,Other,35.9,True,4.4,False,140,False,...,28,False,17,False,13,False,259,False,102,False
1,152223,23,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,Other,21.9,True,3.9,False,143,False,...,23,False,14,False,12,False,95,True,128,True
2,124321,23,BRAIN MASS,Other,38.9,True,4.2,False,140,False,...,27,False,16,False,13,False,216,False,141,True
3,161859,24,INTERIOR MYOCARDIAL INFARCTION,Other,40.4,False,4.1,False,139,False,...,24,False,13,False,14,False,215,False,100,False
4,129635,25,ACUTE CORONARY SYNDROME,Other,37.9,True,3.3,False,134,False,...,23,False,50,True,19,False,269,False,378,True


In [17]:
# Sanity check: HADM_ID should be object (string) dtype after the dtype=str read.
labs.HADM_ID.dtype

dtype('O')

In [52]:
# Left join: keep every admission and attach its lab columns where the
# (HADM_ID, SUBJECT_ID, DIAGNOSIS) triple matches a row in labs.
pcp_data = admissions.merge(
    labs,
    how='left',
    on=['HADM_ID', 'SUBJECT_ID', 'DIAGNOSIS'],
)
pcp_data.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,Bicarbonate,Bicarbonate_ab,Urea Nitrogen,Urea Nitrogen_ab,Anion Gap,Anion Gap_ab,Platelet Count,Platelet Count_ab,Glucose,Glucose_ab
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,...,28,False,17,False,13,False,259,False,102,False
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,...,23,False,14,False,12,False,95,True,128,True
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,...,27,False,16,False,13,False,216,False,141,True
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,...,24,False,13,False,14,False,215,False,100,False
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,...,23,False,50,True,19,False,269,False,378,True


In [53]:
# List the columns produced by the admissions/labs merge.
pcp_data.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
       'DEATHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION',
       'DISCHARGE_LOCATION', 'INSURANCE', 'LANGUAGE', 'RELIGION',
       'MARITAL_STATUS', 'ETHNICITY', 'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS',
       'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA', 'CUSTOM_DIAGNOSIS',
       'WR_DIAGNOSIS', 'Hematocrit', 'Hematocrit_ab', 'Potassium',
       'Potassium_ab', 'Sodium', 'Sodium_ab', 'Chloride', 'Chloride_ab',
       'Creatinine', 'Creatinine_ab', 'Bicarbonate', 'Bicarbonate_ab',
       'Urea Nitrogen', 'Urea Nitrogen_ab', 'Anion Gap', 'Anion Gap_ab',
       'Platelet Count', 'Platelet Count_ab', 'Glucose', 'Glucose_ab'],
      dtype='object')

In [54]:
def get_tables_list_from_db(cursor):
    '''
    List the tables recorded in the database schema.

    cursor: sqlite3 cursor object for the database to inspect

    Returns a list with the name of every sqlite_master entry of type 'table',
    in the order the query returns them.
    '''
    cursor.execute("""
    select name from sqlite_master where type='table';
    """)
    # Each fetched row holds only the selected name column.
    fetched = cursor.fetchall()
    return [name for name, *_ in fetched]

In [55]:
# Load every row and column of the patients table.
patients = get_df_from_table_from_db(cursor=mimiciii, table_name='patients')
patients.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,234,249,F,2075-03-13 00:00:00,,,,0
1,235,250,F,2164-12-27 00:00:00,2188-11-22 00:00:00,2188-11-22 00:00:00,,1
2,236,251,M,2090-03-15 00:00:00,,,,0
3,237,252,M,2078-03-06 00:00:00,,,,0
4,238,253,F,2089-11-26 00:00:00,,,,0


In [56]:
# Re-check the current columns before the patients table is joined in.
pcp_data.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
       'DEATHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION',
       'DISCHARGE_LOCATION', 'INSURANCE', 'LANGUAGE', 'RELIGION',
       'MARITAL_STATUS', 'ETHNICITY', 'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS',
       'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA', 'CUSTOM_DIAGNOSIS',
       'WR_DIAGNOSIS', 'Hematocrit', 'Hematocrit_ab', 'Potassium',
       'Potassium_ab', 'Sodium', 'Sodium_ab', 'Chloride', 'Chloride_ab',
       'Creatinine', 'Creatinine_ab', 'Bicarbonate', 'Bicarbonate_ab',
       'Urea Nitrogen', 'Urea Nitrogen_ab', 'Anion Gap', 'Anion Gap_ab',
       'Platelet Count', 'Platelet Count_ab', 'Glucose', 'Glucose_ab'],
      dtype='object')

In [57]:
# Inner join on SUBJECT_ID: rows without a matching patient are dropped.
# Both frames carry ROW_ID, so pandas suffixes them as ROW_ID_x / ROW_ID_y.
pcp_data = pcp_data.merge(patients, how='inner', on='SUBJECT_ID')

# Derive the birth year from the DOB timestamp strings.
pcp_data = pcp_data.assign(DOB_YEAR=pd.to_datetime(pcp_data['DOB']).dt.year)

pcp_data.head()

Unnamed: 0,ROW_ID_x,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,Glucose,Glucose_ab,ROW_ID_y,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG,DOB_YEAR
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,...,102,False,19,F,2131-05-07 00:00:00,,,,0,2131
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,...,128,True,20,M,2082-07-17 00:00:00,,,,0,2082
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,...,141,True,20,M,2082-07-17 00:00:00,,,,0,2082
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,...,100,False,21,M,2100-05-31 00:00:00,,,,0,2100
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,...,378,True,22,M,2101-11-21 00:00:00,,,,0,2101


In [61]:
# Columns kept for the exported dataset: identifiers and outcome fields
# first, then the lab measurements (the *_ab columns are left out).
columns_to_keep = [
    'HADM_ID', 'EXPIRE_FLAG', 'DOB_YEAR', 'WR_DIAGNOSIS', 'DIAGNOSIS',
    'Hematocrit', 'Potassium', 'Sodium', 'Chloride', 'Creatinine',
    'Bicarbonate', 'Urea Nitrogen', 'Anion Gap', 'Platelet Count', 'Glucose',
]

pcp_data = pcp_data.loc[:, columns_to_keep]
pcp_data.head()

Unnamed: 0,HADM_ID,EXPIRE_FLAG,DOB_YEAR,WR_DIAGNOSIS,DIAGNOSIS,Hematocrit,Potassium,Sodium,Chloride,Creatinine,Bicarbonate,Urea Nitrogen,Anion Gap,Platelet Count,Glucose
0,165315,0,2131,Other,BENZODIAZEPINE OVERDOSE,35.9,4.4,140,103,0.6,28,17,13,259,102
1,152223,0,2082,Other,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,21.9,3.9,143,112,0.7,23,14,12,95,128
2,124321,0,2082,Other,BRAIN MASS,38.9,4.2,140,104,0.7,27,16,13,216,141
3,161859,0,2100,Other,INTERIOR MYOCARDIAL INFARCTION,40.4,4.1,139,105,0.9,24,13,14,215,100
4,129635,0,2101,Other,ACUTE CORONARY SYNDROME,37.9,3.3,134,95,1.6,23,50,19,269,378


In [66]:
# Write the trimmed frame to CSV; index=False leaves the row index out of the file.
pcp_data.to_csv('../Data/PCP_Data.csv', index=False)