# Data Collection Master Notebook

This dataset primarily revolves around the lab events and free-text notes found in MIMIC-III, located in the labevents and noteevents tables in the database.

This notebook shares the data collection methods used for each of our visualizations. Each group member curated a dataset, and each one is outlined below. However, running the notebook is not recommended, as some processes take a long time to run. Instead, use the finished files shared in the 'Data/' folder of this GitHub repository.

## Setup

### Imports

In [None]:
import sqlite3
import pandas as pd
import re
import json

### Sqlite3 Setup

In [None]:
# Absolute path to the local mimic3.db SQLite file (machine-specific; adjust before running)
db_path = '/mnt/f/mimic-iii-clinical-database-1.4/mimic3.db'
# Open a connection to the database file
sqliteConnection = sqlite3.connect(db_path)
# Cursor through which the cells below issue their queries
mimiciii = sqliteConnection.cursor()

### Helper Functions

In [None]:
def get_col_names(cursor, table_name):
    '''
    Retrieves the column names for a table in a sqlite3 db.

    The names are parsed out of the CREATE statement stored in
    sqlite_master, so only double-quoted identifiers are picked up.
    ------
    cursor: sqliteConnection cursor object
    table_name: table_name to get column names for

    returns: list of the double-quoted names, in the order they appear
    raises: IndexError if sqlite_master has no entry named table_name
    '''
    # Bind the name as a parameter instead of formatting it into the SQL text.
    cursor.execute(
        "SELECT sql FROM sqlite_master WHERE name = ?;",
        (table_name,),
    )

    # Read from the cursor that was passed in, not the notebook-level one.
    res = cursor.fetchall()
    if not res:
        raise IndexError(f"no entry named {table_name!r} in sqlite_master")

    # The capture group returns each name without its surrounding quotes.
    return re.findall(r'"(\w+)"', res[0][0])

In [None]:
def condense_notes(admission_df, noteevents_df):
    '''
    Combines all notes belonging to one admission into a single text.
    ------
    admission_df: df with a HADM_ID column; one output row per unique HADM_ID
    noteevents_df: df with HADM_ID, CATEGORY, DESCRIPTION and TEXT columns

    returns: df with columns HADM_ID (sorted unique ids) and TEXT (the
             combined notes, or '' when the admission has no notes)
    '''
    admission_ids = admission_df.HADM_ID.unique()
    admission_ids.sort()

    # Split the notes by admission once, instead of rescanning the whole
    # notes table for every admission id.
    notes_by_admission = {
        hadm_id: notes
        for hadm_id, notes in noteevents_df.groupby('HADM_ID', sort=False)
    }

    notes_list = []
    for adm_id in admission_ids:
        curr_adm = notes_by_admission.get(adm_id)
        notes_list.append('' if curr_adm is None else _format_chart(curr_adm))

    return pd.DataFrame({'HADM_ID': admission_ids, 'TEXT': notes_list})


def _format_chart(admission_notes):
    '''
    Builds the combined text for the notes of one admission, one section
    per category in order of first appearance.
    '''
    parts = []
    for category in admission_notes.CATEGORY.unique():
        parts.append('----CATEGORY: ' + category + '----\n\n')
        in_category = admission_notes[admission_notes.CATEGORY == category]

        for description, text in zip(in_category.DESCRIPTION, in_category.TEXT):
            parts.append('--NEW: ' + description + '--\n')
            parts.append(text + '\n')

        parts.append('\n\n')

    return ''.join(parts)

In [None]:
def get_df_from_table_from_db(cursor, table_name, num_rows='*', skip_cols=[]):
    '''
    Loads a table from a sqlite3 db into a pandas df.
    ------
    cursor: sqliteConnection cursor object
    table_name: name of table to get from cursor db
    num_rows: number of rows to retrieve (or '*' for all rows)
    skip_cols: list of columns to leave out of the retrieval

    returns: df whose columns are the retrieved column names
    '''
    selected = [name for name in get_col_names(cursor, table_name) if name not in skip_cols]
    column_list = ', '.join(selected)

    # Only add a LIMIT clause when a row count was requested.
    sql = f'select {column_list} from {table_name}'
    if num_rows != '*':
        sql += f' limit {num_rows}'
    sql += ';'

    cursor.execute(sql)
    return pd.DataFrame(cursor.fetchall(), columns=selected)

In [None]:
def get_tables_list_from_db(cursor):
    '''
    Lists the names of the tables in a sqlite3 db.
    ------
    cursor: sqliteConnection cursor object

    returns: list of table names read from sqlite_master
    '''
    # The schema table is named sqlite_master (underscore); with a hyphen
    # the statement is rejected as invalid SQL.
    cursor.execute("""
    select name from sqlite_master where type='table';
    """)
    return [row[0] for row in cursor.fetchall()]

## Get HADM_IDs associated with ARF

In [None]:
#all admissions
admission = get_df_from_table_from_db(mimiciii, 'admissions')
# Turn every column into upper-case text so the diagnosis search is not case-sensitive
admission = admission.apply(lambda x: x.astype(str).str.upper())
# The conversion above also turned the ID columns into strings; cast them back
# so HADM_ID compares correctly with 0 and with the integer IDs from other tables
admission.HADM_ID = admission.HADM_ID.astype('int64')
admission.SUBJECT_ID = admission.SUBJECT_ID.astype('int64')

In [None]:
#keep admissions whose diagnosis text matches any respiratory-failure spelling
arf_terms = ('RESPIRATORY FAILURE', 'RESP. FAILURE', 'RESP FAILURE')
is_arf = admission.DIAGNOSIS.str.contains(arf_terms[0])
for term in arf_terms[1:]:
    is_arf = is_arf | admission.DIAGNOSIS.str.contains(term)
arf_adm = admission[is_arf]

#remove the chronic respiratory failure diagnoses, then rows whose HADM_ID is 0
to_exclude = ['CHRONIC RESPIRATORY FAILURE;AIRWAY OBSTRUCTION', 'CHRONIC RESPIRATORY FAILURE', 'CHRONIC RESPIRATORY FAILURE; TRAC OBSTRUCTED AIRWAY']
is_excluded = arf_adm.DIAGNOSIS.isin(to_exclude)
arf_adm = arf_adm[~is_excluded]
arf_adm = arf_adm[arf_adm.HADM_ID != 0]

#HADM_IDs of the admissions that remain
arf_adm_ID = arf_adm.HADM_ID.to_list()

arf_adm.head()

In [None]:
#load every row of the drgcodes table
drgcodes = get_df_from_table_from_db(mimiciii, 'drgcodes')
drgcodes['DESCRIPTION'] = drgcodes['DESCRIPTION'].astype(str)

#rows whose description mentions respiratory failure and whose HADM_ID is not 0
# NOTE(review): this match is case-sensitive, unlike the upper-cased admissions search - confirm that is intended
mentions_arf = drgcodes['DESCRIPTION'].str.contains('RESPIRATORY FAILURE')
has_hadm_id = drgcodes['HADM_ID'] != 0
arf_drg = drgcodes[mentions_arf & has_hadm_id]

#HADM_IDs of the matching drg rows
arf_drg_ID = arf_drg['HADM_ID'].to_list()

arf_drg.head()

In [None]:
#combine drg and admission HADM_IDs associated with ARF
# Every ID is cast to int first so the same admission cannot be kept twice
# (once as text, once as a number); sorting makes the saved files reproducible.
arf_id_set = {int(hadm_id) for hadm_id in arf_drg_ID + arf_adm_ID}
arf_hadm_ids = sorted(arf_id_set)

#get all other diagnosis IDs
# A set difference replaces the per-ID scan of the ARF list.
other_hadm_ids = sorted({int(hadm_id) for hadm_id in admission.HADM_ID} - arf_id_set)

#save HADM_IDs associated with ARF to 'arf_hadm_ids.json' and OTHER diagnoses to 'other_hadm_ids.json'
with open('Data/arf_hadm_ids.json', 'w') as j_file:
    json.dump(arf_hadm_ids, j_file, indent=4)

with open('Data/other_hadm_ids.json', 'w') as j_file:
    json.dump(other_hadm_ids, j_file, indent=4)

## Govinda

This section produces the 'Data/DataViz_Project_DataFrame_Govinda.csv' file.

In [None]:
json_path = 'Data/arf_hadm_ids.json'

with open(json_path) as f:
    arf_hadm_ids = json.load(f)

# Convert 'hadm_id' values to integers
arf_hadm_ids = list(map(int, arf_hadm_ids))

# Load labevents CSV data
#labevents_df = pd.read_csv('LABEVENTS.csv')
labevents_df = get_df_from_table_from_db(mimiciii, 'labevents')

# This section addresses columns in lower case ('hadm_id', 'itemid', 'valuenum'),
# so normalise the names in case the database stores them in upper case
labevents_df.columns = labevents_df.columns.str.lower()

# Handle non-finite values in 'hadm_id' column
labevents_df['hadm_id'] = pd.to_numeric(labevents_df['hadm_id'], errors='coerce')
labevents_df = labevents_df.dropna(subset=['hadm_id'])

# Convert 'hadm_id' column to integers
labevents_df['hadm_id'] = labevents_df['hadm_id'].astype(int)

# Filter labevents_df based on arf_hadm_ids
arf_dataframe = labevents_df[labevents_df['hadm_id'].isin(arf_hadm_ids)]

# Save the filtered data to a CSV file
arf_csv_path = 'Data/arf_dataframe.csv'
#arf_dataframe.to_csv(arf_csv_path, index=False)

# Read the arf dataset 
#arf_dataframe = pd.read_csv(arf_csv_path)

# Display the number of rows in arf_dataframe
print("Number of rows in arf_dataframe:", len(arf_dataframe))

print(arf_dataframe.head(5))

In [None]:
# Lab events whose admission is not in the ARF list
is_arf_event = labevents_df['hadm_id'].isin(arf_hadm_ids)
other_diagnoses = labevents_df.loc[~is_arf_event]
other_diagnoses

# Destination for the non-ARF lab events (the write itself is left disabled)
other_diagnoses_csv_path = 'Data/other_diagnoses.csv'
#other_diagnoses.to_csv(other_diagnoses_csv_path, index=False)

In [None]:
# Drop blank values in 'valuenum' column
# (calling dropna on the selected column alone does not remove rows from the DataFrame)
arf_dataframe = arf_dataframe.dropna(subset=['valuenum'])

# Group by 'itemid' and calculate the average of 'valuenum'
arf_average_value = arf_dataframe.groupby('itemid')['valuenum'].mean().reset_index()

# Add a new column 'diagnosis' with the label 'arf' for each row
arf_average_value['diagnosis'] = 'arf'

# Save the average values to a CSV file
average_csv_path = 'Data/arf_average_value.csv'
#arf_average_value.to_csv(average_csv_path, index=False)

# Read the average values dataset
#arf_average_value = pd.read_csv(average_csv_path)

# Display the resulting DataFrame with average values
arf_average_value

In [None]:
# Drop blank values in 'valuenum' column for other diagnoses
# (calling dropna on the selected column alone does not remove rows from the DataFrame)
other_diagnoses = other_diagnoses.dropna(subset=['valuenum'])

# Group by 'itemid' and calculate the average of 'valuenum'
other_diagnoses_average_value = other_diagnoses.groupby('itemid')['valuenum'].mean().reset_index()

# Add a new column 'diagnosis' with the label 'other' for each row
other_diagnoses_average_value['diagnosis'] = 'other'

# Save the average values to a CSV file for other diagnoses
other_average_csv_path = 'Data/other_diagnoses_average_value.csv'
#other_diagnoses_average_value.to_csv(other_average_csv_path, index=False)

# Read the average values dataset for other diagnoses
#other_diagnoses_average_value = pd.read_csv(other_average_csv_path)

# Display the resulting DataFrame with average values for other diagnoses
other_diagnoses_average_value

In [None]:
# Stack the two per-item average tables, then spread them into one row per
# itemid with one column per diagnosis label
stacked_averages = pd.concat(
    [arf_average_value, other_diagnoses_average_value],
    ignore_index=True,
)
combined_df = stacked_averages.pivot_table(
    index='itemid',
    columns='diagnosis',
    values='valuenum',
    aggfunc='first',
).reset_index()
# Assigning a plain list also clears the 'diagnosis' name left on the column axis
combined_df.columns = ['itemid', 'arf', 'other']

# Destination for the combined table (the write itself is left disabled)
combined_csv_path = 'Data/combined_df.csv'
#combined_df.to_csv(combined_csv_path, index=False)

# Read the combined dataset
#combined_df = pd.read_csv(combined_csv_path)

# Show the combined table
combined_df

In [None]:
# Reading Lab Items Data
#labitems_df = pd.read_csv('D_LABITEMS.csv')
labitems_df = get_df_from_table_from_db(mimiciii, table_name='d_labitems')
# The merge in the next cell selects 'itemid' and 'label' in lower case,
# so normalise the names in case the database stores them in upper case
labitems_df.columns = labitems_df.columns.str.lower()
labitems_df

In [None]:
# Attach each item's label to the combined averages (left join keeps every itemid)
item_labels = labitems_df[['itemid', 'label']]
DataFrame = pd.merge(combined_df, item_labels, on='itemid', how='left')

# Show the merged table with the label next to the itemid
DataFrame[['itemid', 'label', 'arf', 'other']]

In [None]:
# Write the merged table to disk without the row index
DataFrame_csv_path = 'Data/DataViz_Project_DataFrame_Govinda.csv'
DataFrame.to_csv(path_or_buf=DataFrame_csv_path, index=False)

## Ed

In [None]:
#Load the admissions table as upper-case text, then restore the ID columns to integers
admission = get_df_from_table_from_db(mimiciii, 'admissions')
admission = admission.apply(lambda column: column.astype(str).str.upper())
for id_column in ('HADM_ID', 'SUBJECT_ID'):
    admission[id_column] = admission[id_column].astype('int64')

#Keep only the attributes used in this section
admission = admission[['HADM_ID','SUBJECT_ID','DIAGNOSIS']]
admission.head()

In [None]:
#Import json for HADM_ID list of arf diagnosis
# The list is written to (and read from) the 'Data/' folder elsewhere in this
# notebook, so it is loaded from the same location here.
with open('Data/arf_hadm_ids.json') as json_file:
    arf_data = json.load(json_file)
admission.loc[admission['HADM_ID'].isin(list(map(int, arf_data)))]

In [None]:
#Mark arf as arf, and the rest as other
# A scalar is broadcast to every row, so numpy (never imported here) is not needed
admission['WR_DIAGNOSIS'] = "Other"
arf_ids = list(map(int, arf_data))
admission.loc[admission['HADM_ID'].isin(arf_ids), 'WR_DIAGNOSIS'] = "ARF"
admission['WR_DIAGNOSIS'].value_counts()

In [None]:
# Read the labevents table, which is huge (left commented out; the cells below need `labevents` to be defined)
# labevents = get_df_from_table_from_db(mimiciii, 'labevents')
# labevents = labevents[~np.isnan(labevents['HADM_ID'])]
# labevents.HADM_ID = labevents.HADM_ID.astype('int64')
# labevents.to_csv("temp_labevent.csv",index=False)
# labevents.head()

In [None]:
# Load d_labitems, which is merged on ITEMID below to look up each test's LABEL
test_names = get_df_from_table_from_db(cursor=mimiciii, table_name='d_labitems')
test_names.head()

In [None]:
# Not every test type is performed for every HADM_ID, so only the ten tests
# with the most lab events are kept; this limits how much a later dropna removes
events_per_test = labevents.groupby('ITEMID').count()
top_ten = events_per_test.sort_values(by='HADM_ID', ascending=False).iloc[:10]

most_common_tests = pd.DataFrame({'ITEMID': top_ten.index, 'COUNT': top_ten['HADM_ID']})
most_common_tests = most_common_tests.reset_index(drop=True)

# Look up each test's label and keep only the id/label pair
with_labels = most_common_tests.merge(test_names, on='ITEMID', how='left')
most_common_tests = with_labels[['ITEMID','LABEL']]
most_common_tests

In [None]:
# Keep only the lab events that belong to one of the most common tests
common_item_ids = most_common_tests['ITEMID'].unique()
is_common_test = labevents['ITEMID'].isin(common_item_ids)
labevents = labevents[is_common_test]
labevents

In [None]:
# Restrict each table to the HADM_IDs that also occur in the other one
admission_ids = admission['HADM_ID'].unique()
labevents = labevents[labevents['HADM_ID'].isin(admission_ids)]

labevent_ids = labevents['HADM_ID'].unique()
admission = admission[admission['HADM_ID'].isin(labevent_ids)]

# Both counts should now agree
for table in (labevents, admission):
    print(len(table['HADM_ID'].unique()))

In [None]:
# Make columns for most common tests' values and abnormalities
import warnings
# Silences every warning from this point on
# (presumably to hide pandas' warning about writing to a filtered frame - confirm)
warnings.filterwarnings("ignore")
for name in most_common_tests['LABEL']:
    # A scalar is broadcast to every row, so numpy (never imported here) is not needed
    admission.loc[:, name] = ""
    admission.loc[:, name + "_ab"] = False
admission

In [None]:
# Collect values from labevents and fill them into admission dataframe
# Only the first recorded row of each (HADM_ID, ITEMID) pair is used. Selecting
# those rows once and mapping them onto the admissions replaces a rescan of
# labevents for every admission.
first_results = labevents.drop_duplicates(subset=['HADM_ID', 'ITEMID'], keep='first')

for item, label in zip(most_common_tests['ITEMID'], most_common_tests['LABEL']):
    item_results = first_results[first_results['ITEMID'] == item].set_index('HADM_ID')

    # Admissions without a result for this test are left as NaN in both columns
    admission[label] = admission['HADM_ID'].map(item_results['VALUE'])
    admission[label + '_ab'] = admission['HADM_ID'].map(item_results['FLAG'] == "abnormal")

In [None]:
# Write the wrangled admissions table to disk without the row index
admission.to_csv(path_or_buf="Data/ED_Wrangling_Result.csv", index=False)

## Kolton

In [None]:
#get all noteevents
noteevents = get_df_from_table_from_db(mimiciii, 'noteevents')
# NOTE(review): `fix_dot_zero` is not defined in the part of the notebook shown here;
# presumably it strips a trailing ".0" from HADM_ID values - confirm and define it before running
noteevents.HADM_ID = noteevents.HADM_ID.apply(fix_dot_zero)

In [None]:
#condense/combine all notes associated with single HADM_ID into one note
import os

condensed_notes = condense_notes(admission, noteevents)

#write one text file per admission; create the folder first so open() cannot fail on a missing directory
os.makedirs('all_notes', exist_ok=True)
for hadm_id, text in zip(condensed_notes['HADM_ID'], condensed_notes['TEXT']):
    # An explicit encoding keeps the files identical across platforms
    with open(f'all_notes/{hadm_id}.txt', 'w', encoding='utf-8') as f:
        f.write(text)

#save that file
condensed_notes.to_csv('Data/all_notes_raw.csv', index=False)

In [None]:
#save the first 500 condensed notes as a smaller sample file
# NOTE(review): unlike the other exports this one also writes the row index - confirm that is intended
subset = condensed_notes.iloc[:500]
subset.to_csv(path_or_buf='Data/subset_notes_raw.csv')