In [381]:
import os

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [245]:
# Hook tqdm into pandas; the `progress_apply` calls used later in this
# notebook (on a groupby and on a Series) rely on this having been run.
tqdm.pandas()

  from pandas import Panel


## Load data

In [111]:
# Directory holding the MIMIC CSV exports. It can be overridden with the
# MIMIC_DIR environment variable so the notebook is not tied to one user's
# home directory; the default is the location originally hard-coded here.
MIMIC_DIR = os.environ.get("MIMIC_DIR", "/home/mcb/users/xlu41/data/MIMIC")

# Raw, unmodified tables. Later cells derive working copies from these.
raw_adm = pd.read_csv(os.path.join(MIMIC_DIR, "ADMISSIONS.csv.gz"))
raw_patients = pd.read_csv(os.path.join(MIMIC_DIR, "PATIENTS.csv.gz"))
dicd = pd.read_csv(os.path.join(MIMIC_DIR, "DIAGNOSES_ICD.csv.gz"))
picd = pd.read_csv(os.path.join(MIMIC_DIR, "PROCEDURES_ICD.csv.gz"))

In [2]:
# Location of the notes table; the base directory can be overridden with the
# MIMIC_DIR environment variable (default: the path originally hard-coded).
NOTEEVENTS_PATH = os.path.join(
    os.environ.get("MIMIC_DIR", "/home/mcb/users/xlu41/data/MIMIC"),
    "NOTEEVENTS.csv",
)

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the warning fragment left under this cell looks like the tail
# of a pandas mixed-dtype warning, which this option addresses; confirm.
raw_notes = pd.read_csv(NOTEEVENTS_PATH, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


## Clean Admission dataframe

In [112]:
# Keep only the admissions that have at least one note.
# The IDs are taken from `raw_notes` rather than `notes`: `notes` is only
# created in a later cell, so referring to it here raises a NameError when the
# notebook is run top to bottom on a fresh kernel. Every HADM_ID that can
# appear in `notes` is also present in `raw_notes`, so the later left-merge of
# `adm` onto `notes` finds the same rows.
adm_in_notes = raw_notes.HADM_ID.unique()
adm = raw_adm[raw_adm.HADM_ID.isin(adm_in_notes)].copy()

## Process notes dataframe

In [224]:
# Note categories retained for the rest of the pipeline.
# NOTE(review): 'Physician ' carries a trailing space; it is kept exactly as
# written, presumably to match the raw CATEGORY values (confirm).
categories_keep = ['Nursing', 'Physician ', 'Nursing/other']

# Work on a copy so `raw_notes` stays untouched, then drop every other category.
notes = raw_notes.copy().loc[lambda df: df.CATEGORY.isin(categories_keep)]

Add information about death and discharge time

In [225]:
# Attach the discharge time and the HOSPITAL_EXPIRE_FLAG of each admission to
# its notes. A left join keeps every note, including those whose HADM_ID has
# no match in `adm` (their new columns are left empty).
# NOTE: re-running this cell merges the columns a second time.
notes = pd.merge(
    notes,
    adm.loc[:, ['HADM_ID', 'DISCHTIME', 'HOSPITAL_EXPIRE_FLAG']],
    how='left',
    on='HADM_ID',
)

Keep only the notes written more than 24 hours before discharge. Notes without a chart time are treated as written at 23:00 on their chart date.

In [226]:
# Parse the timestamp columns involved in the cutoff.
notes["DISCHTIME"] = pd.to_datetime(notes["DISCHTIME"])
notes["CHARTTIME"] = pd.to_datetime(notes["CHARTTIME"])

# CHARTDATE is shifted by 23 hours and then serves as the fallback timestamp
# for rows that have no CHARTTIME.
# NOTE: the shifted value is written back to the CHARTDATE column, so running
# this cell twice shifts it twice.
notes["CHARTDATE"] = pd.to_datetime(notes["CHARTDATE"]) + pd.DateOffset(hours=23)
notes["CHARTTIME"] = notes["CHARTTIME"].fillna(notes["CHARTDATE"])

# Keep the rows whose timestamp is strictly earlier than 24 hours before
# discharge; rows with a missing discharge time never satisfy the comparison.
notes = notes.loc[
    lambda df: df["CHARTTIME"] < df["DISCHTIME"] - pd.DateOffset(hours=24)
]

Keep every positive note. For negative notes, randomly select at most 4 notes per hospital admission (`HADM_ID`).

In [270]:
keep_cols = ['HADM_ID', 'SUBJECT_ID', 'TEXT', 'HOSPITAL_EXPIRE_FLAG']

# Cap on the number of notes kept for each admission with flag 0.
MAX_NEG_NOTES_PER_ADM = 4
# Fixed seed so the sampled dataset is identical on every run
# (the sampling was previously unseeded and therefore not reproducible).
NEG_SAMPLE_SEED = 0

# Every note of an admission flagged with HOSPITAL_EXPIRE_FLAG == 1 is kept.
pos_notes = notes.loc[notes.HOSPITAL_EXPIRE_FLAG == 1, keep_cols]

# For admissions flagged 0, draw MAX_NEG_NOTES_PER_ADM notes at random when
# the admission has at least that many, otherwise keep all of its notes.
neg_notes = (
    notes
    .loc[:, keep_cols]
    .query("HOSPITAL_EXPIRE_FLAG == 0")
    .groupby("HADM_ID")
    .progress_apply(
        lambda df: df.sample(n=MAX_NEG_NOTES_PER_ADM, random_state=NEG_SAMPLE_SEED)
        if df.shape[0] >= MAX_NEG_NOTES_PER_ADM
        else df
    )
    .reset_index(drop=True)
)

Recombine the positive and negative notes to get a roughly balanced dataset.

In [293]:
# Stack the positive and the sampled negative notes row-wise and drop rows
# that are exact duplicates, keeping the first occurrence.
sampled_notes = pd.concat([pos_notes, neg_notes], axis=0).drop_duplicates(keep="first")

# Class counts of the combined dataset (displayed as the cell output).
sampled_notes["HOSPITAL_EXPIRE_FLAG"].value_counts()

0.0    135476
1.0    124702
Name: HOSPITAL_EXPIRE_FLAG, dtype: int64

## Process text content

In [301]:
def isolate(text, chars):
    """Surround every occurrence of each item of ``chars`` with single spaces.

    ``chars`` is iterated item by item (character by character when it is a
    string), and the items are processed in order on the running result.

    Returns the padded string; ``text`` is returned unchanged when ``chars``
    is empty.
    """
    padded = text
    for symbol in chars:
        spaced_symbol = f" {symbol} "
        padded = padded.replace(symbol, spaced_symbol)
    return padded

In [300]:
def replace(text, chars, new=""):
    """Substitute ``new`` for every occurrence of each item of ``chars``.

    ``chars`` is iterated item by item, so a string argument is treated as a
    set of individual characters, not as one substring. With the default
    ``new=""`` the matching characters are simply removed.

    Returns the resulting string.
    """
    result = text
    for target in chars:
        result = result.replace(target, new)
    return result

In [323]:
def clean_text(text):
    """Strip marker characters, pad punctuation with spaces and lowercase.

    NOTE(review): ``replace`` walks its second argument character by
    character, so the two calls below delete every '[', '*' and ']' found in
    the text, not only the literal "[**" and "**]" sequences. Confirm this is
    the intended behavior.
    NOTE(review): the punctuation string lists the apostrophe twice, so an
    apostrophe ends up padded twice.
    """
    without_markers = replace(replace(text, "[**"), "**]")
    spaced = isolate(without_markers, "~!@#$%^&*()_+-={}:\";',./<>?\\|`'")
    return spaced.lower()

In [325]:
# Clean every note in place, with a progress bar.
# NOTE: not idempotent. Running this cell again pads the already padded
# punctuation a second time.
sampled_notes["TEXT"] = sampled_notes["TEXT"].progress_apply(clean_text)

HBox(children=(FloatProgress(value=0.0, max=260178.0), HTML(value='')))




## Train test split

In [359]:
# One row per subject. A subject whose notes carry both flag values would
# otherwise appear twice and could be assigned to several splits; because the
# note frames are later selected by SUBJECT_ID, all of that subject's notes
# would then leak across train/valid/test. The per-subject label used for
# stratification is the maximum flag over the subject's notes.
subjects = (
    sampled_notes
    .groupby("SUBJECT_ID", as_index=False)["HOSPITAL_EXPIRE_FLAG"]
    .max()
)

# 75% of the subjects go to train, the remaining 25% are split further below.
train_subj_df, rest_subj = train_test_split(
    subjects,
    test_size=0.25,
    random_state=0,
    stratify=subjects.HOSPITAL_EXPIRE_FLAG,
)

# The held-out subjects are split 40% / 60% into validation and test.
valid_subj, test_subj = train_test_split(
    rest_subj.SUBJECT_ID.values,
    test_size=0.6,
    random_state=1,
    stratify=rest_subj.HOSPITAL_EXPIRE_FLAG,
)

train_subj = train_subj_df.SUBJECT_ID.values

# No subject may belong to more than one split.
assert not set(train_subj) & set(valid_subj), "subject in both train and valid"
assert not set(train_subj) & set(test_subj), "subject in both train and test"
assert not set(valid_subj) & set(test_subj), "subject in both valid and test"

In [372]:
# Build one note frame per split by selecting the notes of the subjects
# assigned to it; each frame gets a fresh 0..n-1 index.
train_notes, valid_notes, test_notes = (
    sampled_notes.loc[sampled_notes["SUBJECT_ID"].isin(subject_ids)].reset_index(drop=True)
    for subject_ids in (train_subj, valid_subj, test_subj)
)

## Save

In [379]:
def touch_dir(dirname):
    """Make sure ``dirname`` exists as a directory and report what happened.

    Missing parent directories are created as well. A message is printed
    stating whether the directory was created or was already present.

    Raises:
        FileExistsError: if ``dirname`` exists but is not a directory. The
            previous ``os.path.exists`` check reported such a path as an
            existing directory, which only failed later when writing into it.
    """
    if os.path.isdir(dirname):
        print(f"Directory {dirname} already exists.")
        return

    # exist_ok=True tolerates the directory being created by someone else
    # between the check above and this call.
    os.makedirs(dirname, exist_ok=True)
    print(f"Created directory {dirname}.")

In [383]:
# Make sure the output directory is there, then write one CSV per split
# without the DataFrame index.
touch_dir("data")
for split_frame, csv_path in (
    (train_notes, "data/train.csv"),
    (valid_notes, "data/valid.csv"),
    (test_notes, "data/test.csv"),
):
    split_frame.to_csv(csv_path, index=False)

Directory data already exists.
