## Import modules and load data

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import os
import h5py
from tqdm import tqdm

In [2]:
# Register tqdm with pandas; the progress_map / progress_apply calls used
# later in this notebook depend on this having been run.
tqdm.pandas()

In [60]:
# Raw clinical notes table; filtered and cleaned in the cells below.
df_notes = pd.read_csv('../data/original/NOTEEVENTS.csv', low_memory=False)

In [61]:
# ICU stay table; supplies the INTIME/OUTTIME window used to select notes.
df_icustays = pd.read_csv('../data/original/ICUSTAYS.csv')

In [62]:
# Pre-selected cohort: maps (subject_id, hadm_id, icustay_id) to pat_id
# (those columns are used as index/key further down).
multi_to_patid = pd.read_csv('../data/processed/multi_to_patid.csv')

## Filter & clean table

In [63]:
rel_cats = ['SUBJECT_ID','HADM_ID','CHARTDATE','CHARTTIME','CATEGORY','TEXT']
# Keep only the columns used downstream and drop notes that have no text.
# Done as one non-mutating expression so we never call an in-place method on
# a column-subset of the raw frame (avoids SettingWithCopy problems).
df_notes = df_notes[rel_cats].dropna(subset=['TEXT'])
# Parse the chart stamps so they can be compared with ICU stay times later on.
# Missing CHARTTIME values become NaT.
df_notes['CHARTDATE'] = pd.to_datetime(df_notes['CHARTDATE'])
df_notes['CHARTTIME'] = pd.to_datetime(df_notes['CHARTTIME'])

Create multi-level index

In [64]:
# Index notes by patient and hospital admission so they can be joined to ICU stays.
df_notes = df_notes.set_index(['SUBJECT_ID', 'HADM_ID'])

Clean up category names

In [65]:
# Raw category label -> cleaned label (shorter name, trailing spaces removed).
category_aliases = {
    'Discharge summary': 'Discharge',
    'Physician ': 'Physician',
    'Respiratory ': 'Respiratory',
}
# Labels not listed above are passed through unchanged.
df_notes['CATEGORY'] = df_notes['CATEGORY'].map(lambda cat: category_aliases.get(cat, cat))

Select common categories

In [66]:
# Note categories kept for the rest of the pipeline.
relevant_cats = ["Discharge","Nursing/other","Radiology","Nursing","ECG","Physician","Echo","Respiratory","Nutrition","General"]
is_relevant = df_notes.CATEGORY.isin(relevant_cats)
df_notes = df_notes.loc[is_relevant]

## Clean up df_icustays

In [67]:
# Parse the stay boundaries so they can be compared against note timestamps
df_icustays['INTIME'] = pd.to_datetime(df_icustays['INTIME'])
df_icustays['OUTTIME'] = pd.to_datetime(df_icustays['OUTTIME'])
# Midnight-truncated copies, used when a note has no 'CHARTTIME' to compare
df_icustays['INDATE'] = df_icustays['INTIME'].dt.normalize()
df_icustays['OUTDATE'] = df_icustays['OUTTIME'].dt.normalize()
# Select only the first ICU stay for a patient across all admissions.
# NOTE: groupby(...).first() returns the first NON-NULL value of each column,
# so a first stay with a missing OUTTIME would silently receive the OUTTIME of
# a later stay. Sorting and dropping duplicates keeps the earliest stay as one
# intact row instead.
df_icustays = (
    df_icustays
    .sort_values(by='INTIME')
    .drop_duplicates(subset='SUBJECT_ID', keep='first')
)
# Select the minimal information needed to match ICU stays to clinical notes
df_icustays = df_icustays[["SUBJECT_ID","HADM_ID","ICUSTAY_ID","INTIME","INDATE","OUTTIME","OUTDATE"]]
# Impossible to verify notes belong to an ICU stay without a time
df_icustays = df_icustays.dropna(subset=['INTIME','OUTTIME'])
# Each row is now a single patient's first ICU visit
assert df_icustays['SUBJECT_ID'].is_unique
df_icustays = df_icustays.set_index(['SUBJECT_ID','HADM_ID','ICUSTAY_ID'])

### Limit notes to the first ICU stay

In [68]:
# Attach every note to its patient's first ICU stay; the inner join drops
# stays without notes and notes without a matching stay. join() returns a new
# frame, so df_icustays itself is left untouched.
df_notes_icu = df_icustays.join(df_notes, how='inner')

In [69]:
# Notes that carry a timestamp are checked against the exact stay window,
# notes with only a date are checked against the stay's calendar days.
has_charttime = df_notes_icu.CHARTTIME.notna()
# Both bounds are inclusive
date_val = df_notes_icu.CHARTDATE.between(df_notes_icu.INDATE, df_notes_icu.OUTDATE)
time_val = df_notes_icu.CHARTTIME.between(df_notes_icu.INTIME, df_notes_icu.OUTTIME)
datetime_val = (has_charttime & time_val) | (~has_charttime & date_val)
# Keep only the notes written during the stay
df_notes_icu = df_notes_icu.loc[datetime_val]

### Match notes to the pre-selected cohort

In [70]:
# Give both frames the same lower-case index level names so they can be joined.
cohort_keys = ['subject_id','hadm_id','icustay_id']
multi_to_patid = multi_to_patid.set_index(cohort_keys)
df_notes_icu = df_notes_icu.rename_axis(index=cohort_keys)

In [71]:
# Left join from the cohort table: every cohort row is kept, and cohort rows
# with no matching note get NaN in the note columns (removed in the next cell).
df_notes_icu = multi_to_patid.join(df_notes_icu, how='left')

In [72]:
# Drop cohort rows that found no note/stay in the left join above.
df_notes_icu = df_notes_icu.dropna(subset=['INTIME', 'INDATE'])

In [73]:
# From here on notes are keyed by pat_id only; the original identifiers are dropped.
df_notes_icu = (
    df_notes_icu
    .reset_index()
    .set_index('pat_id')
    .drop(columns=['subject_id', 'hadm_id', 'icustay_id'])
)

### Split Static from Time series notes

In [74]:
# Discharge notes are treated as static; every other category is time series.
is_discharge = df_notes_icu.CATEGORY == 'Discharge'
df_notes_static = df_notes_icu[is_discharge].copy()
df_notes_ts = df_notes_icu[~is_discharge].copy()

### Trim static notes

In [75]:
# Rank each patient's discharge notes by character count, longest first,
# then take the first entry per patient.
df_notes_static["TEXT_LEN"] = df_notes_static.TEXT.map(len)
longest_first = df_notes_static.sort_values(by='TEXT_LEN', ascending=False)
df_notes_static = longest_first.groupby('pat_id').first()

In [76]:
# Only the note text is kept for static notes (still a one-column DataFrame).
df_notes_static = df_notes_static[['TEXT']]

### Handle ECG and Echo

In [77]:
# Concatenate all ECG (resp. Echo) notes a patient has on the same chart date
# into one string, with ' [SEP] ' between the individual notes.
ecg_notes = df_notes_ts[df_notes_ts.CATEGORY == 'ECG']
merged_ecg = (
    ecg_notes
    .groupby(['pat_id', 'CHARTDATE'])['TEXT']
    .agg(lambda texts: ' [SEP] '.join(texts))
)
echo_notes = df_notes_ts[df_notes_ts.CATEGORY == 'Echo']
merged_echo = (
    echo_notes
    .groupby(['pat_id', 'CHARTDATE'])['TEXT']
    .agg(lambda texts: ' [SEP] '.join(texts))
)

In [78]:
# One row per (pat_id, CHARTDATE) for ECG notes; the TEXT column is then
# overwritten with the merged text computed above. The assignment is
# positional, so it relies on merged_ecg having the same group order.
df_ecg = (
    df_notes_ts[df_notes_ts.CATEGORY == 'ECG']
    .reset_index()
    .groupby(['pat_id', 'CHARTDATE'])
    .first()
)
df_ecg['TEXT'] = merged_ecg.values
df_ecg = df_ecg.reset_index().set_index('pat_id')

In [79]:
# One row per (pat_id, CHARTDATE) for Echo notes; the TEXT column is then
# overwritten with the merged text computed above. The assignment is
# positional, so it relies on merged_echo having the same group order.
df_echo = (
    df_notes_ts[df_notes_ts.CATEGORY == 'Echo']
    .reset_index()
    .groupby(['pat_id', 'CHARTDATE'])
    .first()
)
df_echo['TEXT'] = merged_echo.values
df_echo = df_echo.reset_index().set_index('pat_id')

In [80]:
# ECG/Echo notes are positioned by chart date: hours between the chart date
# and the stay's admission date, as an integer.
df_dateonly = pd.concat([df_echo, df_ecg])
days_since_indate = df_dateonly.CHARTDATE - df_dateonly.INDATE
df_dateonly['hours_in'] = (days_since_indate / pd.Timedelta(hours=1)).astype(int)

### Create Hours In attribute

In [81]:
# ECG and Echo were handled separately above; the remaining notes are
# positioned by the whole hours elapsed between ICU admission and chart time.
date_only_cats = ['ECG', 'Echo']
df_notes_ts = df_notes_ts.loc[~df_notes_ts.CATEGORY.isin(date_only_cats)].copy()
time_since_intime = df_notes_ts.CHARTTIME - df_notes_ts.INTIME
df_notes_ts['hours_in'] = (time_since_intime / pd.Timedelta(hours=1)).astype(int)

In [82]:
# Put the merged ECG/Echo rows back together with the other time-series notes.
df_notes_ts = pd.concat([df_notes_ts, df_dateonly])

### Trim time series

In [83]:
# Key each note by (pat_id, hours_in) and keep only its category and text.
df_notes_ts = df_notes_ts.reset_index()
df_notes_ts = df_notes_ts.set_index(['pat_id', 'hours_in'])
df_notes_ts = df_notes_ts[['CATEGORY', 'TEXT']]

In [84]:
# Order notes by patient, then by time within the stay.
df_notes_ts = df_notes_ts.sort_values(by=['pat_id', 'hours_in'])

### One Hot Encode categories

In [85]:
# Treat 'Nursing/other' notes as ordinary 'Nursing' notes
df_notes_ts.loc[df_notes_ts.CATEGORY == 'Nursing/other', 'CATEGORY'] = 'Nursing'
# One indicator column per category, named 'cat_<category>'
df_cats = pd.get_dummies(df_notes_ts.CATEGORY, prefix='cat')
# Attach the indicators positionally. The (pat_id, hours_in) index can repeat
# (several notes of one patient in the same hour), so an index-based
# DataFrame.join would be many-to-many: it would duplicate those notes and pair
# them with the category rows of other notes that share the same key.
# df_cats has exactly the same index as df_notes_ts, so concat along the
# columns lines the rows up one-to-one.
n_notes = len(df_notes_ts)
df_notes_ts = pd.concat([df_notes_ts.drop('CATEGORY', axis=1), df_cats], axis=1)
assert len(df_notes_ts) == n_notes, "one-hot encoding must not change the number of notes"

### Preprocess text

In [86]:
# Fetch the NLTK stop word corpus used by the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [87]:
# English stop words as a set, for fast membership tests during tokenization.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [88]:
# Raw strings keep the regex escapes intact (no invalid-escape warnings).
# Bracketed placeholders of the form [** ... **]
SUBTEXT_FORMAT = r"\[\*\*[^\*]*\*\*\]"
# Integers and decimals, with optional comma-separated groups
NUMBER_FORMAT = r"[0-9]+(,[0-9]+)*(\.[0-9]+)?"
# Runs of newlines, sentence-ending punctuation and parentheses
END_OF_SEQ = r"(\n|\.|\?|!|\(|\))+"
# Punctuation to delete. `$` is escaped so it matches a literal dollar sign
# (unescaped it only matches the end of the string and removes nothing).
PUNCTUATION = r"(\\|/|-|'|:|#|\"|,|_|\+|=|%|<|>|\*|\$)"

def prepare_text(s, stop_words=None):
    """Normalize one clinical note into a space-separated token string.

    Steps: remove [** ... **] placeholders, lowercase, remove numbers, replace
    sentence boundaries (newlines, '.', '?', '!', parentheses) with the
    ' [SEP] ' marker, collapse whitespace, delete punctuation and drop stop
    words.

    Args:
        s: Raw note text.
        stop_words: Optional collection of tokens to drop. When omitted, the
            notebook-level ``stopwords`` set is used.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    if stop_words is None:
        stop_words = stopwords

    # Remove dates and subtext
    s_pre = re.sub(SUBTEXT_FORMAT, '', s)
    # Lowercase text
    s_pre = s_pre.lower()
    # Remove numbers
    s_pre = re.sub(NUMBER_FORMAT, '', s_pre)
    # Mark the end of each sequence. This works on the real characters rather
    # than on repr(s): repr() also escapes tabs, carriage returns and other
    # control characters, which left stray letters glued to neighbouring words
    # once the backslashes were stripped (e.g. 'a\tb' became 'atb').
    s_pre = re.sub(END_OF_SEQ, ' [SEP] ', s_pre)
    # Collapse every remaining run of whitespace (spaces, tabs, CRs) to one space
    s_pre = re.sub(r'\s+', ' ', s_pre)
    # Remove repetitive separations
    s_pre = re.sub(r' \[SEP\] (\[SEP\] )+', ' [SEP] ', s_pre)
    # Remove punctuation
    s_pre = re.sub(PUNCTUATION, '', s_pre)
    # Tokenize, dropping empty tokens and stop words
    tokens = [token for token in s_pre.split(' ') if token and token not in stop_words]
    # Return back to a string
    return ' '.join(tokens)

In [89]:
# Clean every static note; missing values are passed through untouched.
df_notes_static['TEXT'] = df_notes_static['TEXT'].progress_map(
    lambda note: prepare_text(note) if not pd.isnull(note) else note
)

100%|█████████████████████████████████████████████████████████████████████████████| 4312/4312 [00:06<00:00, 629.05it/s]


In [94]:
# Clean every time-series note; missing values are passed through untouched.
df_notes_ts['TEXT'] = df_notes_ts['TEXT'].progress_map(
    lambda note: prepare_text(note) if not pd.isnull(note) else note
)

100%|██████████████████████████████████████████████████████████████████████| 1039995/1039995 [08:35<00:00, 2019.28it/s]


### Vocab Class

In [95]:
from typing import List
import torch
import os
import json

class Numericize(object):
    """Growing vocabulary that maps tokens to integer ids.

    Ids are handed out in order of first appearance, starting at 1. Every
    lookup through ``tok2id`` / ``numericize`` also increments the token's
    occurrence count, and unseen tokens are added on the fly.
    """

    def __init__(self):
        self._tok2id = {}   # token -> id
        self._id2tok = {}   # id -> token
        self._tok2cnt = {}  # token -> number of times it was seen
        self._cnt = 1       # next id to hand out

    def add_token(self, token: str):
        """Count one occurrence of ``token``, registering it first if new."""
        if token in self._tok2id:
            self._tok2cnt[token] += 1
            return

        new_id = self._cnt
        self._tok2id[token] = new_id
        self._id2tok[new_id] = token
        self._tok2cnt[token] = 1
        self._cnt = new_id + 1

    def tok2id(self, token):
        """Return the id of ``token``; this also counts it (and adds it if new)."""
        self.add_token(token)
        return self._tok2id[token]

    def numericize(self, data) -> List[int]:
        """Split ``data`` on single spaces and return the id of every piece."""
        ids = []
        for piece in data.split(' '):
            ids.append(self.tok2id(piece))
        return ids

    def to_json(self, path: str):
        """Write the three mappings and the id counter to ``path`` as JSON."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(
                {
                    'tok2id': self._tok2id,
                    'id2tok': self._id2tok,
                    'tok2cnt': self._tok2cnt,
                    'cnt': self._cnt,
                },
                f,
                indent=4,
            )

    def __len__(self):
        """Return the id counter, i.e. one more than the number of known tokens."""
        return self._cnt

### Split Notes

In [96]:
# Directory with the processed cohort files, including the train/test split ids.
processed_dir = '../data/processed/'
train_idxs_path = os.path.join(processed_dir, 'train_idxs.npy')
test_idxs_path = os.path.join(processed_dir, 'test_idxs.npy')
# Sets, so they can be intersected with the ids that actually have notes
train_idxs = set(np.load(train_idxs_path))
test_idxs = set(np.load(test_idxs_path))

In [97]:
# Patient ids that have a static note / at least one time-series note.
# Level 0 of the time-series index is pat_id.
static_idxs = set(df_notes_static.index.values)
ts_idxs = set(df_notes_ts.index.get_level_values(0).values)

In [98]:
# Restrict each split to the patients that actually have notes of that kind.
train_idxs_static = list(static_idxs.intersection(train_idxs))
test_idxs_static = list(static_idxs.intersection(test_idxs))

train_idxs_ts = list(ts_idxs.intersection(train_idxs))
test_idxs_ts = list(ts_idxs.intersection(test_idxs))

In [99]:
# Select the static notes of each split by pat_id.
# NOTE(review): the static writer cells below pair rows with
# train_idxs_static / test_idxs_static by position, so the row order produced
# here matters.
df_static_train = df_notes_static.loc[train_idxs_static].copy()
df_static_test = df_notes_static.loc[test_idxs_static].copy()

In [100]:
# Select the time-series notes of each split by pat_id (first index level).
# NOTE(review): the time-series writer cells below open a new HDF5 group each
# time pat_id changes, so they presumably expect a patient's rows to be
# contiguous here - confirm.
df_ts_train = df_notes_ts.loc[train_idxs_ts].copy()
df_ts_test = df_notes_ts.loc[test_idxs_ts].copy()

### Convert to Numbers

In [101]:
# Single vocabulary shared by the static and time-series notes of both splits.
numericize = Numericize()

In [102]:
# Convert each static note to a list of token ids.
# Numericize.tok2id adds unseen tokens, so tokens that first appear in the
# test split also receive ids in the shared vocabulary.
df_static_train_num = df_static_train.TEXT.progress_map(numericize.numericize).tolist()
df_static_test_num = df_static_test.TEXT.progress_map(numericize.numericize).tolist()

100%|████████████████████████████████████████████████████████████████████████████| 3403/3403 [00:01<00:00, 2486.17it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 909/909 [00:00<00:00, 2352.21it/s]


In [103]:
# Convert each time-series note to a list of token ids, continuing with the
# same vocabulary instance (test-only tokens are added to it as well).
df_ts_train_num = df_ts_train.TEXT.progress_map(numericize.numericize).tolist()
df_ts_test_num = df_ts_test.TEXT.progress_map(numericize.numericize).tolist()

100%|████████████████████████████████████████████████████████████████████████| 839543/839543 [01:46<00:00, 7862.52it/s]
100%|████████████████████████████████████████████████████████████████████████| 200452/200452 [00:27<00:00, 7323.05it/s]


In [104]:
def stringify_cat(row):
    """Encode every value of ``row`` except 'TEXT' as a string of '1'/'0' flags.

    Truthy values become '1', falsy values '0', in the row's own order.
    """
    flags = row.drop('TEXT').tolist()
    return "".join("1" if flag else "0" for flag in flags)

# One flag string per note row (all columns except TEXT), in row order, so the
# lists line up with df_ts_train_num / df_ts_test_num.
df_ts_train_cats = df_ts_train.progress_apply(stringify_cat, axis=1).tolist()
df_ts_test_cats = df_ts_test.progress_apply(stringify_cat, axis=1).tolist()

100%|████████████████████████████████████████████████████████████████████████| 839543/839543 [02:22<00:00, 5899.71it/s]
100%|████████████████████████████████████████████████████████████████████████| 200452/200452 [00:33<00:00, 6046.51it/s]


### Save Results

In [113]:
# Output file names; the same names are used under the train/ and test/ folders.
h5_static_filename = 'notes_static.h5'
h5_ts_filename = 'notes_ts.h5'

In [114]:
# Write the static training notes: one dataset 'row_<pat_id>' per patient,
# holding that patient's token ids.
train_static_path = os.path.join(processed_dir, 'train/', h5_static_filename)
# h5py does not create missing parent directories
os.makedirs(os.path.dirname(train_static_path), exist_ok=True)

with h5py.File(train_static_path, 'w') as train_static_file:
    # Take each pat_id from the frame the token lists were computed from,
    # instead of trusting a separately built list to be in the same order.
    for pat_id, row in zip(df_static_train.index, df_static_train_num):
        train_static_file.create_dataset(f'row_{pat_id}', data=np.array(row))

In [115]:
# Write the static test notes: one dataset 'row_<pat_id>' per patient,
# holding that patient's token ids.
test_static_path = os.path.join(processed_dir, 'test/', h5_static_filename)
# h5py does not create missing parent directories
os.makedirs(os.path.dirname(test_static_path), exist_ok=True)

with h5py.File(test_static_path, 'w') as test_static_file:
    # Take each pat_id from the frame the token lists were computed from,
    # instead of trusting a separately built list to be in the same order.
    for pat_id, row in zip(df_static_test.index, df_static_test_num):
        test_static_file.create_dataset(f'row_{pat_id}', data=np.array(row))

In [116]:
# Write the time-series training notes: one group 'pat_id_<id>' per patient,
# with one dataset per note named after its position in the group, its
# hours_in value and its category flag string.
train_ts_path = os.path.join(processed_dir, 'train/', h5_ts_filename)
# h5py does not create missing parent directories
os.makedirs(os.path.dirname(train_ts_path), exist_ok=True)

last_pat_id = -1   # pat_id of the previous row (-1 before the first row)
pat_group = None   # HDF5 group of the patient currently being written
group_idx = 0      # position of the note within the current patient's group

with h5py.File(train_ts_path, 'w') as train_ts_file:
    for (pat_id, note_time), note, cat in zip(df_ts_train.index.tolist(), df_ts_train_num, df_ts_train_cats):
        # A change of pat_id starts a new patient. create_group raises if the
        # group already exists, so a patient's rows must be contiguous.
        if pat_id != last_pat_id:
            pat_group = train_ts_file.create_group(f'pat_id_{pat_id}')
            group_idx = 0

        pat_group.create_dataset(f'gidx_{group_idx}_time_{note_time}_cat_{cat}', data=np.array(note))

        last_pat_id = pat_id
        group_idx += 1

In [117]:
# Write the time-series test notes: one group 'pat_id_<id>' per patient,
# with one dataset per note named after its position in the group, its
# hours_in value and its category flag string.
test_ts_path = os.path.join(processed_dir, 'test/', h5_ts_filename)
# h5py does not create missing parent directories
os.makedirs(os.path.dirname(test_ts_path), exist_ok=True)

last_pat_id = -1   # pat_id of the previous row (-1 before the first row)
pat_group = None   # HDF5 group of the patient currently being written
group_idx = 0      # position of the note within the current patient's group

with h5py.File(test_ts_path, 'w') as test_ts_file:
    for (pat_id, note_time), note, cat in zip(df_ts_test.index.tolist(), df_ts_test_num, df_ts_test_cats):
        # A change of pat_id starts a new patient. create_group raises if the
        # group already exists, so a patient's rows must be contiguous.
        if pat_id != last_pat_id:
            pat_group = test_ts_file.create_group(f'pat_id_{pat_id}')
            group_idx = 0

        pat_group.create_dataset(f'gidx_{group_idx}_time_{note_time}_cat_{cat}', data=np.array(note))

        last_pat_id = pat_id
        group_idx += 1

In [118]:
# Persist the vocabulary (token/id mappings, counts and id counter) as JSON
# next to the processed data.
numericize_filepath = os.path.join(processed_dir, 'vocab.json')
numericize.to_json(numericize_filepath)