In [None]:
import sys
sys.path.append('../')
import datasets
import log_reg
from dataproc import extract_wvs
from dataproc import get_discharge_summaries
from dataproc import concat_and_split
from dataproc import build_vocab
from dataproc import vocab_index_descriptions
from dataproc import word_embeddings
from constants import MIMIC_4_DIR, DATA_DIR

import numpy as np
import pandas as pd

from collections import Counter, defaultdict
import csv
import math
import operator

Let's do some data processing in a much better way, with a notebook.

First, let's define some stuff.

In [None]:
# Label-set selector: 'full' keeps every label present in the dataset.
Y = 'full'
# Raw discharge notes file, located under MIMIC_4_DIR.
notes_file = f'{MIMIC_4_DIR}/discharge.csv'
# 'full' means the vocabulary is not capped at a fixed size.
vocab_size = 'full'
# Tokens appearing in fewer than this many documents are discarded.
vocab_min = 3

# Data processing

## Combine diagnosis and procedure codes and reformat them

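The codes in MIMIC-IV are given in separate files for procedures and diagnoses, and the codes are given without periods, which might lead to collisions if we naively combine them. In this notebook only the diagnosis codes file (`DIAGNOSES_ICD.csv`) is loaded, and the codes are kept in their original period-free form, so no periods are added back here.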

In [None]:
# Read the codes as strings: ICD codes are identifiers, not numbers. Without an
# explicit dtype pandas infers types chunk by chunk, so a chunk holding only
# digit codes would be parsed as integers and lose leading zeros (e.g. 07070).
dfcodes = pd.read_csv('%s/DIAGNOSES_ICD.csv' % MIMIC_4_DIR, dtype={'icd_code': str})

In [7]:
# Turn the current row index into a regular 'index' column; it is written out
# as ROW_ID when the table is saved.
dfcodes = dfcodes.reset_index(drop=False)

In [8]:

# Preview the first five rows (head() defaults to 5).
print(dfcodes.head())

   index  subject_id   hadm_id  seq_num icd_code  icd_version
0      0    10000032  22595853        1     5723            9
1      1    10000032  22595853        2    78959            9
2      2    10000032  22595853        3     5715            9
3      3    10000032  22595853        4    07070            9
4      4    10000032  22595853        5      496            9


In [None]:
# Save the code table under the upper-case column names used by the later cells.
# Only these five columns are written; any other column (e.g. icd_version) is dropped.
(
    dfcodes[['index', 'subject_id', 'hadm_id', 'seq_num', 'icd_code']]
    .rename(columns={
        'index': 'ROW_ID',
        'subject_id': 'SUBJECT_ID',
        'hadm_id': 'HADM_ID',
        'seq_num': 'SEQ_NUM',
        'icd_code': 'ICD10_CODE',
    })
    .to_csv(f'{MIMIC_4_DIR}/ALL_CODES.csv', index=False)
)

## How many codes are there?

In [None]:
# Number of distinct codes across the full dataset (not just discharge summaries).
# dropna=False keeps a missing value counted as its own entry, like len(unique()).
df = pd.read_csv(f'{MIMIC_4_DIR}/ALL_CODES.csv', dtype={"ICD10_CODE": str})
df['ICD10_CODE'].nunique(dropna=False)

28562

In [55]:
# Reload the module so that edits to its source are picked up without
# restarting the kernel.
import importlib
import dataproc.get_discharge_summaries
importlib.reload(dataproc.get_discharge_summaries)

# Bind the function from the freshly reloaded module.
from dataproc.get_discharge_summaries import write_discharge_summaries


## Tokenize and preprocess raw text

Preprocessing time!

This will:
- Select only discharge summaries and their addenda
- remove punctuation and numeric-only tokens (e.g. removing `500` but keeping `250mg`)
- lowercase all tokens

In [None]:
# Reads all notes, selects only the discharge summaries and tokenizes them;
# the call returns the name of the file it wrote.
disch_full_file = get_discharge_summaries.write_discharge_summaries(
    out_file=f"{MIMIC_4_DIR}/disch_full.csv"
)

processing notes file
writing to D:/Nehoray/AB University/NLP/Project/CAML env 2/caml-mimic/mimicdata/mimic3/disch_full.csv


331793it [04:44, 1167.58it/s]


Let's read this in and see what kind of data we're working with

In [None]:
# Load the tokenized discharge summaries written by the previous step.
df = pd.read_csv(f'{MIMIC_4_DIR}/disch_full.csv', encoding='ISO-8859-1')

In [5]:
# How many distinct admissions are in the notes?
df['HADM_ID'].nunique(dropna=False)

331793

In [6]:
# Tokens (running count) and types (distinct tokens) over the note text,
# which is the fourth column of the notes frame.
types = set()
num_tok = 0
for text in df.iloc[:, 3]:
    tokens = text.split()
    types.update(tokens)
    num_tok += len(tokens)

In [7]:
# Report vocabulary size and total token count.
print(f"Num types {len(types)}")
print(f"Num tokens {num_tok}")

Num types 373999
Num tokens 509366740


In [8]:
# Order the notes by patient, then admission, so they line up with the label
# file, which is sorted the same way.
df = df.sort_values(by=['SUBJECT_ID', 'HADM_ID'])

In [None]:
# Load the label file and put it in the same (SUBJECT_ID, HADM_ID) order as the notes.
dfl = (
    pd.read_csv(f'{MIMIC_4_DIR}/ALL_CODES.csv')
    .sort_values(by=['SUBJECT_ID', 'HADM_ID'])
)

In [10]:
# Distinct admissions in the notes vs. in the label file.
(df['HADM_ID'].nunique(dropna=False), dfl['HADM_ID'].nunique(dropna=False))

(331793, 545497)

## Consolidate labels with set of discharge summaries

Looks like there were some HADM_ID's that didn't have discharge summaries, so they weren't included with our notes

In [None]:
# Keep only the code rows whose admission also has a discharge summary in `df`.
hadm_ids = set(df['HADM_ID'])
# newline='' is what the csv module requires for its file objects; without it
# the writer produces "\r\r\n" row endings on Windows, which read back as
# spurious blank rows.
with open('%s/ALL_CODES.csv' % MIMIC_4_DIR, 'r', newline='') as lf, \
        open('%s/ALL_CODES_filtered.csv' % MIMIC_4_DIR, 'w', newline='') as of:
    w = csv.writer(of)
    w.writerow(['SUBJECT_ID', 'HADM_ID', 'ICD10_CODE', 'ADMITTIME', 'DISCHTIME'])
    r = csv.reader(lf)
    # Skip the input header (ROW_ID, SUBJECT_ID, HADM_ID, SEQ_NUM, ICD10_CODE).
    next(r)
    for row in r:
        if not row:
            # Tolerate stray blank lines in the input.
            continue
        hadm_id = int(row[2])
        if hadm_id in hadm_ids:
            # SUBJECT_ID, HADM_ID, the code (last column), and two empty
            # placeholders for ADMITTIME / DISCHTIME.
            w.writerow(row[1:3] + [row[-1], '', ''])

Now, filter out the discharge notes that don't have codes.

In [None]:
# Read the codes as strings, as is done when ALL_CODES.csv is loaded for the
# code count. This frame is later sorted and written back to the same file, so
# numeric inference here could rewrite codes with their leading zeros stripped.
dfl = pd.read_csv('%s/ALL_CODES_filtered.csv' % MIMIC_4_DIR, index_col=None,
                  dtype={"ICD10_CODE": str})

In [None]:
# Keep only the notes whose admission has at least one code in `dfl`.
hadm_ids = set(dfl['HADM_ID'])
# newline='' is what the csv module requires (it avoids "\r\r\n" row endings and
# blank rows on Windows). ISO-8859-1 matches the encoding pandas uses for these
# files elsewhere in this notebook; it maps every byte to one character, so the
# text is copied through unchanged whatever the platform default encoding is.
with open('%s/disch_full.csv' % MIMIC_4_DIR, 'r',
          newline='', encoding='ISO-8859-1') as lf, \
        open('%s/disch_full_filtered.csv' % MIMIC_4_DIR, 'w',
             newline='', encoding='ISO-8859-1') as of:
    w = csv.writer(of)
    w.writerow(['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'TEXT'])
    r = csv.reader(lf)
    # Skip the input header; a fixed header was written above.
    next(r)
    for row in r:
        if not row:
            # Tolerate stray blank lines in the input.
            continue
        hadm_id = int(row[1])
        if hadm_id in hadm_ids:
            w.writerow(row)

In [None]:
# Load the notes before and after filtering so their sizes can be compared.
abc_df = pd.read_csv(f'{MIMIC_4_DIR}/disch_full.csv')
abc_df_filtered = pd.read_csv(f'{MIMIC_4_DIR}/disch_full_filtered.csv')


In [40]:
# Distinct admissions before and after dropping notes without codes.
print(abc_df['HADM_ID'].nunique(dropna=False))
print(abc_df_filtered['HADM_ID'].nunique(dropna=False))

331793
331604


In [28]:
# Spot-check: is this one admission id present in the currently bound
# `hadm_ids` set? NOTE(review): which set that is depends on the last cell
# that assigned `hadm_ids`.
print(29947355 in hadm_ids)

True


In [13]:
# Distinct admissions remaining in the filtered label table.
dfl['HADM_ID'].nunique(dropna=False)

331604

In [None]:
# The filtered labels still have to be ordered by patient and admission
# before being written back to the same file.
dfl = dfl.sort_values(by=['SUBJECT_ID', 'HADM_ID'])
dfl.to_csv(f'{MIMIC_4_DIR}/ALL_CODES_filtered.csv', index=False)

In [None]:
# Load the filtered notes (those with at least one code).
dfdl = pd.read_csv(f'{MIMIC_4_DIR}/disch_full_filtered.csv', encoding='ISO-8859-1')

In [None]:
# Put the filtered notes in (SUBJECT_ID, HADM_ID) order and write them back in place.
dfdl = dfdl.sort_values(by=['SUBJECT_ID', 'HADM_ID'])
dfdl.to_csv(f'{MIMIC_4_DIR}/disch_full_filtered.csv', index=False)

## Append labels to notes in a single file

In [None]:
#Now let's append each instance with all of its codes
#this is pretty non-trivial so let's use this script I wrote, which requires the notes to be written to file
sorted_file = '%s/disch_full_filtered.csv' % MIMIC_4_DIR
# Write the filtered, sorted notes (`dfdl`). Writing `df` here would replace
# the filtered file with the full, unfiltered set of discharge summaries.
dfdl.to_csv(sorted_file, index=False)

In [44]:
# Reload the module so that edits to its source are picked up without
# restarting the kernel.
import importlib
import dataproc.concat_and_split
importlib.reload(dataproc.concat_and_split)

<module 'dataproc.concat_and_split' from 'd:\\Nehoray\\AB University\\NLP\\Project\\CAML env 2\\caml-mimic\\notebooks\\..\\dataproc\\concat_and_split.py'>

In [None]:
# Join each note with its codes; the call returns the name of the file it wrote.
labeled = concat_and_split.concat_data(
    f'{MIMIC_4_DIR}/ALL_CODES_filtered.csv',
    sorted_file,
)

CONCATENATING
0 done
10000 done
20000 done
30000 done
40000 done
50000 done
60000 done
70000 done
80000 done
90000 done
100000 done
110000 done
120000 done
130000 done
140000 done
150000 done
160000 done
170000 done
180000 done
190000 done
200000 done
210000 done
220000 done
230000 done
240000 done
250000 done
260000 done
270000 done
280000 done
290000 done
300000 done
310000 done
320000 done
330000 done


In [46]:
# Name of the labeled-notes file returned by concat_data above.
print(labeled)

D:/Nehoray/AB University/NLP/Project/CAML env 2/caml-mimic/mimicdata/mimic3/notes_labeled.csv


Let's sanity check the combined data we just made. Do we have all hadm id's accounted for, and the same vocab stats?

In [47]:
# Recompute token/type statistics on the labeled notes; the text is the third column.
dfnl = pd.read_csv(labeled)
types = set()
num_tok = 0
for text in dfnl.iloc[:, 2]:
    tokens = text.split()
    types.update(tokens)
    num_tok += len(tokens)

In [48]:
# Report vocabulary size and total token count for the labeled notes.
print(f"num types {len(types)} num tokens {num_tok}")

num types 373918 num tokens 509132957


In [49]:
# Distinct admissions in the labeled notes.
dfnl['HADM_ID'].nunique(dropna=False)

331604

In [None]:
# Peek at the first rows of the labeled notes file.
abc_df = pd.read_csv(f'{MIMIC_4_DIR}/notes_labeled.csv')
print(abc_df.head(5))

   SUBJECT_ID   HADM_ID                                               TEXT  \
0    10000032  22595853  name ___ unit no ___ admission date ___ discha...   
1    10000032  22841357  name ___ unit no ___ admission date ___ discha...   
2    10000032  25742920  name ___ unit no ___ admission date ___ discha...   
3    10000032  29079034  name ___ unit no ___ admission date ___ discha...   
4    10000084  23052089  name ___ unit no ___ admission date ___ discha...   

                                              LABELS  
0        5723;78959;5715;07070;496;29680;30981;V1582  
1            07071;78959;2875;2761;496;5715;V08;3051  
2  07054;78959;V462;5715;2767;2761;496;V08;3051;7...  
3  45829;07044;7994;2761;78959;2767;3051;V08;V498...  
4                   G3183;F0280;R441;R296;E785;Z8546  


## Create train/dev/test splits

In [9]:
# Reload the module so that edits to its source are picked up without
# restarting the kernel.
import importlib
import dataproc.concat_and_split
importlib.reload(dataproc.concat_and_split)

<module 'dataproc.concat_and_split' from 'd:\\Nehoray\\AB University\\NLP\\Project\\CAML env 2\\caml-mimic\\notebooks\\..\\dataproc\\concat_and_split.py'>

In [None]:
# Input: the labeled notes. `base_name` is the prefix passed for the output files.
fname = f'{MIMIC_4_DIR}/notes_labeled.csv'
base_name = f"{MIMIC_4_DIR}/disch"
# split_data returns three values, bound here as train / dev / test.
tr, dv, te = concat_and_split.split_data(fname, base_name=base_name)

SPLITTING
0 read
10000 read
20000 read
30000 read
40000 read
50000 read
60000 read
70000 read
80000 read
90000 read
100000 read
110000 read
120000 read
130000 read
140000 read
150000 read
160000 read
170000 read
180000 read
190000 read
200000 read
210000 read
220000 read
230000 read
240000 read
250000 read
260000 read
270000 read
280000 read
290000 read
300000 read
310000 read
320000 read
330000 read


## Build vocabulary from training data

In [None]:
# Minimum count a term needs to be kept in the vocabulary.
vocab_min = 3
# Output path for the vocabulary built from the training split `tr`.
vname = f'{MIMIC_4_DIR}/vocab.csv'
build_vocab.build_vocab(vocab_min, tr, vname)

reading in data...
removing rare terms
70760 terms qualify out of 181537 total
writing output


## Sort each data split by length for batching

In [None]:
# For every split: add a whitespace-token count per note, sort ascending by
# that count, and save the result as <split>_full.csv.
for splt in ('train', 'dev', 'test'):
    filename = f'{MIMIC_4_DIR}/disch_{splt}_split.csv'
    df = pd.read_csv(filename)
    # str() coercion means a missing TEXT is counted as the single token 'nan'.
    df['length'] = df['TEXT'].astype(str).str.split().str.len()
    df = df.sort_values(by=['length'])
    df.to_csv(f'{MIMIC_4_DIR}/{splt}_full.csv', index=False)

## Pre-train word embeddings

Let's train word embeddings on all words

In [16]:
# Reload the module so that edits to its source are picked up without
# restarting the kernel.
import importlib
import dataproc.word_embeddings
importlib.reload(dataproc.word_embeddings)

<module 'dataproc.word_embeddings' from 'd:\\Nehoray\\AB University\\NLP\\Project\\CAML env 2\\caml-mimic\\notebooks\\..\\dataproc\\word_embeddings.py'>

In [None]:
# Train word embeddings on the full set of discharge summaries.
# NOTE(review): the meaning of the positional values 100, 0, 5 is defined by
# word_embeddings.word_embeddings - presumably embedding size, minimum count
# and number of iterations; confirm against that module.
w2v_file = word_embeddings.word_embeddings(
    'full',
    f'{MIMIC_4_DIR}/disch_full.csv',
    100,
    0,
    5,
)

building word2vec vocab on D:/Nehoray/AB University/NLP/Project/CAML env 2/caml-mimic/mimicdata/mimic3/disch_full.csv...
training...
writing embeddings to D:/Nehoray/AB University/NLP/Project/CAML env 2/caml-mimic/mimicdata/mimic3/processed_full.w2v


## Write pre-trained word embeddings with new vocab

In [21]:
# Reload the module so that edits to its source are picked up without
# restarting the kernel.
import importlib
import dataproc.extract_wvs
importlib.reload(dataproc.extract_wvs)

<module 'dataproc.extract_wvs' from 'd:\\Nehoray\\AB University\\NLP\\Project\\CAML env 2\\caml-mimic\\notebooks\\..\\dataproc\\extract_wvs.py'>

In [None]:
# Export the trained vectors for the words in vocab.csv; Y is passed through as given.
extract_wvs.gensim_to_embeddings(
    f'{MIMIC_4_DIR}/processed_full.w2v',
    f'{MIMIC_4_DIR}/vocab.csv',
    Y,
)

100%|██████████| 70760/70760 [00:00<00:00, 152062.84it/s]


# Pre-process ICD-10 codes description file

In [24]:
def transform_icd_format(input_file, output_file):
    """Convert a fixed-width code/description file to tab-separated lines.

    Each non-blank line of ``input_file`` produces one output line of the form
    ``<code>\\t<description>``, where the code is the stripped slice ``[6:14]``
    of the line and the description is the stripped slice ``[15:70]``
    (0-based, end-exclusive). Blank and whitespace-only lines are skipped.

    Args:
        input_file: Path of the fixed-width text file to read.
        output_file: Path of the tab-separated file to write (overwritten).
    """
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        outfile.writelines(
            f"{line[6:14].strip()}\t{line[15:70].strip()}\n"
            for line in infile
            if line.strip()
        )

# Usage: convert the raw description file under DATA_DIR to tab-separated form.
input_file = f'{DATA_DIR}/ICD10_raw_desc'
output_file = f'{DATA_DIR}/ICD10_descriptions'
transform_icd_format(input_file, output_file)

## Pre-process code descriptions using the vocab

In [None]:
# Index the code descriptions with the vocabulary; the second path is the output file.
vocab_index_descriptions.vocab_index_descriptions(
    f'{MIMIC_4_DIR}/vocab.csv',
    f'{MIMIC_4_DIR}/description_vectors.vocab',
)

100%|██████████| 97586/97586 [00:00<00:00, 124967.29it/s]


## Filter each split to the top 50 diagnosis/procedure codes

In [26]:
# From here on Y is the number of most frequent codes to keep. This rebinds the
# Y = 'full' set near the top, so cells above must not be re-run after this one
# without resetting Y.
Y = 50

In [None]:
# First count how often each code occurs; labels are the fourth column, stored
# as one ';'-separated string per note.
counts = Counter()
dfnl = pd.read_csv(f'{MIMIC_4_DIR}/notes_labeled.csv')
for labels in dfnl.iloc[:, 3]:
    counts.update(str(labels).split(';'))

In [28]:
# (code, count) pairs ordered from most to least frequent.
codes_50 = sorted(counts.items(), key=lambda item: item[1], reverse=True)

In [29]:
# Keep just the code from each of the first Y (code, count) pairs.
# Run this cell once per run of the sorting cell above: it rebinds codes_50.
codes_50 = list(map(operator.itemgetter(0), codes_50[:Y]))

In [30]:
# Display the selected codes, most frequent first.
codes_50

['4019',
 '2724',
 'E785',
 'I10',
 '53081',
 'Z87891',
 '25000',
 '4280',
 '42731',
 'K219',
 '41401',
 'V1582',
 '311',
 '5849',
 '2449',
 'F329',
 'I2510',
 '2859',
 '3051',
 '40390',
 'N179',
 'V5861',
 'F419',
 '30000',
 '5990',
 'V5867',
 '5859',
 '49390',
 '2720',
 'Z7901',
 'Z794',
 '32723',
 'E039',
 'E119',
 'V4582',
 '412',
 'G4733',
 'D649',
 '496',
 'V5866',
 'E669',
 '27800',
 'I4891',
 '2761',
 'F17210',
 'Y929',
 'V4581',
 '73300',
 'Z66',
 'J45909']

In [None]:
# Write the selected codes, one per row. newline='' is what the csv module
# requires; without it the file gets "\r\r\n" row endings on Windows, which
# read back as blank rows.
with open('%s/TOP_%s_CODES.csv' % (MIMIC_4_DIR, str(Y)), 'w', newline='') as of:
    w = csv.writer(of)
    w.writerows([code] for code in codes_50)

In [None]:
# For each split, keep the notes whose admission is listed in that split's id
# file and that carry at least one of the selected codes; only those codes are
# written as labels.
top_codes = set(codes_50)  # built once instead of once per row
for splt in ['train', 'dev', 'test']:
    print(splt)
    # Ids are compared as strings, matching what csv.reader yields.
    # The id filename uses Y so that it always matches the output filename.
    with open('%s/%s_%s_hadm_ids.csv' % (MIMIC_4_DIR, splt, str(Y)), 'r') as f:
        hadm_ids = {line.rstrip() for line in f}
    # newline='' is what the csv module requires for reader and writer files.
    with open('%s/notes_labeled.csv' % MIMIC_4_DIR, 'r', newline='') as f, \
            open('%s/%s_%s.csv' % (MIMIC_4_DIR, splt, str(Y)), 'w', newline='') as of:
        r = csv.reader(f)
        w = csv.writer(of)
        #header
        w.writerow(next(r))
        for row in r:
            if not row:
                # Blank rows can appear if the input was written without newline=''.
                continue
            hadm_id = row[1]
            if hadm_id not in hadm_ids:
                continue
            # De-duplicate while keeping the codes in their original order, so
            # the output is the same on every run (set iteration order of
            # strings varies between interpreter sessions).
            filtered_codes = list(dict.fromkeys(
                code for code in str(row[3]).split(';') if code in top_codes
            ))
            if filtered_codes:
                w.writerow(row[:3] + [';'.join(filtered_codes)])

train
dev
test


In [None]:
# For every top-Y split: add a whitespace-token count per note, sort ascending
# by that count, and overwrite the same file.
for splt in ('train', 'dev', 'test'):
    filename = f'{MIMIC_4_DIR}/{splt}_{Y}.csv'
    df = pd.read_csv(filename)
    # str() coercion means a missing TEXT is counted as the single token 'nan'.
    df['length'] = df['TEXT'].astype(str).str.split().str.len()
    df = df.sort_values(by=['length'])
    df.to_csv(f'{MIMIC_4_DIR}/{splt}_{Y}.csv', index=False)