# Export a normalized section

This notebook exports a single normalized section from every discharge summary. It requires the `section-header-mapping.csv` file to be in the same directory.

Output is `{}-text.csv` (where `{}` is replaced by the name of the desired section) - a CSV with the `row_id` and the section text for the desired section. A `row_id` has empty text when no text was found for the desired section in that note.

In [1]:
import pandas as pd
import numpy as np
import os
import re
import psycopg2
from IPython.display import display, HTML

Connect to the database.

In [3]:
# database credentials and location
sqluser, sqlpass = 'postgres', 'postgres'
dbname, schema_name = 'mimic', 'mimiciii'
host = 'localhost'

# statement prepended to queries so that unqualified table names
# are looked up in `schema_name`
query_schema = 'SET search_path to ' + schema_name + ';'

# open the connection used by the queries below
con = psycopg2.connect(
    host=host,
    dbname=dbname,
    user=sqluser,
    password=sqlpass,
)

In [4]:
# load all notes whose category is 'Discharge summary' and whose
# description is 'Report' (every column of noteevents is fetched)
note_filter = """
select * from noteevents
where category = 'Discharge summary'
and description = 'Report'
"""
query = query_schema + note_filter
df = pd.read_sql_query(query, con)

# show which columns came back
df.columns

Index(['row_id', 'subject_id', 'hadm_id', 'chartdate', 'charttime',
       'storetime', 'category', 'description', 'cgid', 'iserror', 'text'],
      dtype='object')

In [5]:
# read the header mapping file (its first line is treated as the column header)
# and rename its columns to fixed names
header_table = pd.read_csv('section-header-mapping.csv', header=0)
header_table.columns = ['header', 'header_fixed']

# smap: dictionary going from 'header' value -> 'header_fixed' value
fixed_by_header = header_table.set_index('header')['header_fixed']
smap = fixed_by_header.to_dict()

# regex for finding section headers:
#   a newline, optional spaces and another newline (i.e. an empty or
#   spaces-only line), then a run of characters from the [A-z0-9 ] class
#   (group 1, the candidate header name), then either ':' or one of the
#   listed upper-case words surrounded by spaces (group 2)
# NOTE(review): the range `A-z` also covers the ASCII characters that sit
#   between 'Z' and 'a' ([ \ ] ^ _ `). If only letters are intended this
#   should be `A-Za-z` - confirm against the headers in the mapping file
#   before changing, since candidates are looked up in that mapping.
pattern = "\n[ ]*\n([A-z0-9 ]+)(:| WERE | INCLUD | IS | ARE)"

def cleanup_header_name(header_name):
    """Normalize a header name.

    Space characters are removed from both ends (other whitespace such as
    tabs or newlines is kept) and the result is converted to lower case.
    """
    trimmed = header_name.strip(' ')
    return trimmed.lower()

In [11]:
desired_section = 'admission medications'


def _extract_section(text, section_name):
    """Return the text of every `section_name` section found in one note.

    Header candidates are located with `pattern`. A candidate is treated as a
    true section header only when its cleaned-up name is a key of `smap`; the
    mapped value is the normalized section name. A section runs from the end
    of its header match to the start of the next true header, or to the end
    of the note for the last header. When several headers map to
    `section_name`, their texts are concatenated in order of appearance.

    Returns the concatenated text with leading newlines removed. The result
    is an empty string when the note has no such section, or when `text` is
    not a string (e.g. a NULL value read from the database).
    """
    if not isinstance(text, str):
        return ''

    # remove bad matches: keep (match, normalized name) for true headers only
    # (the pattern contains no ^ or $ anchors, so no re.M flag is needed)
    headers = []
    for match in re.finditer(pattern, text):
        header_name = cleanup_header_name(match.group(1))
        if header_name in smap:
            headers.append((match, smap[header_name]))

    # each section ends where the next header starts;
    # the last section runs to the end of the note
    section_ends = [match.start() for match, _ in headers[1:]]
    section_ends.append(len(text))

    pieces = [
        text[match.end():section_end]
        for (match, header_fixed), section_end in zip(headers, section_ends)
        if header_fixed == section_name
    ]
    return ''.join(pieces).lstrip('\n')


# one [row_id, section text] entry per note; the text may be empty
section_data = [
    [row_id, _extract_section(text, desired_section)]
    for row_id, text in zip(df['row_id'], df['text'])
]

# info from section headers
# (passing the column names to the constructor also works when there are no notes)
sh = pd.DataFrame(section_data, columns=['row_id', 'text'])

In [13]:
# output to file, named after the desired section
# (to_csv is called with its defaults, so pandas writes the DataFrame index
#  as an extra first column next to row_id and text)
output_filename = '{}-text.csv'.format(desired_section)
sh.to_csv(output_filename)