# Attempt at extracting patient notes

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import os

In [None]:
%matplotlib inline
plt.style.use('ggplot')

# information used to create a database connection
sqluser = 'postgres'
sqlpassword = 'postgres'
dbname = 'mimic'
schema_name = 'mimiciii'

# Connect to postgres with a copy of the MIMIC-III database
# con = psycopg2.connect(dbname=dbname, user=sqluser)
con = psycopg2.connect(dbname=dbname, user=sqluser, password=sqlpassword, port=5000, host='localhost')

# the below statement is prepended to queries to ensure they select from the right schema
query_schema = 'set search_path to ' + schema_name + ';'

In [None]:
# Project-local helper; presumably creates the directory if it does not exist — TODO confirm
from prediction.utils.utils import ensure_dir

# Root folder under which the extracted note files are written.
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
output_dir = '/Users/jk1/temp/mimic/extraction/'

In [None]:
# Read the patient-selection SQL from disk as one string; the notes queries
# further down prepend it to their own SELECT statement.
selection_query_path = '../patient_selection_query.sql'
with open(selection_query_path) as sql_file:
    patient_selection_query = sql_file.read()

## Extract admission notes

In [None]:
# Fetch every note whose description mentions "admission" (social work
# admission notes excluded) for stays that pass all exclusion flags.
# `selection` is presumably defined by patient_selection_query — TODO confirm.
#
# icustays is joined on icustay_id rather than hadm_id: joining on hadm_id
# returns one row per ICU stay of the admission, which duplicates each note and
# can pair selection.icustay_id with the intime of a different stay.
#
# The WHERE clause filters on notes.* columns, so stays without a matching note
# are dropped even though noteevents is LEFT JOINed.
query = query_schema + patient_selection_query + """

SELECT selection.subject_id, selection.hadm_id, selection.icustay_id, admissions.admittime, notes.charttime, icustays.intime, notes.category, notes.description, notes.text

FROM selection

LEFT JOIN noteevents as notes
    on selection.hadm_id = notes.hadm_id

LEFT JOIN admissions
    on selection.hadm_id = admissions.hadm_id

LEFT JOIN icustays
    on selection.icustay_id = icustays.icustay_id


WHERE selection.exclusion_discharge_diagnosis = 0
    AND selection.exclusion_first_stay = 0
    AND selection.exclusion_age = 0
    AND selection.exclusion_los = 0
    AND selection.exclusion_non_urgent = 0
    AND selection.exclusion_admission_diagnosis = 0

    AND LOWER(notes.description) LIKE '%admission%'
    AND notes.description != 'Social Work Admission Note'
"""
admission_notes_df = pd.read_sql_query(query, con)
admission_notes_df

In [None]:
# Number of distinct hadm_id values among the admission note rows
admission_notes_df.hadm_id.nunique()

In [None]:
# Row count per note category in the admission notes result
admission_notes_df.category.value_counts()

In [None]:
# Row count per note description in the admission notes result
admission_notes_df.description.value_counts()


## Extract discharge notes

In [None]:
# Fetch all notes of category 'Discharge summary' for stays that pass all
# exclusion flags.
# `selection` is presumably defined by patient_selection_query — TODO confirm.
#
# icustays is joined on icustay_id rather than hadm_id: joining on hadm_id
# returns one row per ICU stay of the admission, which duplicates each note and
# can pair selection.icustay_id with the intime of a different stay.
#
# The WHERE clause filters on notes.category, so stays without a discharge
# summary are dropped even though noteevents is LEFT JOINed.
query = query_schema + patient_selection_query + """

SELECT selection.subject_id, selection.hadm_id, selection.icustay_id, admissions.admittime, notes.charttime, icustays.intime, notes.category, notes.description, notes.text

FROM selection

LEFT JOIN noteevents as notes
    on selection.hadm_id = notes.hadm_id

LEFT JOIN admissions
    on selection.hadm_id = admissions.hadm_id

LEFT JOIN icustays
    on selection.icustay_id = icustays.icustay_id

WHERE selection.exclusion_discharge_diagnosis = 0
    AND selection.exclusion_first_stay = 0
    AND selection.exclusion_age = 0
    AND selection.exclusion_los = 0
    AND selection.exclusion_non_urgent = 0
    AND selection.exclusion_admission_diagnosis = 0

    AND notes.category = 'Discharge summary'
"""

discharge_notes_df = pd.read_sql_query(query, con)
discharge_notes_df

In [None]:
# Preview the first rows of the discharge notes result
discharge_notes_df.head()

In [None]:
# Number of distinct hadm_id values among the discharge note rows
discharge_notes_df.hadm_id.nunique()

In [None]:
# Row count per note category in the discharge notes result
discharge_notes_df.category.value_counts()

In [None]:
# Row count per note description in the discharge notes result
discharge_notes_df.description.value_counts()

## Discharge notes containing NIHSS

Search string: NIH (matched case-insensitively anywhere in the note text)

In [None]:
# Fetch discharge summaries whose text contains "nih" (case-insensitive
# substring match) for stays that pass all exclusion flags.
# `selection` is presumably defined by patient_selection_query — TODO confirm.
#
# icustays is joined on icustay_id rather than hadm_id: joining on hadm_id
# returns one row per ICU stay of the admission, which duplicates each note and
# can pair selection.icustay_id with the intime of a different stay.
#
# The WHERE clause filters on notes.* columns, so stays without a matching note
# are dropped even though noteevents is LEFT JOINed.
query = query_schema + patient_selection_query + """

SELECT selection.subject_id, selection.hadm_id, selection.icustay_id, admissions.admittime, notes.charttime, icustays.intime, notes.category, notes.description, notes.text

FROM selection

LEFT JOIN noteevents as notes
    on selection.hadm_id = notes.hadm_id

LEFT JOIN admissions
    on selection.hadm_id = admissions.hadm_id

LEFT JOIN icustays
    on selection.icustay_id = icustays.icustay_id

WHERE selection.exclusion_discharge_diagnosis = 0
    AND selection.exclusion_first_stay = 0
    AND selection.exclusion_age = 0
    AND selection.exclusion_los = 0
    AND selection.exclusion_non_urgent = 0
    AND selection.exclusion_admission_diagnosis = 0

    AND notes.category = 'Discharge summary'
    AND LOWER(notes.text) LIKE '%nih%'
"""

nih_discharge_notes_df = pd.read_sql_query(query, con)
nih_discharge_notes_df

## Extract social work notes
-> these notes should be used to determine the mRS (modified Rankin Scale)

In [None]:
# Fetch all notes of category 'Social Work' for stays that pass all exclusion
# flags.
# `selection` is presumably defined by patient_selection_query — TODO confirm.
#
# icustays is joined on icustay_id rather than hadm_id: joining on hadm_id
# returns one row per ICU stay of the admission, which duplicates each note and
# can pair selection.icustay_id with the intime of a different stay.
#
# The WHERE clause filters on notes.category, so stays without a social work
# note are dropped even though noteevents is LEFT JOINed.
query = query_schema + patient_selection_query + """

SELECT selection.subject_id, selection.hadm_id, selection.icustay_id, admissions.admittime, notes.charttime, icustays.intime, notes.category, notes.description, notes.text

FROM selection

LEFT JOIN noteevents as notes
    on selection.hadm_id = notes.hadm_id

LEFT JOIN admissions
    on selection.hadm_id = admissions.hadm_id

LEFT JOIN icustays
    on selection.icustay_id = icustays.icustay_id

WHERE selection.exclusion_discharge_diagnosis = 0
    AND selection.exclusion_first_stay = 0
    AND selection.exclusion_age = 0
    AND selection.exclusion_los = 0
    AND selection.exclusion_non_urgent = 0
    AND selection.exclusion_admission_diagnosis = 0

    AND notes.category = 'Social Work'
"""

social_work_notes = pd.read_sql_query(query, con)
social_work_notes

## Save notes text

save discharge notes

In [None]:
# Target folder for the discharge note text files, located under output_dir
discharge_notes_df_output_dir = os.path.join(output_dir, 'discharge_notes')
# presumably creates the folder if it is missing — TODO confirm against ensure_dir
ensure_dir(discharge_notes_df_output_dir)

In [None]:
# Write each discharge note to its own text file named
# <hadm_id>_<icustay_id>_<n>.txt, where n is the lowest index not yet on disk.
# NOTE: existing files are never overwritten, so re-running this cell writes a
# second copy of every note under the next free index.
for index, row in discharge_notes_df.iterrows():
    save_file_index = 0
    while True:
        save_path = os.path.join(discharge_notes_df_output_dir, f'{row.hadm_id}_{row.icustay_id}_{save_file_index}.txt')
        if not os.path.exists(save_path):
            break
        save_file_index += 1

    # Explicit UTF-8: the platform default encoding may be unable to represent
    # every character in the note text and would raise UnicodeEncodeError.
    with open(save_path, "w", encoding="utf-8") as text_file:
        text_file.write(row.text)

In [None]:
# Target folder for the NIH-matching discharge note text files, located under output_dir
nih_discharge_notes_df_output_dir = os.path.join(output_dir, 'nih_discharge_notes')
# presumably creates the folder if it is missing — TODO confirm against ensure_dir
ensure_dir(nih_discharge_notes_df_output_dir)

In [None]:
# Write each NIH-matching discharge note to its own text file named
# <hadm_id>_<icustay_id>_<n>.txt, where n is the lowest index not yet on disk.
# NOTE: existing files are never overwritten, so re-running this cell writes a
# second copy of every note under the next free index.
for index, row in nih_discharge_notes_df.iterrows():
    save_file_index = 0
    while True:
        save_path = os.path.join(nih_discharge_notes_df_output_dir, f'{row.hadm_id}_{row.icustay_id}_{save_file_index}.txt')
        if not os.path.exists(save_path):
            break
        save_file_index += 1

    # Explicit UTF-8: the platform default encoding may be unable to represent
    # every character in the note text and would raise UnicodeEncodeError.
    with open(save_path, "w", encoding="utf-8") as text_file:
        text_file.write(row.text)

save admission notes

In [None]:
# Target folder for the admission note text files, located under output_dir
admission_notes_df_output_dir = os.path.join(output_dir, 'admission_notes')
# presumably creates the folder if it is missing — TODO confirm against ensure_dir
ensure_dir(admission_notes_df_output_dir)

In [None]:
# Write each admission note to its own text file named
# <hadm_id>_<icustay_id>_<n>.txt, where n is the lowest index not yet on disk.
# NOTE: existing files are never overwritten, so re-running this cell writes a
# second copy of every note under the next free index.
for index, row in admission_notes_df.iterrows():
    save_file_index = 0
    while True:
        save_path = os.path.join(admission_notes_df_output_dir, f'{row.hadm_id}_{row.icustay_id}_{save_file_index}.txt')
        if not os.path.exists(save_path):
            break
        save_file_index += 1

    # Explicit UTF-8: the platform default encoding may be unable to represent
    # every character in the note text and would raise UnicodeEncodeError.
    with open(save_path, "w", encoding="utf-8") as text_file:
        text_file.write(row.text)

save social work notes

In [None]:
# Target folder for the social work note text files, located under output_dir
social_notes_df_output_dir = os.path.join(output_dir, 'social_notes')
# presumably creates the folder if it is missing — TODO confirm against ensure_dir
ensure_dir(social_notes_df_output_dir)

In [None]:
# Write each social work note to its own text file named
# <hadm_id>_<icustay_id>_<n>.txt, where n is the lowest index not yet on disk.
# NOTE: existing files are never overwritten, so re-running this cell writes a
# second copy of every note under the next free index.
for index, row in social_work_notes.iterrows():
    save_file_index = 0
    while True:
        save_path = os.path.join(social_notes_df_output_dir, f'{row.hadm_id}_{row.icustay_id}_{save_file_index}.txt')
        if not os.path.exists(save_path):
            break
        save_file_index += 1

    # Explicit UTF-8: the platform default encoding may be unable to represent
    # every character in the note text and would raise UnicodeEncodeError.
    with open(save_path, "w", encoding="utf-8") as text_file:
        text_file.write(row.text)

Create Excel files for labeling the text files

In [None]:
# Names of the (initially empty) label columns added to each labeling table.
column_names = [
    # stroke admission and timing
    'admitted to ICU for stroke',
    'onset to ICU admission > 7d',
    'admission NIHSS',
    'prestroke mRS',
    'stroke onset time',
    'wake up stroke',
    'IVT time',
    'IAT time',
    # medication
    'Antihypert. drugs pre-stroke',
    'Lipid lowering drugs pre-stroke',
    'Antiplatelet drugs',
    'Anticoagulants',
    # medical history
    'MedHist Hypertension',
    'MedHist Diabetes',
    'MedHist Hyperlipidemia',
    'MedHist Smoking',
    'MedHist Atrial Fibr.',
    'MedHist CHD',
    'MedHist PAD',
    'MedHist cerebrovascular_event',
]

In [None]:
# Labeling table for admission notes: the id/time columns, reduced to the
# first row per (hadm_id, icustay_id) pair, plus one NaN-filled column per label.
admission_notes_labels_df = (
    admission_notes_df[['hadm_id', 'icustay_id', 'admittime', 'charttime', 'intime']]
    .copy()
    .drop_duplicates(subset=['hadm_id', 'icustay_id'])
)
for column in column_names:
    admission_notes_labels_df[column] = np.nan

In [None]:
# admission_notes_labels_df.to_excel(os.path.join(admission_notes_df_output_dir, 'admission_notes_labels.xlsx'))

In [None]:
# Display the still-empty labeling table for the admission notes
admission_notes_labels_df

In [None]:
# Labeling table for discharge notes: the id/time columns, reduced to the
# first row per (hadm_id, icustay_id) pair, plus one NaN-filled column per label.
discharge_notes_labels_df = (
    discharge_notes_df[['hadm_id', 'icustay_id', 'admittime', 'charttime', 'intime']]
    .copy()
    .drop_duplicates(subset=['hadm_id', 'icustay_id'])
)
for column in column_names:
    discharge_notes_labels_df[column] = np.nan

In [None]:
# discharge_notes_labels_df.to_excel(os.path.join(discharge_notes_df_output_dir, 'discharge_notes_labels.xlsx'))

In [None]:
# Display the still-empty labeling table for the discharge notes
discharge_notes_labels_df

In [None]:
# Labeling table for the NIH-matching discharge notes: the id/time columns,
# reduced to the first row per (hadm_id, icustay_id) pair, plus one NaN-filled
# column per label.
nih_discharge_notes_labels_df = (
    nih_discharge_notes_df[['hadm_id', 'icustay_id', 'admittime', 'charttime', 'intime']]
    .copy()
    .drop_duplicates(subset=['hadm_id', 'icustay_id'])
)
for column in column_names:
    nih_discharge_notes_labels_df[column] = np.nan

In [None]:
# nih_discharge_notes_labels_df.to_excel(os.path.join(nih_discharge_notes_df_output_dir, 'nih_discharge_notes_labels.xlsx'))

In [None]:
# Labeling table for social work notes: the id/time columns, reduced to the
# first row per (hadm_id, icustay_id) pair, plus one NaN-filled column per label.
social_notes_labels_df = (
    social_work_notes[['hadm_id', 'icustay_id', 'admittime', 'charttime', 'intime']]
    .copy()
    .drop_duplicates(subset=['hadm_id', 'icustay_id'])
)
for column in column_names:
    social_notes_labels_df[column] = np.nan

In [None]:
# social_notes_labels_df.to_excel(os.path.join(social_notes_df_output_dir, 'social_notes_labels.xlsx'))