# Exploring the MIMIC-III NIHSS dataset

In [None]:
import warnings

import pandas as pd
import numpy as np

In [None]:
# Absolute, machine-specific locations of the two annotation files
# (NER_Train.txt / NER_Test.txt) of the NIHSS-annotations-for-MIMIC-III
# release. Adjust these when running on another machine.
mimic_nihss_train_path = '/Users/jk1/stroke_datasets/national-institutes-of-health-stroke-scale-nihss-annotations-for-the-mimic-iii-database-1.0.0/NER_Train.txt'

mimic_nihss_test_path = '/Users/jk1/stroke_datasets/national-institutes-of-health-stroke-scale-nihss-annotations-for-the-mimic-iii-database-1.0.0/NER_Test.txt'

In [None]:
# Destination folder for derived tables; it is the same folder that holds the
# raw annotation files. Only referenced by the commented-out to_csv call
# further down in this notebook.
output_dir = '/Users/jk1/stroke_datasets/national-institutes-of-health-stroke-scale-nihss-annotations-for-the-mimic-iii-database-1.0.0'

In [None]:
# Load the raw text of each annotation file in full; parsing happens in the
# next cell.
with open(mimic_nihss_train_path, 'r') as train_file:
    mimic_nihss_train_text = train_file.read()

with open(mimic_nihss_test_path, 'r') as test_file:
    mimic_nihss_test_text = test_file.read()

In [None]:
# The annotation files are parsed by evaluating their text as a Python
# expression. Later cells iterate over the result and index each element with
# the keys 'entities', 'relations', 'token' and 'HADM_ID'.
# SECURITY NOTE(review): eval() executes arbitrary code found in the file.
# Only run this on files from a trusted source. If the files contain nothing
# but Python literals, ast.literal_eval would be a safe drop-in replacement
# -- TODO confirm against the actual file contents before switching.
mimic_nihss_train_data = eval(mimic_nihss_train_text)
mimic_nihss_test_data = eval(mimic_nihss_test_text)

In [None]:
def flatten(list):
    """Concatenate the items of every sub-iterable into one flat list.

    Only one level of nesting is removed.

    NOTE: the parameter name shadows the builtin ``list``; it is kept so
    that keyword callers (``flatten(list=...)``) keep working.
    """
    flattened = []
    for sublist in list:
        flattened.extend(sublist)
    return flattened

In [None]:
def get_measurement(parsed_subj_nlp_db, parsed_subj_nlp_df, subj_relationship_df, label):
    """Return the integer value linked to the first entity carrying ``label``.

    The lookup follows one chain:

    1. take the first row of ``parsed_subj_nlp_df`` whose ``label`` matches;
    2. take the first row of ``subj_relationship_df`` whose ``t_id1`` is that
       entity's ``t_id`` and read its ``t_id2``;
    3. take the first entity row whose ``t_id`` equals that ``t_id2`` and read
       its ``token_position1``;
    4. use that position to index ``parsed_subj_nlp_db['token']``.

    Args:
        parsed_subj_nlp_db: Subject record; ``['token']`` must be indexable by
            token position and yield strings (presumably a list of tokens --
            verify against the annotation files).
        parsed_subj_nlp_df: Entity table with columns ``t_id``, ``label`` and
            ``token_position1``.
        subj_relationship_df: Relation table with columns ``t_id1`` and
            ``t_id2``. No filtering on relation type is done here; callers
            pre-filter if needed.
        label: Entity label to look up.

    Returns:
        The token converted to ``int``, or ``np.nan`` when any link of the
        chain is missing or the token is not a decimal number.
    """
    label_rows = parsed_subj_nlp_df[parsed_subj_nlp_df.label == label]
    if label_rows.empty:
        return np.nan
    label_t_id = label_rows.t_id.values[0]

    relation_rows = subj_relationship_df[subj_relationship_df.t_id1 == label_t_id]
    if relation_rows.empty:
        return np.nan
    target_t_id = relation_rows.t_id2.values[0]

    target_rows = parsed_subj_nlp_df[parsed_subj_nlp_df.t_id == target_t_id]
    if target_rows.empty:
        return np.nan
    target_token = target_rows.token_position1.values[0]

    token_text = parsed_subj_nlp_db['token'][target_token]
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits ('²') that int() cannot parse, which would
    # raise ValueError instead of yielding NaN.
    if not token_text.isdecimal():
        return np.nan

    return int(token_text)

In [None]:
# Split inspected by the label exploration in the next cell (currently the
# test split).
mimic_nihss_selected_data =  mimic_nihss_test_data

In [None]:
# One array of distinct entity labels per subject in the selected split.
# The result is a list of arrays, not a flat list of labels.
unique_labels = [
    pd.DataFrame(
        subj['entities'],
        columns=['t_id', 'label', 'token_position1', 'token_position2'],
    ).label.unique()
    for subj in mimic_nihss_selected_data
]

In [None]:
# Entity labels extracted per subject. The first entry ('NIHSS') is treated as
# the total score by get_nihss_decomposition_df, which compares it against the
# sum of the remaining entries.
nihss_item_labels = [
    'NIHSS',
    '1a_LOC',
    '1b_LOCQuestions',
    '1c_LOCCommands',
    '2_BestGaze',
    '3_Visual',
    '4_FacialPalsy',
    '5a_LeftArm',
    '5b_RightArm',
    '6a_LeftLeg',
    '6b_RightLeg',
    '7_LimbAtaxia',
    '8_Sensory',
    '9_BestLanguage',
    '10_Dysarthria',
    '11_ExtinctionInattention',
]

In [None]:
def get_nihss_decomposition_df(mimic_nihss_selected_data, nihss_item_labels):
    """Build one row per subject with the value found for each NIHSS label.

    For every subject the entity and relation tables are built, relations are
    restricted to ``'Has_Value'``, and ``get_measurement`` is called once per
    label. A message is printed whenever the first label's value differs from
    the NaN-ignoring sum of the remaining ones.

    Args:
        mimic_nihss_selected_data: Iterable of subject records exposing the
            keys ``'entities'``, ``'relations'``, ``'HADM_ID'`` (and whatever
            ``get_measurement`` reads from the record).
        nihss_item_labels: List of labels; the first one is treated as the
            total score, the rest as its components.

    Returns:
        DataFrame with columns ``['hadm_id'] + nihss_item_labels`` and a
        default integer index, one row per subject in input order.
    """
    output_columns = ['hadm_id'] + nihss_item_labels

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end: DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every iteration.
    rows = []

    for subj in mimic_nihss_selected_data:
        subj_df = pd.DataFrame(subj['entities'], columns=['t_id', 'label', 'token_position1', 'token_position2'])
        subj_relationship_df = pd.DataFrame(subj['relations'], columns=['t_id1', 't_id2', 'r_id', 'relationship'])
        subj_relationship_df = subj_relationship_df[subj_relationship_df.relationship == 'Has_Value']

        nihss_item_measurements = [
            get_measurement(subj, subj_df, subj_relationship_df, nihss_item)
            for nihss_item in nihss_item_labels
        ]

        total_score = nihss_item_measurements[0]
        component_sum = np.nansum(nihss_item_measurements[1:])
        # NaN compares unequal to everything, so a subject without a total
        # score also triggers this message.
        if total_score != component_sum:
            print(f'WARNING: For subj {subj["HADM_ID"]}, sum of components of NIHSS should equal total', total_score, component_sum)

        rows.append([subj['HADM_ID']] + nihss_item_measurements)

    return pd.DataFrame(rows, columns=output_columns)



In [None]:
# Extract the per-subject NIHSS table for each split (train first, then test).
mimic_nihss_train_df, mimic_nihss_test_df = [
    get_nihss_decomposition_df(split_data, nihss_item_labels)
    for split_data in (mimic_nihss_train_data, mimic_nihss_test_data)
]

In [None]:
# Stack the train and test tables. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported equivalent. As with the original call, the index
# is not reset, so index labels from the two splits repeat.
overall_mimic_df = pd.concat([mimic_nihss_train_df, mimic_nihss_test_df])

In [None]:
# Notebook display of the combined train + test table.
overall_mimic_df

In [None]:
import os
# overall_mimic_df.to_csv(os.path.join(output_dir, 'mimic_nihss_database.csv'))

# Find fraction of patients having an NIHSS decomposition

In [None]:
# Machine-specific path to an Excel sheet of notes; only its `hadm_id` column
# is used in the cells below.
notes_df_path = '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/mimic_data/combined_notes_labels.xlsx'
notes_df = pd.read_excel(notes_df_path)

In [None]:
# Share of distinct hadm_id values in the notes sheet that also appear in the
# extracted NIHSS table.
notes_hadm_ids = set(notes_df.hadm_id)
len(notes_hadm_ids & set(overall_mimic_df.hadm_id)) / len(notes_hadm_ids)

In [None]:
# hadm_id values present in both the notes sheet and the NIHSS table.
set(notes_df.hadm_id).intersection(overall_mimic_df.hadm_id)