# Exploring mixehr outputs

In [None]:
import os 
import numpy as np
import pandas as pd
import pickle as pkl
import warnings
import pdb

In [None]:
def add_subj_id_to_df(df, id_list):
    """
    Prepend id_list to df as the 'subj_id' column.

    df is modified in place and also returned. id_list is taken to be aligned
    row-for-row with df (a 1:1 correspondence).
    """
    df.insert(loc=0, column='subj_id', value=id_list)
    return df

    
def rename_topic_df(df):
    """
    Relabel the columns by position as topic0, topic1, ...

    df is modified in place and also returned.
    """
    n_topics = df.shape[1]
    df.columns = [f'topic{idx}' for idx in range(n_topics)]
    return df

def load_paths(topic_path, id_list_path):
    """
    Load a headerless csv of topics and a pickled id list, and make them the same length.

    If the two lengths differ, a warning is issued and the longer of the two is
    truncated to the length of the shorter one (its trailing entries are dropped).

    Args:
        topic_path: path to a csv without a header row (and without ids); each row is a
            patient, each column a topic, each value a likelihood of belonging to the topic
        id_list_path: path to a pickle holding the list of ids, one per csv row
    Returns:
        topic_df: df of topics, one row per patient
        ids: the loaded ids, same length as topic_df
    """
    topic_df = pd.read_csv(topic_path, header=None)
    # NOTE: unpickling can execute arbitrary code; only load id lists from a trusted source
    with open(id_list_path, 'rb') as id_file:
        ids = pkl.load(id_file)

    id_len = len(ids)
    topic_len = len(topic_df)
    # explicit comparison rather than assert + bare except: an assert is stripped under -O
    # and a bare except would also hide unrelated errors
    if id_len != topic_len:
        warnings.warn(f'id length is {id_len} but topic df is length {topic_len} on path {topic_path}. ignoring last patients for now')
        if id_len > topic_len:
            ids = ids[:topic_len]
        else:
            topic_df = topic_df.iloc[:id_len, :]
    return topic_df, ids

def add_matching_hadm(hadm_subj_csv, subj_df, hadm_id_path):
    """
    Add hadm_ids to df according to hadm_id pickle

    For every subj_id in subj_df, look up the single admission of that subject whose
    HADM_ID appears in the pickled hadm_id list, and insert those ids as the first
    column ('hadm_id'). subj_df is modified in place and also returned.

    Args:
        hadm_subj_csv: path to csv containing hadm_id and corresponding subj_id
            (read via its 'HADM_ID' and 'SUBJECT_ID' columns)
        subj_df: processed df which contains subj_id, which we want to append corresponding hadm_id to
        hadm_id_path: path to pickle containing list of hadm_ids used
    Returns:
        subj_df with the new 'hadm_id' column at position 0
    Raises:
        ValueError: if a subj_id has zero or more than one matching hadm_id
    """
    # NOTE: unpickling can execute arbitrary code; only load id lists from a trusted source
    with open(hadm_id_path, 'rb') as id_file:
        hadm_ids = pkl.load(id_file)

    hadm_subj_df = pd.read_csv(hadm_subj_csv)
    # restrict to the admissions that were actually used
    used_admissions = hadm_subj_df[hadm_subj_df['HADM_ID'].isin(hadm_ids)]

    sorted_hadm_ids = []
    for subj_id in subj_df['subj_id'].values:
        # find the corresponding hadm_id
        matches = used_admissions.loc[used_admissions['SUBJECT_ID'] == subj_id, 'HADM_ID'].values
        # check the count before indexing, so a missing subject gives a clear error
        if len(matches) != 1:
            raise ValueError(
                f'expected exactly one hadm_id for subj id {subj_id} but found {len(matches)}'
            )
        sorted_hadm_ids.append(matches[0])

    subj_df.insert(0, 'hadm_id', sorted_hadm_ids)
    return subj_df

def format_df_for_pipeline(topic_path, id_list_path, hadm_id_path,
                           hadm_subj_csv='../data/mimic3/ADMISSIONS.csv'):
    """
    rename topic df with topics and hadm id

    Loads the raw topics and ids, names the columns topic0..topicN, then adds a
    'subj_id' column and a matching 'hadm_id' column.

    Args:
        topic_path: path to the headerless csv of topics
        id_list_path: path to the pickle of ids, inserted as the 'subj_id' column
        hadm_id_path: path to the pickle of hadm_ids used
        hadm_subj_csv: path to the csv mapping hadm_id to subj_id; defaults to the
            admissions table location this notebook has always used
    Returns:
        df with columns hadm_id, subj_id, topic0, topic1, ...
    """
    df, id_list = load_paths(topic_path, id_list_path)
    df = rename_topic_df(df)
    df = add_subj_id_to_df(df, id_list)
    df = add_matching_hadm(hadm_subj_csv, df, hadm_id_path)
    return df




## define paths
Each key is a run type: `labs` is MixEHR topics from lab results only, `labs_notes` is both labs and clinical notes, etc.
`K75` in a file name indicates 75 topics in total.

In [None]:
# mixehr output file stems (no directory, no '.csv') for the 'early' runs, keyed by run type
# NOTE(review): 'labs' below is identical to 'labs_notes', while the train dict uses a
# 'no_notes_no_waveforms' stem for 'labs' - confirm this is intended
test_topic_paths = {
    'labs': 'test_mixehr_early_no_waveforms_train_mixehr_early_no_waveforms_JCVB0_nmar_K75_iter500_metaphe',
    'labs_notes': 'test_mixehr_early_no_waveforms_train_mixehr_early_no_waveforms_JCVB0_nmar_K75_iter500_metaphe',
    'labs_notes_ecg': 'test_mixehr_early_with_ecg_quantiles_train_mixehr_early_with_ecg_quantiles_JCVB0_nmar_K75_iter500_metaphe',
    'ecg': 'test_mixehr_early_only_ecg_quantiles_train_mixehr_early_only_ecg_quantiles_JCVB0_nmar_K75_iter156_metaphe'
}

# validation stems are the test stems with 'test' swapped for 'vali'
val_topic_paths = {
    key: val.replace('test', 'vali') for key, val in test_topic_paths.items()
}

train_topic_paths = {
    'labs': 'train_mixehr_early_no_notes_no_waveforms_train_mixehr_early_no_notes_no_waveforms_JCVB0_nmar_K75_iter500_metaphe',
    'labs_notes': 'train_mixehr_early_no_waveforms_train_mixehr_early_no_waveforms_JCVB0_nmar_K75_iter500_metaphe',
    'labs_notes_ecg': 'train_mixehr_early_with_ecg_quantiles_train_mixehr_early_with_ecg_quantiles_JCVB0_nmar_K75_iter500_metaphe',
    #'ecg': 'train_mixehr_early_only_ecg_quantiles_train_mixehr_early_only_ecg_quantiles_JCVB0_nmar_K75_iter156_metaphe'
}

# every modality we iterate over (the train keys) must also have a test entry; a
# zip-based comparison would silently stop at the shorter dict and depend on key order
assert all(key in test_topic_paths for key in train_topic_paths), ('not all keys matched')
# explicit snapshot of the train keys, independent of any later rebinding of the dict
modality_names = list(train_topic_paths)

In [None]:
# mixehr output file stems without the 'early' tag, keyed by run type
# NOTE(review): 'labs' is identical to 'labs_notes' here, while the train dict uses a
# 'no_notes_no_waveforms' stem for 'labs' - confirm this is intended
test_topic_paths = dict(
    labs='test_mixehr_no_waveforms_train_mixehr_no_waveforms_JCVB0_nmar_K75_iter500_metaphe',
    labs_notes='test_mixehr_no_waveforms_train_mixehr_no_waveforms_JCVB0_nmar_K75_iter500_metaphe',
    labs_notes_ecg='test_mixehr_with_ecg_quantiles_train_mixehr_with_ecg_quantiles_JCVB0_nmar_K75_iter500_metaphe',
    ecg='test_mixehr_only_ecg_quantiles_train_mixehr_only_ecg_quantiles_JCVB0_nmar_K75_iter500_metaphe',
)

# validation stems: the test stems with 'test' swapped for 'vali'
val_topic_paths = {}
for run_type, test_stem in test_topic_paths.items():
    val_topic_paths[run_type] = test_stem.replace('test', 'vali')

# the train 'ecg' stem is derived from the test one by swapping 'test' for 'train'
train_topic_paths = dict(
    labs='train_mixehr_no_notes_no_waveforms_train_mixehr_no_notes_no_waveforms_JCVB0_nmar_K75_iter500_metaphe',
    labs_notes='train_mixehr_no_waveforms_train_mixehr_no_waveforms_JCVB0_nmar_K75_iter500_metaphe',
    labs_notes_ecg='train_mixehr_with_ecg_quantiles_train_mixehr_with_ecg_quantiles_JCVB0_nmar_K75_iter500_metaphe',
    ecg=test_topic_paths['ecg'].replace('test', 'train'),
)

In [None]:
def get_paths(modality, split_name):
    """
    return a dict containing the raw topic path, the save path for the processed df,
    and the paths to the id lists

    Args:
        modality: key into the module-level train/test/val topic path dicts
        split_name: one of 'train', 'vali' or 'test'
    Returns:
        dict with keys
            'save': csv path for the processed df (depends only on modality)
            'raw_topics': csv of raw topics for this modality and split
            'id_list': pickle of subj ids for this split
            'hadm_id_list': pickle of hadm ids for this split
    Raises:
        ValueError: if split_name is not one of the known splits
        KeyError: if modality has no entry for the requested split

    Creates the save directory if it does not exist yet.
    """
    # validate the split before touching the filesystem
    if split_name == 'train':
        topic_stems = train_topic_paths
    elif split_name == 'test':
        topic_stems = test_topic_paths
    elif split_name == 'vali':
        topic_stems = val_topic_paths
    else:
        raise ValueError(f'{split_name} must be one of train, vali or test')
    topic_path = topic_stems[modality] + '.csv'

    save_base = '../data/supervised_pipeline'
    data_base = '../data/mixehr'
    # os is imported at the top of the notebook, so this works regardless of cell order
    os.makedirs(save_base, exist_ok=True)
    return {
        'save': os.path.join(save_base, f'{modality}_topics.csv'),
        'raw_topics': os.path.join(data_base, topic_path),
        'id_list': os.path.join(data_base, f'{split_name}_subj_ids_list.pkl'),
        'hadm_id_list': os.path.join(data_base, f'{split_name}_hadm_ids_list.pkl'),
    }

In [None]:
from pathlib import Path

# For each modality: process every split, stack the results, tag each row with its
# split name, and write one csv per modality.
splits = ['train', 'vali', 'test']
for modality in modality_names:
    split_frames = []
    split_labels = []
    for split in splits:
        paths = get_paths(modality, split)
        df = format_df_for_pipeline(
            paths['raw_topics'],
            paths['id_list'],
            paths['hadm_id_list'],
        )
        split_frames.append(df)
        split_labels += [split] * len(df)

    all_dfs = pd.concat(split_frames)
    all_dfs.insert(0, 'split', split_labels)
    # the save path depends only on the modality, so the last split's paths are fine here
    save_path = paths['save']
    all_dfs.to_csv(save_path, index=False)
    print(f'saved {modality} topics to {save_path}')



# add label information
Now we have a df for each set. Next we want to combine it with length-of-stay (LOS) info.
This can be done with `make_matched_los_csv`, which creates the file `labels_matched.csv`.

# filtering trajectory results
A big issue right now is that we have many, many ECG signals across time and we haven't filtered them. We should only use the first 12 hours, and we should only include the HADM_IDs which have corresponding topics. Here, I will filter `trajectories_with_features.csv` for `ts_idx=1` and for hadm_ids matching the hadm_id list.

In [None]:
# input: all trajectory features; output: only the rows for the first ts_index
ecg_path = '../data/trajectories_with_features.csv'
ecg_savepath = '../data/filtered_ecg_features.csv'

# keep the rows whose ts_idx equals 1, then write them out without the index column
ecg_df = pd.read_csv(ecg_path).loc[lambda frame: frame['ts_idx'] == 1]
ecg_df.to_csv(ecg_savepath, index=False)

In [None]:
from make_matched_los_csv import main

# label table to match against, and the label columns to pull from it
# (short name -> column name; presumably columns of the admissions table - confirm
# against make_matched_los_csv)
label_csv = 'mimic3/ADMISSIONS'
label_cols = dict(
    los='length_of_stay',
    mort='HOSPITAL_EXPIRE_FLAG',
)

# NOTE(review): this passes ecg_path (the unfiltered trajectories csv), not ecg_savepath
# (the ts_idx == 1 subset written by the previous cell) - confirm which one is intended
main(ecg_path, label_csv, label_cols)