In [None]:
# Notebook-wide switch: when False, every column whose name ends with 'count'
# (the per-window observation counts) is dropped before the table is exported.
use_counts = False

import sys
sys.path.append('../scripts/')
import pandas as pd
import numpy as np
from pathlib import Path
import os
from tqdm.auto import tqdm
import re
from sklearn.model_selection import train_test_split

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

# Root directory of the MIMIC benchmark data. Set the MIMIC_BENCHMARK_DIR
# environment variable to point somewhere else without editing the notebook;
# when it is unset the original hard-coded location is used.
mimic_benchmark_dir = Path(
    os.environ.get('MIMIC_BENCHMARK_DIR', '/scratch/gobi2/haoran/shared_data/MIMIC_benchmarks/')
)

In [None]:
# reproduced from https://github.com/YerevaNN/mimic3-benchmarks/blob/master/mimic3benchmark/readers.py
class Reader(object):
    """Base class for listfile-driven dataset readers.

    The listfile is read eagerly at construction time: its first line is kept
    as the header and every remaining line becomes one example record.
    Subclasses implement `read_example`.
    """

    def __init__(self, dataset_dir, listfile=None):
        """
        :param dataset_dir: Directory containing the data files.
        :param listfile:    Path to a listfile. If left `None`,
                            `dataset_dir/listfile.csv` is used.
        :raises IndexError: If the listfile contains no lines at all.
        """
        self._dataset_dir = dataset_dir
        self._current_index = 0
        if listfile is None:
            listfile_path = os.path.join(dataset_dir, "listfile.csv")
        else:
            listfile_path = listfile
        with open(listfile_path, "r") as lfile:
            self._data = lfile.readlines()
        # First line is the column header; the rest are the examples.
        self._listfile_header = self._data[0]
        self._data = self._data[1:]

    def get_number_of_examples(self):
        """Return the number of example records read from the listfile."""
        return len(self._data)

    def random_shuffle(self, seed=None):
        """Shuffle the example records in place.

        :param seed: Optional seed for the global `random` generator; when
                     given, the resulting order is reproducible.
        """
        # Imported here because the notebook's import cell does not import
        # `random`; without this the method fails with a NameError.
        import random

        if seed is not None:
            random.seed(seed)
        random.shuffle(self._data)

    def read_example(self, index):
        """Read the example at `index`. Must be implemented by subclasses."""
        raise NotImplementedError()

    def read_next(self):
        """Read the example at the internal cursor and advance the cursor.

        The cursor wraps back to 0 after the last example has been read.
        """
        to_read_index = self._current_index
        self._current_index += 1
        if self._current_index == self.get_number_of_examples():
            self._current_index = 0
        return self.read_example(to_read_index)
    
class InHospitalMortalityReader(Reader):
    """Reader for the in-hospital mortality prediction task."""

    def __init__(self, dataset_dir, listfile=None, period_length=48.0):
        """
        :param dataset_dir:   Directory where timeseries files are stored.
        :param listfile:      Path to a listfile. If left `None`,
                              `dataset_dir/listfile.csv` is used.
        :param period_length: Length of the period (in hours) from which the
                              prediction is done.
        """
        Reader.__init__(self, dataset_dir, listfile)
        # Each listfile line is "<timeseries filename>,<label>".
        records = []
        for fields in (line.split(',') for line in self._data):
            ts_name, label = fields
            records.append((ts_name, int(label)))
        self._data = records
        self._period_length = period_length

    def _read_timeseries(self, ts_filename):
        """Load one timeseries file as (2D array of string cells, header list).

        The first header column must be "Hours".
        """
        ts_path = os.path.join(self._dataset_dir, ts_filename)
        with open(ts_path, "r") as tsfile:
            header = tsfile.readline().strip().split(',')
            assert header[0] == "Hours"
            rows = [np.array(line.strip().split(',')) for line in tsfile]
        return (np.stack(rows), header)

    def read_example(self, index):
        """Read the example with the given index.

        :param index: Index of the listfile line to read (counting from 0).
        :return: Dictionary with the following keys:
            X : np.array
                2D array containing all events. Each row corresponds to a
                moment; the first column is the time and the other columns
                correspond to different variables.
            t : float
                Length of the data in hours (the configured period length);
                in general not equal to the timestamp of the last event.
            y : int (0 or 1)
                In-hospital mortality.
            header : list of strings
                Names of the columns.
            name : Name of the sample (its timeseries filename).
        :raises ValueError: If `index` is outside the listfile range.
        """
        if index >= len(self._data) or index < 0:
            raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).")

        record = self._data[index]
        name = record[0]
        label = record[1]
        (X, header) = self._read_timeseries(name)

        result = {}
        result["X"] = X
        result["t"] = self._period_length
        result["y"] = label
        result["header"] = header
        result["name"] = name
        return result

In [None]:
# Readers for the train / test folds of the in-hospital-mortality task, plus
# the table of all ICU stays (indexed by ICUSTAY_ID) used for ethnicity lookup.
ihm_dir = mimic_benchmark_dir / 'in-hospital-mortality'
train_reader = InHospitalMortalityReader(dataset_dir=ihm_dir / 'train')
test_reader = InHospitalMortalityReader(dataset_dir=ihm_dir / 'test')

all_stays_path = os.path.join(mimic_benchmark_dir, 'root/', 'all_stays.csv')
all_stays = pd.read_csv(all_stays_path, parse_dates=['INTIME'])
all_stays = all_stays.set_index('ICUSTAY_ID')

In [None]:
# Bin edges (in hours) passed to np.digitize: four consecutive 12-hour windows.
time_bins = [0, 12, 24, 36, 48]

# Timeseries columns to keep. The strings are used verbatim to select columns
# from each file's header, so the spelling must stay exactly as written.
# Deliberately left out: 'Capillary refill rate', 'Height', and the
# eye-opening / motor-response / verbal-response coma-scale columns.
# 'Hours' is not listed here because it is added explicitly at load time.
features = [
    'Diastolic blood pressure',
    'Fraction inspired oxygen',
    'Glascow coma scale total',
    'Glucose',
    'Heart Rate',
    'Mean blood pressure',
    'Oxygen saturation',
    'Respiratory rate',
    'Systolic blood pressure',
    'Temperature',
    'Weight',
    'pH',
]

In [None]:
def map_eth(string):
    """Collapse a free-text ethnicity label into 'white', 'black' or 'other'.

    Matching is case-insensitive and only looks at the start of the string.
    """
    lowered = string.lower()
    for group in ("white", "black"):
        if lowered.startswith(group):
            return group
    return "other"

In [None]:
# For every stay in both folds, build
#   data[pid]: per-time-window mean and count of each selected feature
#   meta[pid]: label, demographics and fold of the stay
# where pid is the example name minus its last four characters
# (presumably the '.csv' suffix).
data = {}
meta = {}
fold_readers = {'train': train_reader, 'test': test_reader}
for fold, reader in fold_readers.items():
    for i in tqdm(range(reader.get_number_of_examples())):
        ex = reader.read_example(i)

        # Numeric event table restricted to the selected features and to the
        # prediction period.
        events = pd.DataFrame(ex['X'], columns=ex['header'])[['Hours'] + features]
        events = events.replace('', np.nan).astype(np.float32)
        events = events.sort_values(by='Hours', ascending=True)
        events = events[events.Hours < ex['t']]

        # Group rows by the window index np.digitize assigns to their hour.
        window = np.digitize(events['Hours'], time_bins)
        stats = events.groupby(window).agg(['mean', 'count'])
        stats.columns = [' '.join(col).strip() for col in stats.columns.values]
        stats = stats.drop(columns=['Hours mean', 'Hours count'])

        # The example name starts with "<subject id>_<episode>_"; the episode
        # file under root/<fold>/<subject id>/ holds the static stay attributes.
        name_parts = ex['name'].split('_')
        subj_id = int(name_parts[0])
        stay_path = os.path.join(mimic_benchmark_dir, 'root', fold, str(subj_id), name_parts[1] + '.csv')
        stay = pd.read_csv(stay_path).iloc[0]

        # Gender code 2 is reported as 'M', every other code as 'F'.
        extra = {
            'target': ex['y'],
            'Gender': 'M' if int(stay['Gender']) == 2 else 'F',
            'Age': float(stay['Age']),
            'fold_id': fold,
            'Race': map_eth(all_stays.loc[stay.Icustay, 'ETHNICITY']),
        }

        pid = ex['name'][:-4]
        meta[pid] = extra
        data[pid] = stats

In [None]:
# Stack the per-stay tables; the dict keys become the outer index level ('id')
# and each table's own index (the time-window number) the inner level ('t').
df = pd.concat(data).rename_axis(['id', 't'])

In [None]:
# One row per key of `meta` (orient='index'), one column per field of the
# per-stay dictionaries.
meta_df = pd.DataFrame.from_dict(meta, orient = 'index')

In [None]:
# Expand to the full (stay id x time window) grid so every stay has a row for
# each of the windows 1..len(time_bins)-1; windows without data become NaN rows
# and rows outside that grid are dropped by the left join.
stay_ids = np.unique(df.index.get_level_values(0).tolist())
window_ids = list(range(1, len(time_bins)))
iterables = [stay_ids, window_ids]
multiind = pd.MultiIndex.from_product(iterables, names=['id', 't'])
ind_df = pd.DataFrame(index=multiind)
df = ind_df.merge(df, left_index=True, right_index=True, how='left')
df = df.sort_index()

In [None]:
# A window with no rows has a missing count; record it as zero observations.
count_columns = [name for name in df.columns if name.endswith('count')]
for name in count_columns:
    df[name] = df[name].fillna(0)

In [None]:
# Carve an 'eval' split out of the training fold, then compute the imputation
# values (per time window, the mean of every '... mean' column) on the stays
# that remain in the training fold only.
df2 = df.reset_index().pivot_table(index='id', columns='t')
df2.columns = ['_'.join(map(str, reversed(col))).strip() for col in df2.columns.values]
df2 = pd.merge(df2, meta_df, left_index=True, right_index=True, how='inner')

# Hold out 25% of the training stays, stratified by race, with a fixed seed.
is_train = df2.fold_id == 'train'
_, val_ids = train_test_split(
    df2[is_train].index,
    test_size=0.25,
    random_state=42,
    stratify=df2.loc[is_train, 'Race'],
)
df2.loc[val_ids, 'fold_id'] = 'eval'

train_ids = list(df2[df2.fold_id == 'train'].index)


def _window_feature_means(window_rows):
    """Map each '... mean' column of one time window to its average."""
    return {col: window_rows[col].mean() for col in window_rows if col.endswith('mean')}


impute_vals = df.loc[train_ids].reset_index().groupby('t').apply(_window_feature_means)

In [None]:
def impute_func(series, fill_values=None):
    """Fill the missing entries of one feature column, in place.

    Each missing entry is replaced by ``fill_values[t][series.name]``, where
    ``t`` is the second element of the entry's index key.

    :param series:      Series whose index keys are tuples with the time
                        window as second element, and whose name is the
                        feature column name.
    :param fill_values: Mapping ``time window -> {column name: value}``.
                        Defaults to the notebook-level ``impute_vals``.
    :return: The same series object, with missing entries filled.
    """
    if fill_values is None:
        fill_values = impute_vals

    # Collect the keys of the missing entries up front instead of walking the
    # series with `iteritems()`, which no longer exists in pandas >= 2.0.
    missing_keys = series.index[series.isnull().to_numpy()]
    for key in missing_keys:
        series.loc[key] = fill_values[key[1]][series.name]

    return series

In [None]:
# Within each stay, carry the last observed value forward, then fill whatever
# is still missing with the training-set values via impute_func.
forward_filled = df.groupby('id').ffill()
imputed = forward_filled.groupby('id').transform(impute_func)

# One row per stay: columns become "<time window>_<feature statistic>".
wide = imputed.reset_index().pivot_table(index='id', columns='t')
wide.columns = ['_'.join(map(str, reversed(col))).strip() for col in wide.columns.values]

# Attach the metadata and re-apply the held-out 'eval' split.
df = pd.merge(wide, meta_df, left_index=True, right_index=True, how='inner')
df.loc[val_ids, 'fold_id'] = 'eval'

In [None]:
# Unless counts were requested, remove every observation-count column.
if not use_counts:
    cols_to_drop = [name for name in df.columns if name.endswith('count')]
    df = df.drop(columns=cols_to_drop)

In [None]:
# Write the final table; the stay id becomes a regular 'ID' column and the
# fresh integer index is written as the first (unnamed) CSV column.
output_path = './mimic_mortality_tabular.csv'
export_df = df.rename_axis('ID').reset_index()
export_df.to_csv(output_path)