# Computes means and standard variations on the train data and uses those statistics to normalize both the train and test data

In [1]:
import pandas as pd
import sys
import numpy as np

from tqdm import tqdm
# 导入上级目录
sys.path.append('..')
from mimic3models.preprocessing import Discretizer, Normalizer

In [2]:
dataset_dir='../mimic3benchmark/scripts/data/in-hospital-mortality'
train_listfile = os.path.join(dataset_dir, 'train_listfile.csv')
test_listfile = os.path.join(dataset_dir, 'test_listfile.csv')
result_dir = 'result/mortality' 
config_path = 'resources/discretizer_config.json'
output_dir = 'result'

In [3]:
partition = 'train'
timestep = 1.0
task = 'ihm'
impute_strategy = 'previous'
start_time = 'zero'
store_masks=True

In [4]:
'''
在父类里面完成"partition"_listfile.csv读取操作
'''
class Reader(object):
    def __init__(self, dataset_dir='scripts/data/in-hospital-mortality', partition='train', listfile=None):
        self.dataset = dataset_dir
        self.current_index = 0
        self.partition = partition
        if listfile is None:
            listfile_path = os.path.join(dataset_dir, self.partition+'_listfile.csv')
        else:
            listfile_path = listfile
        # 获取"partition"_listfile.csv的内容
        # 形式： *.csv, y_true
        self.data = pd.read_csv(listfile_path)
        self.data['y_true'] = self.data['y_true'].astype('int')
        self.header = self.data.columns.values.tolist()
    def get_number_of_examples(self):
        return self.data.shape[0]
    def random_shuffle(self, seed=None):
        # 如果是训练集，打乱数据
        if self.partition == 'train':
            self.data = self.data.sample(frac=1.0)
    #  reads the sample with the given index
    def read_sample(self, index):
        raise NotImplementedError()
    #  read_next reads the next sample by using a cyclic counter inside
    def read_next(self):
        to_read_index = self.current_index
        self.current_index += 1
        if self.current_index == self.get_number_of_examples():
            self.current_index = 0
        return self.read_sample(to_read_index)
    

In [5]:
""" Reader for in-hospital moratality prediction task.

:param dataset_dir:   Directory where timeseries files are stored.
:param listfile:      Path to a listfile. If this parameter is left `None` then
                        `dataset_dir/listfile.csv` will be used.
:param period_length: Length of the period (in hours) from which the prediction is done.
"""
class InHospitalMortalityReader(Reader):
    def __init__(self, dataset_dir='scripts/data/in-hospital-mortality', partition='train', period_length=48.0):
        Reader.__init__(self, dataset_dir, listfile=None)
        # self.data是父类读取的"partition"_listfile.csv的内容
        self.data_x = self.data['patient_id_i'].values.tolist()
        self.data_y = self.data['y_true'].values.tolist()
        self.period_length = period_length
        #print(self.data_x)
    # 读取self.data_x的csv文件，提取里面的17个变量信息
    def read_timeseries(self, ts_filename):
        ret = []
        timeseries = pd.read_csv(ts_filename)
        timeseries = timeseries.fillna('')
        header = timeseries.columns.values.tolist()
        assert header[0] == 'Hours'
        for index, row in timeseries.iterrows():
            variables = [row[header[i]] for i in range(len(header))]
            ret.append(np.array(variables))
        return (np.stack(ret), header)
    def read_example(self, index):
        """ 
        Read the example with given index.

            :param index: Index of the line of the listfile to read (counting starts from 0).
            :return: Directory with the following keys:
                X : np.array
                    2D array containing all events. Each row corresponds to a moment.
                    First column is the time and other columns correspond to different
                    variables.
                t : float
                    Length of the data in hours. Note, in general, it is not equal to the
                    timestamp of last event.
                y : int (0 or 1)
                    Mortality within next 24 hours.
                header : array of strings
                    Names of the columns. The ordering of the columns is always the same.
                name: Name of the sample.
        """
        if index < 0 or index >= len(self.data):
            return ValueError("Index must be from 0 (inclusive) to number of examples (exclusive).")
        name = self.data_x[index]
        name_path = os.path.join(dataset_dir, partition, name)
        y = self.data_y[index]
        t = self.period_length
        (X, header) = self.read_timeseries(name_path)
        return {
            'X': X,
            't': t,
            'y': y,
            'header': header,
            'name': name
        }



In [49]:
""" #train_reader.read_example(0)
print(train_reader.get_number_of_examples())
example = train_reader.read_example(0)['X']
d, h = discretizer.transform(example, end=48.0)
 """

" #train_reader.read_example(0)\nprint(train_reader.get_number_of_examples())\nexample = train_reader.read_example(0)['X']\nd, h = discretizer.transform(example, end=48.0)\n "

In [7]:
train_reader = InHospitalMortalityReader(dataset_dir=dataset_dir, partition='train')
n_samples = train_reader.get_number_of_examples()
# create the discretizer
discretizer = Discretizer(timestep=timestep,
                            store_masks=store_masks,
                            impute_strategy=impute_strategy,
                            start_time=start_time)
# 找离散变量所在的下标
discretizer_header = discretizer.transform(train_reader.read_example(0)["X"])[1].split(',')
continuous_channels = [i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1]
#print(continuous_channels)
normalizer = Normalizer(fields=continuous_channels)

In [9]:
for i in tqdm(range(n), desc='Iterating over train patients\' icu stays'):
    ret = train_reader.read_example(i)
    #print(ret)
    data, new_header = discretizer.transform(ret['X'], end=ret['t'])
    #print(data[0])
    normalizer._feed_data(data)

file_name = '{}_ts_{:.2f}_impute_{}_start_{}_masks_{}_n_{}_normalizer.pickle'.format(
       task, timestep, impute_strategy, start_time, store_masks, n_samples)
file_name = os.path.join(output_dir, file_name)
print('\n Saving the state in {} ...'.format(file_name))
normalizer._save_params(file_name)
print('saved!')

Iterating over train patients' icu stays: 100%|██████████| 5/5 [00:00<00:00, 21.95it/s]
 Saving the state in result\ihm_ts_1.00_impute_previous_start_zero_masks_True_n_14803_normalizer.pickle ...
saved!



In [8]:
import pickle
load_file_path = os.path.join(output_dir, 'ihm_ts_1.00_impute_previous_start_zero_masks_True_n_14803_normalizer.pickle')
with open(load_file_path, "rb") as load_file:
    dct = pickle.load(load_file, encoding='latin1')
print(len(dct))
means = dct['means']
stds = dct['stds']
print(len(means))
for i in continuous_channels :
    print(means[i])

2
76
60.28333333333333
0.27108333333333345
115.4875
89.50833333333333
170.0
77.93055408795675
98.01666666666667
21.370833333333334
117.92916666666666
36.943203373661746
77.22818382666668
7.348416666666668
