In [1]:
import torch
from Services.masking_service import make_missing_mask, make_indicating_mask
from Data.mimic.data_fetcher import TimeSeriesData, load_initialized_dataset
import dill
import Data.mimic.mimic
import numpy as np

## load data


Run only one of the following variants.
If none of them fits your needs, create a new one that produces the `data` variable in the same format.

In [2]:
# load the given data in such a way that the variable `data` can be cast into a torch tensor
# variant 1: our implementation for all ICU stays
# NOTE(review): dill.load can execute arbitrary code while unpickling -- only open dataset files
# that come from a trusted source.
# NOTE(review): the path is absolute and machine-specific; adjust it before running elsewhere.
with open(r'C:\Users\DHLD\Desktop\Imputation\data-imputation-icu\Data\mimic\dataset_48_1.p', 'rb') as f:
    data = dill.load(f)
# only the last expression of a cell is displayed, so just show the number of loaded entries
len(data)

61532

In [2]:
# Load the data so that the variable `data` can afterwards be cast into a torch tensor.
# variant 2: our implementation for only heart stays
heart_stays_file = r'C:\Users\DHLD\Desktop\Imputation\data-imputation-icu\Data\mimic\dataset_48_1_0510_heart_arr.npy'
data = np.load(heart_stays_file)
# display the array dimensions as the cell output
data.shape

(16565, 10, 48)

In [7]:
# Build the float32 torch tensor that holds our data. Target layout: D[i, j, k] denotes the
# k-th feature measured at the j-th time step of the i-th time series.
D_unnormalized = torch.tensor(data[:52485], dtype=torch.float32)
# swap axes 1 and 2, then keep only the first four entries along axis 2
D_unnormalized = D_unnormalized.transpose(1, 2)[:, :, :4]  # check if this is necessary
# number of time series (size of the first axis)
n_data = len(D_unnormalized)
D_unnormalized.shape, n_data

(torch.Size([52485, 48, 4]), 52485)

In [8]:
# Shuffle the datapoints along the first axis.
# The permutation is drawn from an explicitly seeded generator so that a fresh kernel run
# produces the same row order; everything that later slices the rows by position is then
# reproducible as well. Change SHUFFLE_SEED to obtain a different shuffle.
SHUFFLE_SEED = 0
shuffle_generator = torch.Generator().manual_seed(SHUFFLE_SEED)
random_permutation = torch.randperm(D_unnormalized.shape[0], generator=shuffle_generator)
# NOTE: this rebinds D_unnormalized, so re-running the cell applies the permutation again
D_unnormalized = D_unnormalized[random_permutation]
D_unnormalized.shape

torch.Size([52485, 48, 4])

## normalizing

In [9]:
# Feature-wise normalization: afterwards every feature has mean 0 and std 1.
# Statistics are reduced over axes 0 and 1 with nanmean, i.e. NaN entries are ignored.
stat_axes = [0, 1]
D_mean = torch.nanmean(D_unnormalized, stat_axes)
D_unnormalized_centered = D_unnormalized - D_mean
# square root of the NaN-ignoring mean squared deviation
std = torch.sqrt(torch.nanmean(D_unnormalized_centered * D_unnormalized_centered, stat_axes))
D = D_unnormalized_centered / std

In [10]:
# Sanity check: recompute the per-feature mean and std on the normalized tensor D.
# NOTE(review): this rebinds D_mean, which until here held the mean of D_unnormalized.
D_mean = torch.nanmean(D, [0, 1])
D_residual = D - D_mean
D_std = torch.sqrt(torch.nanmean(D_residual * D_residual, [0, 1]))
D_mean, D_std

(tensor([-1.3033e-04,  3.3337e-06,  5.2325e-06, -2.7595e-07]),
 tensor([1.0000, 1.0000, 1.0000, 1.0000]))

## splitting

In [11]:
# Split along the first axis: first 60 % train, next 20 % test, remaining rows validation.
train_end = int(n_data * 0.6)
test_end = int(n_data * 0.8)
# plain slices of D (no copy is requested here)
train_set = D[:train_end]
test_set = D[train_end:test_end]
validation_set = D[test_end:]
train_set.shape, test_set.shape, validation_set.shape

(torch.Size([31491, 48, 4]),
 torch.Size([10497, 48, 4]),
 torch.Size([10497, 48, 4]))

## masking

In [12]:
# fill this out

# set parameter for indicating masking: it is passed to make_indicating_mask for every split
# and stored under 'percentage_indicating' in the saved dictionary
# NOTE(review): presumably the fraction of observed values that gets held out -- confirm
# against the implementation in Services.masking_service
p = 0.1

In [13]:
def _build_masks(split, rate):
    """Create the three masks belonging to one data split.

    Returns a tuple (missing mask, indicating mask, missing mask minus indicating mask).
    """
    missing = make_missing_mask(split)
    indicating = make_indicating_mask(missing, rate)
    # the plain missing mask doesn't know about the indicated values;
    # the difference below is the missing mask that respects them
    return missing, indicating, missing - indicating


# same order of mask creation as before: train, then test, then validation
missing_mask_train, indicating_mask_train, missing_mask_tilde_train = _build_masks(train_set, p)
missing_mask_test, indicating_mask_test, missing_mask_tilde_test = _build_masks(test_set, p)
missing_mask_validation, indicating_mask_validation, missing_mask_tilde_validation = _build_masks(validation_set, p)

In [14]:
# Replace every NaN entry by zero, in place, in each of the three splits.
# NOTE: the splits were obtained by slicing D, and the fill is done in place on them.
for split_tensor in (train_set, test_set, validation_set):
    split_tensor.masked_fill_(torch.isnan(split_tensor), 0)

## saving

In [15]:
# fill this out

# free-text description of this data tuple (stored alongside the tensors when saving)
description = (
    'The data is normalized featurewise; '
    'missing mask respects indicated values; '
    'data is shuffled; '
    'first feature block- 4 features & 48 timestamps; '
    'all ICU stays, dtype= torch.float32'
)

# file name handed to torch.save
name_to_save = 'data_all_ICU_first_four_features_shuffled'

In [16]:
# Assemble everything that gets saved, one group of keys per split
# (train, test, validation), followed by the metadata entries.
# Note: the 'missing_mask_*' entries hold the *_tilde_* masks, i.e. the missing masks
# with the indicating masks subtracted.
data_dic = {}
data_dic.update({
    'train_set': train_set,
    'missing_mask_train': missing_mask_tilde_train,
    'indicating_mask_train': indicating_mask_train,
})
data_dic.update({
    'test_set': test_set,
    'missing_mask_test': missing_mask_tilde_test,
    'indicating_mask_test': indicating_mask_test,
})
data_dic.update({
    'validation_set': validation_set,
    'missing_mask_validation': missing_mask_tilde_validation,
    'indicating_mask_validation': indicating_mask_validation,
})
data_dic.update({
    'percentage_indicating': p,
    'description': description,
})
# serialize the whole dictionary to the file named by name_to_save
torch.save(data_dic, name_to_save)