In [1]:
import pandas as pd
import os
import re
import numpy as np
import ujson as json
import dill
import pickle
from bottleneck import push

In [2]:
# Directory that holds the pickled extracts read by this notebook.
data_path = 'extracts/'

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# NOTE(review): '....pkl' looks like a placeholder file name -- confirm the real extract name.
with open(os.path.join(data_path, '....pkl'), mode='rb') as infile:
    origional = pickle.load(infile)

Functions used to normalize and renormalize the data

In [3]:
def renormalize(data, means, stds):
    """Undo z-score normalization: compute ``data * stds + means``.

    The statistics are applied with NumPy broadcasting, so they are matched
    against the trailing axis (or axes) of ``data``.

    Parameters
    ----------
    data : array_like
        Standardized values; the first axis is iterated as patients.
    means, stds : array_like
        Statistics that broadcast against the trailing axes of ``data``
        (for example one value per feature).

    Returns
    -------
    numpy.ndarray
        New array with the same shape as ``data`` holding the rescaled
        values. NaNs in ``data`` stay NaN. An input with zero patients
        returns an empty array; the earlier per-patient ``np.stack``
        implementation raised ``ValueError`` in that case.
    """
    data = np.asarray(data)
    # A single broadcast expression replaces the Python-level loop over patients.
    return data * np.asarray(stds) + np.asarray(means)
def normalize(data, means, stds):
    """Z-score the data: compute ``(data - means) / stds``.

    The statistics are applied with NumPy broadcasting, so they are matched
    against the trailing axis (or axes) of ``data``.

    Parameters
    ----------
    data : array_like
        Raw values; the first axis is iterated as patients.
    means, stds : array_like
        Statistics that broadcast against the trailing axes of ``data``
        (for example one value per feature).

    Returns
    -------
    numpy.ndarray
        New array with the same shape as ``data`` holding the standardized
        values. NaNs in ``data`` stay NaN. No guard is applied for zero
        entries in ``stds``; NumPy division then yields inf/NaN. An input
        with zero patients returns an empty array; the earlier per-patient
        ``np.stack`` implementation raised ``ValueError`` in that case.
    """
    data = np.asarray(data)
    # A single broadcast expression replaces the Python-level loop over patients.
    return (data - np.asarray(means)) / np.asarray(stds)


In [4]:
def create_individualized_missingness_mask(mask):
    """Build the Individualized Missingness Mask (IMM) from a binary mask.

    For every patient and feature, the observation rate is computed as the
    sum of the mask over the time axis divided by the number of time steps.
    Entries where ``mask == 0`` are replaced by that rate; all other entries
    keep their original mask value.

    Parameters
    ----------
    mask : array_like
        Observation mask indexed as (samples, time_steps, features).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``mask``. A mask with zero samples
        returns an empty array; the earlier per-patient ``np.stack``
        implementation raised ``ValueError`` in that case.

    Raises
    ------
    IndexError
        If ``mask`` has fewer than three dimensions.
    """
    # NOTE(review): this mutates NumPy's global print options as a side effect
    # of calling the function; kept so existing notebook output formatting
    # does not change.
    np.set_printoptions(suppress=False, precision= 9)

    mask = np.asarray(mask)
    if mask.ndim < 3:
        # Same exception type the original raised when reading mask.shape[2].
        raise IndexError("mask must be indexed as (samples, time_steps, features)")

    time_steps = mask.shape[1]
    # Summing over the time axis with keepdims=True gives one rate per
    # (patient, feature) that broadcasts back over every time step.
    observed_fraction = mask.sum(axis=1, keepdims=True) / time_steps
    # Stack-free equivalent of building one mask per patient into a 3-D tensor.
    return np.where(mask == 0, observed_fraction, mask)

Create Binary Mask

In [5]:
# Binary observation mask: 1 where `origional` holds a value, 0 where it is NaN.
mask = np.invert(np.isnan(origional)) * 1

Create Individualized Missingness Mask (IMM)

In [6]:
# Derive the individualized missingness mask from the binary observation mask.
IMM = create_individualized_missingness_mask(mask=mask)

Create Normalized Data

In [7]:
# Collapse the first two axes so that each column holds all values of one feature.
flatten = origional.reshape(origional.shape[0] * origional.shape[1], origional.shape[2])

# NaN-ignoring statistics, one entry per feature column.
means = [np.nanmean(feature_values) for feature_values in flatten.T]
stds = [np.nanstd(feature_values) for feature_values in flatten.T]

# NOTE: `origional` is rebound to the normalized array here, so re-running this
# cell would recompute `means`/`stds` from already-normalized data.
origional = normalize(origional, means, stds)

Last Observation Carried Forward (LOCF):
all missing values with no prior measurements are replaced with zero.

In [8]:
# Forward-fill NaNs along axis 1 using bottleneck's `push`.
LOCV =push(origional, axis=1)
# Any NaN still present after the fill is replaced with 0.
LOCV= np.where(np.isnan(LOCV), 0, LOCV)

Zero Imputation

In [10]:
# Zero imputation: every NaN in `origional` becomes 0; all other values are kept.
zero= np.where(np.isnan(origional), 0, origional)

In [11]:
# Display the dimensions of the zero-imputed array.
zero.shape

(12000, 48, 35)

In [12]:
# Persist every derived array and the normalization statistics as pickles
# inside `data_path` (the same directory the input was read from), one file
# per object. A single loop replaces seven copy-pasted open/dump stanzas.
outputs = {
    'zero_combined.pkl': zero,
    'IMM_combined.pkl': IMM,
    'mask_combined.pkl': mask,
    'LOCV_combined.pkl': LOCV,
    'origional_combined.pkl': origional,
    'stds.pkl': stds,
    'means.pkl': means,
}
for filename, obj in outputs.items():
    with open(os.path.join(data_path, filename), 'wb') as outfile:
        dill.dump(obj, outfile, pickle.HIGHEST_PROTOCOL)