# PREAMBLE

In [3]:
%load_ext autoreload
%autoreload 2
import sys
import os
import matplotlib.pyplot as plt
import pyximport
import pandas as pd
import numpy as np
import gzip
import pickle
from datetime import datetime as dt
# Add the parent directory (absolute path) and the current directory to the
# module search path.
# NOTE(review): presumably this is what lets `helpers` and `definitions`
# resolve below — confirm against the repository layout.
codebase_path = os.path.abspath('..')
sys.path.append(codebase_path)
sys.path.append('.')

# NOTE(review): star import. `process_raw` and `construct_features`, used later
# in this notebook, are not imported anywhere else in the visible cells, so
# they presumably come from `data_utils` — consider importing them by name.
from data_utils import *
# Install the pyximport import hook before the project imports that follow.
# NOTE(review): presumably some project modules are Cython (.pyx) — confirm.
pyximport.install()
from helpers.load import load_data, load_pickle  # noqa: E402
from helpers.save import save  # noqa: E402
from definitions.path import data_interim, data_processed  # noqa: E402
# NOTE(review): `raw_df_path` is not referenced by any cell visible in this
# notebook; the load cell reads 'patient_timeseries_small.p' instead.
raw_df_path = os.path.join(data_interim, 'patient_timeseries_small.csv')


# LOAD PHYSIO & OBSERVATION DATA

In [2]:
# Load the raw patient timeseries via the project helper `load_data`.
# NOTE(review): presumably reads 'patient_timeseries_small.p' from the
# `data_interim` directory — confirm against helpers.load.
df_raw = load_data('patient_timeseries_small.p', data_interim)

In [4]:
# Read the pickled observation data into `obs_df`.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('./data/obs_grps.pkl', 'rb') as pkl_file:
    obs_df = pickle.load(pkl_file)

# THE DATA PROCESSING PROCEDURE
STARTING FROM RAW PHYSIO MARKERS AND ENDING WITH MODEL TRAINING FEATURE ARRAYS

#### 1. LOAD DATA PROCESSING CLASSES

In [6]:
# Instantiate the two pipeline helpers used by every step below.
# NOTE(review): both classes presumably come from `from data_utils import *` — confirm.
processor = process_raw()  # PROCESSES RAW PHYSIO DATA
constructor = construct_features()  # ENGINEERS STATISTICAL FEATURES

#### 2. SEPARATE THE CASE AND CONTROL PATIENTS AND THE SUBSET OF EFFECTIVE PHYSIO FEATURES
INPUTS ARE THE RAW TIMESERIES PATIENT DATA AND THE OBSERVATIONS DATA.

In [None]:
# Split the raw timeseries into case and control frames.
# This cell has no execution count in the saved notebook; the next cell
# assigns `case_df` / `control_df` from CSV files instead.
# NOTE(review): presumably those CSVs are the saved result of this call — confirm.
case_df, control_df = processor.ctrl_test_dfs(df_raw, obs_df)  # MEMORY INTENSIVE

In [5]:
# Read the case and control frames from CSV, using the first column as the
# index, then print a summary of the control frame.
case_df, control_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./data/case_df.csv', './data/control_df.csv')
)
control_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11335492 entries, 1678 to 3395784
Data columns (total 13 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   measurement_datetime                  float64
 1   hours_since_birth                     float64
 2   Monitor Arteriele Bloeddruk Diastole  float64
 3   event                                 object 
 4   Monitor Arteriele Bloeddruk Mean      float64
 5   Monitor Arteriele Bloeddruk Systole   float64
 6   Monitor Hartfrequentie                float64
 7   Monitor Ademhalingsfrequentie         float64
 8   Monitor O2 Saturatie                  float64
 9   Couveuse Gemeten Temp                 float64
 10  Monitor Temperatuur 1                 float64
 11  Monitor Hartfrequentie Pulse          float64
 12  Monitor Hartfrequentie Pleth          float64
dtypes: float64(12), object(1)
memory usage: 1.2+ GB


In [129]:
# Show (rows, columns) for the case and control frames side by side.
(case_df.shape, control_df.shape)

((14057511, 13), (11335492, 13))

#### 3. EXTRACT THE HORIZON SEGMENT PER CASE AND CONTROL.
Extracts the 12hrs preceding `t_sepsis` and `t_control` per patient.

In [7]:
# Cut each cohort down to its horizon segment (per the section note: the
# 12 hours before `t_sepsis` for cases and `t_control` for controls).
case_horizon, control_horizon = processor.subset_horizon(case_df, control_df)

#### Note: All case patients' timeseries end with a positive blood culture event (aka t_sepsis). Ex: patient 4611

In [11]:
# Display the horizon rows for example patient 4611 (index label lookup).
case_horizon.loc[4611]

Unnamed: 0_level_0,measurement_datetime,hours_since_birth,Monitor Arteriele Bloeddruk Diastole,event,Monitor Arteriele Bloeddruk Mean,Monitor Arteriele Bloeddruk Systole,Monitor Hartfrequentie,Monitor Ademhalingsfrequentie,Monitor O2 Saturatie,Couveuse Gemeten Temp,Monitor Temperatuur 1,Monitor Hartfrequentie Pulse,Monitor Hartfrequentie Pleth
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
4611,3870.0,64.500000,,,,,155.0,83.0,95.400002,30.000000,37.450001,,
4611,3871.0,64.516667,,,,,157.0,89.0,94.000000,30.000000,37.450001,,
4611,3872.0,64.533333,,,,,158.0,87.0,94.300003,30.000000,37.450001,,
4611,3873.0,64.550000,,,,,158.0,85.0,93.599998,30.000000,37.459999,,
4611,3874.0,64.566667,,,,,159.0,77.0,92.900002,30.000000,37.450001,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4611,4586.0,76.433333,,,,,174.0,50.0,99.400002,29.299999,36.290001,,
4611,4587.0,76.450000,,,,,162.0,63.0,99.599998,29.400000,36.279999,,
4611,4588.0,76.466667,,,,,177.0,44.0,99.699997,29.500000,36.290001,,
4611,4589.0,76.483333,,,,,179.0,40.0,99.500000,29.400000,36.290001,,


#### 4. CONSTRUCT A STATISTICAL FEATURE VECTOR PER PATIENT.
SUMMARIZE EACH PATIENT'S RECORD BY AGGREGATING 8 STATISTICAL FEATURES THROUGH A SLIDING WINDOW OF 3HRS.<br>
Each patient has 403 features.

In [13]:
# Build the per-patient feature frame for the case cohort from its horizon
# segment, then display it.
# NOTE(review): the `profile` columns (ga, Female, Male) in the recorded output
# presumably come from `obs_df` — confirm in construct_features.feature_df.
case_features = constructor.feature_df(case_horizon, obs_df)
case_features

Unnamed: 0_level_0,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,...,Int_4_var,Int_4_var,Int_4_var,Int_4_var,Int_4_var,Int_4_var,Int_4_var,profile,profile,profile
Unnamed: 0_level_1,Couveuse Gemeten Temp,Monitor Ademhalingsfrequentie,Monitor Arteriele Bloeddruk Diastole,Monitor Arteriele Bloeddruk Mean,Monitor Arteriele Bloeddruk Systole,Monitor Hartfrequentie,Monitor Hartfrequentie Pleth,Monitor Hartfrequentie Pulse,Monitor O2 Saturatie,Monitor Temperatuur 1,...,Monitor Arteriele Bloeddruk Mean,Monitor Arteriele Bloeddruk Systole,Monitor Hartfrequentie,Monitor Hartfrequentie Pleth,Monitor Hartfrequentie Pulse,Monitor O2 Saturatie,Monitor Temperatuur 1,ga,Female,Male
patient_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
4611,33.272855,1.380111,,,,-0.957576,,,0.332519,-0.021523,...,,,,,,,,31,1,0
11621,23.052150,0.114882,,,,13.004031,,,7.799662,13.142368,...,,,,,,,,28,0,1
23590,-1.638770,-0.317602,,,,29.289697,,,1.933881,5.377787,...,,,,,,,,30,1,0
26695,22.415269,0.288914,,,,3.860096,,,9.151796,147.311758,...,,,,,,,,30,0,1
29830,7.257185,-0.730481,,,,11.076638,,,57.192883,-0.074324,...,,,,,,,,32,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3370016,-0.818942,-0.153506,3.029857,3.146477,3.109573,0.599118,0.282275,0.753929,0.744220,-1.650951,...,,,,,,,,30,0,1
3371533,3.129030,-0.453941,,,,-0.191043,-0.291992,,0.312625,2.448921,...,,,,,,,,31,0,1
3384445,,-0.160036,,,,1.721785,1.227506,,4.854467,0.461009,...,,,,,,,,32,0,1
3385344,12.618330,-0.830147,-0.089005,-0.253868,0.286699,-0.206766,-0.422934,10.804369,6.647996,-0.267273,...,,,,,,,,28,0,1


In [14]:
# Build the per-patient feature frame for the control cohort with the same
# call used for the case cohort, then display it.
control_features = constructor.feature_df(control_horizon, obs_df)
control_features

Unnamed: 0_level_0,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,...,Int_4_var,Int_4_var,Int_4_var,Int_4_var,Int_4_var,Int_4_var,Int_4_var,profile,profile,profile
Unnamed: 0_level_1,Couveuse Gemeten Temp,Monitor Ademhalingsfrequentie,Monitor Arteriele Bloeddruk Diastole,Monitor Arteriele Bloeddruk Mean,Monitor Arteriele Bloeddruk Systole,Monitor Hartfrequentie,Monitor Hartfrequentie Pleth,Monitor Hartfrequentie Pulse,Monitor O2 Saturatie,Monitor Temperatuur 1,...,Monitor Arteriele Bloeddruk Mean,Monitor Arteriele Bloeddruk Systole,Monitor Hartfrequentie,Monitor Hartfrequentie Pleth,Monitor Hartfrequentie Pulse,Monitor O2 Saturatie,Monitor Temperatuur 1,ga,Female,Male
patient_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1678,5.451859,-0.963789,0.513203,0.996446,0.481717,18.148413,,,-0.727413,-0.686097,...,,,,,,,,30,0,1
1809,6.104629,,0.036746,0.052493,0.049107,,,,4.861892,0.114428,...,,,,,,,,28,1,0
14404,11.654846,,,,,,,,,,...,,,,,,,,27,0,1
24188,0.042218,-0.437840,,,,-0.187679,,,6.787525,24.479886,...,,,,,,,,31,0,1
34870,33.857598,-1.022441,3.797997,3.320847,0.525998,1.697341,,,10.122511,3.428985,...,,,,,,,,31,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3363458,3.187841,-1.069018,-0.206113,-0.513845,-0.444289,2.157987,2.387060,3.558015,17.410620,13.906978,...,,,,,,,,28,1,0
3372236,6.572519,0.821376,0.867421,0.008326,-0.049627,2.320742,2.021299,2.347526,4.631491,1.346616,...,,,,,,,,29,0,1
3377022,4.281828,-0.899166,54.173907,57.155478,60.256593,1.881823,8.738297,0.981761,5.117051,-0.558639,...,,,,,,,,29,1,0
3393381,10.283293,-0.743255,,,,3.327318,4.197549,,37.102160,1.827201,...,,,,,,,,28,0,1


In [14]:
# List the feature columns; the recorded output shows
# (interval/statistic, signal name) tuples such as ('Int_0_kurt', 'Monitor Hartfrequentie').
control_features.columns.to_list()

[('Int_0_kurt', 'Couveuse Gemeten Temp'),
 ('Int_0_kurt', 'Monitor Ademhalingsfrequentie'),
 ('Int_0_kurt', 'Monitor Arteriele Bloeddruk Diastole'),
 ('Int_0_kurt', 'Monitor Arteriele Bloeddruk Mean'),
 ('Int_0_kurt', 'Monitor Arteriele Bloeddruk Systole'),
 ('Int_0_kurt', 'Monitor Hartfrequentie'),
 ('Int_0_kurt', 'Monitor Hartfrequentie Pleth'),
 ('Int_0_kurt', 'Monitor Hartfrequentie Pulse'),
 ('Int_0_kurt', 'Monitor O2 Saturatie'),
 ('Int_0_kurt', 'Monitor Temperatuur 1'),
 ('Int_0_max', 'Couveuse Gemeten Temp'),
 ('Int_0_max', 'Monitor Ademhalingsfrequentie'),
 ('Int_0_max', 'Monitor Arteriele Bloeddruk Diastole'),
 ('Int_0_max', 'Monitor Arteriele Bloeddruk Mean'),
 ('Int_0_max', 'Monitor Arteriele Bloeddruk Systole'),
 ('Int_0_max', 'Monitor Hartfrequentie'),
 ('Int_0_max', 'Monitor Hartfrequentie Pleth'),
 ('Int_0_max', 'Monitor Hartfrequentie Pulse'),
 ('Int_0_max', 'Monitor O2 Saturatie'),
 ('Int_0_max', 'Monitor Temperatuur 1'),
 ('Int_0_mean', 'Couveuse Gemeten Temp'),
 ('I

#### 5. CONSTRUCT THE FINAL MODELING DATA WITH LABELS

In [17]:
# Combine the case and control feature frames into one modeling frame and
# display it; the recorded output shows an additional `label` column.
# NOTE(review): presumably label 1 = case and 0 = control, with rows shuffled —
# confirm in construct_features.modeling_set.
modeling_set = constructor.modeling_set(case_features, control_features)
modeling_set

Unnamed: 0_level_0,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,Int_0_kurt,...,Int_4_var,Int_4_var,Int_4_var,Int_4_var,Int_4_var,Int_4_var,profile,profile,profile,label
Unnamed: 0_level_1,Couveuse Gemeten Temp,Monitor Ademhalingsfrequentie,Monitor Arteriele Bloeddruk Diastole,Monitor Arteriele Bloeddruk Mean,Monitor Arteriele Bloeddruk Systole,Monitor Hartfrequentie,Monitor Hartfrequentie Pleth,Monitor Hartfrequentie Pulse,Monitor O2 Saturatie,Monitor Temperatuur 1,...,Monitor Arteriele Bloeddruk Systole,Monitor Hartfrequentie,Monitor Hartfrequentie Pleth,Monitor Hartfrequentie Pulse,Monitor O2 Saturatie,Monitor Temperatuur 1,ga,Female,Male,Unnamed: 21_level_1
patient_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
3000905,13.220986,-0.695071,,,,2.560210,2.863728,,4.139940,1.401521,...,,,,,,,30,1,0,1
1374974,-1.742137,,,,,,,,,,...,,,,,,,30,1,0,0
3072442,41.191212,-0.259215,,,,0.489788,0.524500,,37.266095,5.488029,...,,,,,,,33,1,0,0
3049134,2.657251,11.620385,,,,-0.277844,-0.275088,,1.452399,-1.152018,...,,,,,,,24,0,1,1
702346,-0.837810,,,,,,,,3.915345,,...,,,,,,,27,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3109678,4.713343,-0.490114,,,,43.287897,11.343593,,53.949290,25.018673,...,,,,,,,28,0,1,1
1858350,-0.516409,-0.508673,-0.301509,-0.656022,-0.194166,-0.148049,,,-0.551620,17.207970,...,,,,,,,27,1,0,1
2924962,0.868535,-0.179619,,,,10.162363,11.704816,,7.999541,65.161975,...,,,,,,,25,1,0,1
2960446,,3.187489,7.976470,9.607011,3.283504,-0.518386,-0.469863,20.144104,-0.168670,0.721675,...,,,,,,,32,1,0,0


#### 6. ARRAYS FOR MODEL DEVELOPMENT

In [19]:
# Split the modeling frame into a feature array X and a label array y.
# The recorded output shows X with shape (792, 403) and y with shape (792,).
X, y = constructor.training_arrays(modeling_set)
X.shape, y.shape

((792, 403), (792,))