# Feature extraction
At this stage, the raw data should already have been transformed into a standard time series format. This code first converts the data into the "cesium" format and then extracts features from it.

In [9]:
#load libraries
from datetime import datetime, timedelta
from dateutil import parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
from cesium import featurize

# Path of the time-series CSV file that the rest of the notebook reads.
# NOTE: this is an absolute, machine-specific location; adjust it when the
# notebook is run on another computer.
data_file = (
    '/Users/leeo/Desktop/KI2/7.master_thesis/1.data/4.ts_format/17273_ts/'
    '17273_14-11-17.csv'
)

In [10]:
# Load the recording; the first CSV column (the timestamps) becomes the row index.
data = pd.read_csv(data_file, index_col=0, header=0)

# Parse the index labels into timestamps so that rows can be selected by time.
timestamps = pd.to_datetime(data.index)
data.index = timestamps

# Preview the first rows.
data.head()

Unnamed: 0_level_0,HR,Resp,Comments
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-11-14 08:30:53.761200,161.2035,2.046443,
2017-11-14 08:30:54.261200,149.3424,2.9313,
2017-11-14 08:30:54.761200,139.5089,2.540741,
2017-11-14 08:30:55.261200,170.194,2.610959,
2017-11-14 08:30:55.761200,162.9608,2.745577,


## Data slice : slice with windows

In [11]:
# Check the comments for labels: keep only the rows that carry an annotation.
# A single .loc selection with a notna() mask replaces the chained indexing
# and the `isnull() == False` comparison.
data.loc[data["Comments"].notna(), "Comments"]

Date_Time
2017-11-14 08:43:00.261200                  #* 600ng LPS 
2017-11-14 09:07:52.261200    #* finish giving 600ng lps 
Name: Comments, dtype: object

<img src="pictures/data_slicing_concepts.jpg" width="800" height="400">

In [12]:
# Parameters of the sliding-window slicing.
# t0 is the timestamp of the "600ng LPS" comment found in the recording.
t0 = pd.to_datetime("2017-11-14 08:43:00.261200") #starting time for sepsis
# "min" is used instead of the "T" alias, which recent pandas versions deprecate.
delta_t = pd.Timedelta("1min")      # step between two consecutive time frames
time_frame = pd.Timedelta("10min")  # length of a single time frame
time_window = pd.Timedelta("1h")    # total span covered by all time frames
# Number of time frames that fit into the window. Floor division of two
# Timedeltas yields an exact integer, avoiding a float round trip.
n = int((time_window - time_frame) // delta_t) + 1
t1 = t0 + pd.Timedelta("1.5h") #starting time for sepsis recover

print("The starting time of sepsis status is {}, we choose {} \nas time window, where each time frame rolls in {}, and therefore there is \n{} time frames in total ".format(
t0, time_window, delta_t, n))

The starting time of sepsis status is 2017-11-14 08:43:00.261200, we choose 0 days 01:00:00 
as time window, where each time frame rolls in 0 days 00:01:00, and therefore there is 
51 time frames in total 


In [13]:
# Slice the recording into n overlapping time frames. Each frame contributes
# one (2, samples) array of values (row 0 = HR, row 1 = Resp) and one array of
# matching time stamps with the same shape.
sampling_period = pd.Timedelta("0.5s")  # spacing between consecutive rows of the recording
samples_per_frame = time_frame // sampling_period  # expected number of rows per time frame

cesium_times = []
cesium_values = []
for i in range(n):
    t_0_start = t0 + delta_t*i
    # Label-based slicing includes both end points, so the frame stops one
    # sample before t_0_start + time_frame to hold exactly samples_per_frame rows.
    t_0_stop = t_0_start + time_frame - sampling_period
    # Slice once and reuse the result for the index and both signal columns.
    cesium_df = data.loc[t_0_start:t_0_stop]
    cesium_t = cesium_df.index.to_numpy()
    cesium_HR = cesium_df.HR.to_numpy()
    cesium_Resp = cesium_df.Resp.to_numpy()

    frame_values = np.array([cesium_HR, cesium_Resp])
    # Validate every frame, not only the first one: a gap anywhere in the
    # recording produces a frame with fewer rows than expected.
    if frame_values.shape != (2, samples_per_frame):
        raise ValueError(
            'Time frame {} ({} to {}) has shape {}, expected {}. The time window is '
            'discontinuous or HR/Resp samples are missing, please check the experiment data.'.format(
                i, t_0_start, t_0_stop, frame_values.shape, (2, samples_per_frame)))

    #list for labels?
    cesium_values.append(frame_values)
    # Numeric form of the time stamps (presumably nanoseconds since the epoch
    # for a datetime64[ns] index), repeated once per channel.
    frame_times = cesium_t.astype("float")
    cesium_times.append(np.array([frame_times, frame_times]))

print("Within the {} of cesium_values, there are {} objects, and each object is a {} with the shape of {}. Example: \n{}".format(
type(cesium_values), len(cesium_values), type(cesium_values[0]), cesium_values[0].shape,cesium_values[0]))

Within the <class 'list'> of cesium_values, there are 51 objects, and each object is a <class 'numpy.ndarray'> with the shape of (2, 1200). Example: 
[[162.9751    163.9367    145.1801    ... 177.4922    174.2096
  173.2778   ]
 [  0.7797833   2.247328    1.728741  ...   4.206926    4.13492
    4.254256 ]]
