# Feature extraction
At this stage, the raw data should already be in a standard time-series format. This code first converts the data into the "cesium" format and then extracts features from it.

In [9]:
#load libraries
from datetime import datetime, timedelta
from dateutil import parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
from cesium import featurize

#specify the data dir
# Root folder of the time-series formatted recordings. This is the only
# machine-specific part of the path: edit this one line on another machine.
DATA_DIR = '/Users/leeo/Desktop/KI2/7.master_thesis/1.data/4.ts_format'
# Recording analysed in this notebook, given relative to DATA_DIR.
data_file = DATA_DIR + '/17273_ts/17273_14-11-17.csv'

In [10]:
# Read the recording (first row = header, first column = index) and turn the
# index into timestamps so that the frame can be sliced by time further down.
data = (
    pd.read_csv(data_file, index_col=0, header=0)
    .pipe(lambda frame: frame.set_index(pd.to_datetime(frame.index)))
)
# Preview the first rows
data.head()

Unnamed: 0_level_0,HR,Resp,Comments
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-11-14 08:30:53.761200,161.2035,2.046443,
2017-11-14 08:30:54.261200,149.3424,2.9313,
2017-11-14 08:30:54.761200,139.5089,2.540741,
2017-11-14 08:30:55.261200,170.194,2.610959,
2017-11-14 08:30:55.761200,162.9608,2.745577,


## Data slicing: slice with windows

In [11]:
# Show only the rows that carry a comment; these annotations are checked
# for the labels.
data["Comments"].dropna()

Date_Time
2017-11-14 08:43:00.261200                  #* 600ng LPS 
2017-11-14 09:07:52.261200    #* finish giving 600ng lps 
Name: Comments, dtype: object

<img src="pictures/data_slicing_concepts.jpg" width="800" height="400">

In [12]:
# Parameters of the window slicing.
# t0 matches the timestamp of the "#* 600ng LPS" comment in the recording.
t0 = pd.to_datetime("2017-11-14 08:43:00.261200") #starting time for sepsis
# Step between the starts of two consecutive time frames.
# "min" is used instead of the "T" unit alias, which newer pandas deprecates.
delta_t = pd.Timedelta("1min")
# Length of a single time frame.
time_frame = pd.Timedelta("10min")
# Total span, counted from t0, that the time frames have to stay inside.
time_window = pd.Timedelta("1h")
# Number of frames of length time_frame that fit into time_window when the
# start is shifted by delta_t each time (the +1 counts the frame starting at t0).
n = int((time_window - time_frame)/delta_t +1)
t1 = t0 + pd.Timedelta("1.5h") #starting time for sepsis recover

print("The starting time of sepsis status is {}, we choose {} \nas time window, where each time frame rolls in {}, and therefore there is \n{} time frames in total ".format(
t0, time_window, delta_t, n))

The starting time of sepsis status is 2017-11-14 08:43:00.261200, we choose 0 days 01:00:00 
as time window, where each time frame rolls in 0 days 00:01:00, and therefore there is 
51 time frames in total 


In [13]:
# Build the cesium input: one entry per time frame. Each entry of
# cesium_values is a 2-row array (row 0 = HR, row 1 = Resp) and the matching
# entry of cesium_times holds the time stamps, repeated once per channel.

# Spacing between consecutive rows of `data` (0.5 s in the data preview).
sample_period = pd.Timedelta("0.5s")
# Shape that every gap-free time frame must have: 2 channels x samples per frame.
expected_shape = (2, int(time_frame // sample_period))

cesium_times = []
cesium_values = []
for i in range(n):
    # Frame i starts i steps after t0. The stop is pulled back by one sample
    # because label-based slicing includes both end points.
    t_0_start = t0 + delta_t*i
    t_0_stop = t_0_start + time_frame - sample_period
    # Slice once and reuse the result for the index and both channels.
    cesium_df = data.loc[t_0_start:t_0_stop]
    cesium_t = cesium_df.index.to_numpy()
    cesium_HR = cesium_df.HR.to_numpy()
    cesium_Resp = cesium_df.Resp.to_numpy()

    # Validate every frame, not only the first one: a gap anywhere in the
    # recording produces a frame with fewer samples than expected.
    frame_values = np.array([cesium_HR, cesium_Resp])
    if frame_values.shape != expected_shape:
        raise ValueError(
            'Time frame {} ({} to {}) has shape {} but {} was expected. '
            'The recording is discontinuous or HR/Resp samples are missing, '
            'please check the experiment data.'.format(
                i, t_0_start, t_0_stop, frame_values.shape, expected_shape))

    #list for labels?
    cesium_values.append(frame_values)
    # Time stamps are handed over as floats.
    # NOTE(review): presumably nanoseconds since the epoch, which would make
    # slope-type features "per nanosecond" - confirm the intended unit.
    frame_times = cesium_t.astype("float")
    cesium_times.append(np.array([frame_times, frame_times]))

print("Within the {} of cesium_values, there are {} objects, and each object is a {} with the shape of {}. Example: \n{}".format(
type(cesium_values), len(cesium_values), type(cesium_values[0]), cesium_values[0].shape,cesium_values[0]))

Within the <class 'list'> of cesium_values, there are 51 objects, and each object is a <class 'numpy.ndarray'> with the shape of (2, 1200). Example: 
[[162.9751    163.9367    145.1801    ... 177.4922    174.2096
  173.2778   ]
 [  0.7797833   2.247328    1.728741  ...   4.206926    4.13492
    4.254256 ]]


In [20]:
# Features to extract from every time frame
features_to_use = [
    "amplitude",
    "percent_beyond_1_std",
    "maximum",
    "max_slope",
    "median",
    "median_absolute_deviation",
    "percent_close_to_median",
    "minimum",
    "skew",
    "std",
    "weighted_average",
]

# Featurize all time frames in one call. In the resulting table each row is
# one time frame and each feature appears once per channel
# (channel 0 = HR, channel 1 = Resp, following the row order of cesium_values).
fset_cesium = featurize.featurize_time_series(
    times=cesium_times,
    values=cesium_values,
    features_to_use=features_to_use,
)
fset_cesium


feature,amplitude,amplitude,percent_beyond_1_std,percent_beyond_1_std,maximum,maximum,max_slope,max_slope,median,median,...,percent_close_to_median,percent_close_to_median,minimum,minimum,skew,skew,std,std,weighted_average,weighted_average
channel,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
0,7351.956105,6.768671,0.0025,0.325833,14721.66,8.871788,2.9197e-05,1.520731e-08,163.7907,2.507146,...,0.9975,0.381667,17.74779,-4.665553,22.587102,0.144745,548.493875,2.060524,200.878613,2.81719
1,2789.075605,6.768671,0.019167,0.348333,5595.899,8.871788,1.088364e-05,1.520731e-08,164.86805,3.40198,...,0.999167,0.449167,17.74779,-4.665553,29.460544,-0.115664,165.340725,1.97945,179.443505,3.09903
2,2789.075605,6.768671,0.015833,0.345833,5595.899,8.871788,1.088364e-05,1.520731e-08,166.28755,3.492204,...,0.999167,0.529167,17.74779,-4.665553,30.609503,-0.149773,163.202338,1.801013,177.907075,3.361806
3,1909.627605,5.264268,0.041667,0.338333,3837.003,8.871788,7.316356e-06,7.16431e-09,168.02605,3.58211,...,0.995,0.546667,17.74779,-1.656748,25.306486,-0.163399,117.834733,1.644059,180.486827,3.570893
4,4949.475105,5.264268,0.010833,0.2875,9916.698,8.871788,1.947815e-05,7.16431e-09,169.9622,3.732306,...,0.998333,0.655833,17.74779,-1.656748,28.270165,-0.284702,305.787055,1.50039,190.818426,3.765719
5,4918.49044,4.936377,0.011667,0.21,9916.698,8.871788,1.947815e-05,7.16431e-09,172.43605,3.992718,...,0.998333,0.734167,79.71712,-1.000966,28.195309,0.035662,305.935326,1.221478,194.748604,4.107706
6,4918.49044,3.628561,0.009167,0.2725,9916.698,8.871788,1.947815e-05,6.674199e-09,174.88805,4.152052,...,0.998333,0.583333,79.71712,1.614667,28.493333,1.033471,304.861923,0.952122,194.204694,4.272985
7,4946.72743,2.794732,0.008333,0.311667,9916.698,7.340808,1.947815e-05,6.674199e-09,174.62365,4.078501,...,0.998333,0.52,23.24314,1.751343,28.660882,0.800401,304.32762,0.73447,191.929191,4.117855
8,4946.72743,2.578297,0.005,0.340833,9916.698,6.907937,1.947815e-05,4.730308e-09,175.0224,4.002688,...,0.9975,0.5275,23.24314,1.751343,24.864896,0.637284,326.985173,0.677099,194.208816,4.063197
9,4946.72743,2.578297,0.005,0.3275,9916.698,6.907937,1.947815e-05,4.730308e-09,175.0744,3.970547,...,0.9975,0.5625,23.24314,1.751343,24.859109,0.689589,327.008084,0.66615,194.307345,4.055483
