# Feature extraction
At this stage, the raw data should already have been transformed into a standard time series format. This code first converts the data into the "cesium" format and then extracts features from it.

In [1]:
#load libraries
import csv
import os
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from cesium import featurize
from dateutil import parser

# Path of the input CSV file (a file, not a directory).
# Set the TS_DATA_FILE environment variable to point the notebook at another copy of the
# data without editing this cell; when it is unset, the original local path is used.
data_file = os.environ.get("TS_DATA_FILE", '/Users/leeo/Downloads/17273_13-11-17_ts.csv')

  data = yaml.load(f.read()) or {}


In [2]:
# Read the CSV with its first row as the header and its first column as the row index,
# then convert that index to datetimes so the frame can be sliced by timestamp.
data = pd.read_csv(data_file, index_col=0, header=0)
data.index = pd.to_datetime(data.index)
# Preview the first five rows.
data.head(5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0_level_0,HR,Resp,Comments
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-11-13 10:21:49.355800,162.64,2.420671,
2017-11-13 10:21:49.855800,167.7724,1.473044,
2017-11-13 10:21:50.355800,167.3049,0.916979,
2017-11-13 10:21:50.855800,164.9741,2.693337,
2017-11-13 10:21:51.355800,170.3801,4.941797,


## Data slice - attempt 1: all in one

In [3]:
# Convert the DatetimeIndex into a NumPy datetime64 array.
time_array = np.array(data.index.to_pydatetime(), dtype=np.datetime64)

# Wrap a single 2-row array in a list; both rows hold the same float timestamps.
# NOTE(review): the floats are the raw datetime64 tick counts, not seconds, so any
# feature that depends on the time axis (e.g. max_slope) is expressed per tick -
# confirm this is intended.
times_as_float = time_array.astype("float")
cesium_times = [np.array([times_as_float, times_as_float])]

# Check the time format level by level: list -> 2-D array -> 1-D array -> scalar,
# then print the shape of the 2-D array.
sample_times = cesium_times[0]
for level in (cesium_times, sample_times, sample_times[0], sample_times[0][0]):
    print(type(level))
print(sample_times.shape)

<class 'list'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.float64'>
(2, 159111)


In [4]:
# Stack the HR and Resp columns into one 2-row array (row 0 = HR, row 1 = Resp) and
# wrap it in a list, matching the nesting of cesium_times.
channel_columns = ("HR", "Resp")
cesium_values = [np.array([data[column].to_numpy() for column in channel_columns])]

# Check the value format level by level: list -> 2-D array -> 1-D array -> scalar,
# then print the HR row.
sample_values = cesium_values[0]
for level in (cesium_values, sample_values, sample_values[0], sample_values[0][0]):
    print(type(level))
print(sample_values[0])

<class 'list'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.float64'>
[162.64   167.7724 167.3049 ... 164.5599 161.2966 161.2966]


In [5]:
# Names of the features to extract; the list is passed to cesium unchanged.
features_to_use = [
    "amplitude",
    "percent_beyond_1_std",
    "maximum",
    "max_slope",
    "median",
    "median_absolute_deviation",
    "percent_close_to_median",
    "minimum",
    "skew",
    "std",
    "weighted_average",
]

# Featurize the whole recording as one sample and display the resulting table.
fset_cesium = featurize.featurize_time_series(
    times=cesium_times,
    values=cesium_values,
    features_to_use=features_to_use,
)
fset_cesium

feature,amplitude,amplitude,percent_beyond_1_std,percent_beyond_1_std,maximum,maximum,max_slope,max_slope,median,median,...,percent_close_to_median,percent_close_to_median,minimum,minimum,skew,skew,std,std,weighted_average,weighted_average
channel,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
0,7680.97217,21.236365,0.00291,0.261541,15371.98,17.52629,0.026259,5.4e-05,177.5802,2.831188,...,0.997593,0.983955,10.03566,-24.94644,24.569635,-0.119295,355.818913,1.560819,196.083506,2.765618


## Data slice - attempt 2: slice with windows

In [6]:
# List the rows that carry a comment; these annotations are used to pick the labels.
has_comment = data["Comments"].notnull()
data.loc[has_comment, "Comments"]

Date_Time
2017-11-13 10:40:53.355800                        #* LPS 300ng IV Jugular Vein 
2017-11-13 10:41:21.355800    #* Rob is looking very cute today in a tight s...
2017-11-13 12:17:16.855800                                                 #*  
2017-11-13 15:50:10.855800                                                 #*  
Name: Comments, dtype: object

<img src="pictures/data_slicing_concepts.jpg" width="800" height="400">

In [7]:
# Parameters of the rolling-window slicing.
t0 = pd.to_datetime("2017-11-13 10:41:21.355800")  # starting time for sepsis
# "min" is used instead of the "T" alias, which recent pandas versions deprecate.
delta_t = pd.Timedelta("1min")      # step between the starts of consecutive time frames
time_frame = pd.Timedelta("10min")  # length of one time frame
time_window = pd.Timedelta("1h")    # total span covered by the rolling time frames
# Number of frames of length time_frame that fit in time_window when stepping by delta_t.
# Floor division of two Timedeltas yields an exact integer, so the count cannot be
# thrown off by float rounding the way int(a / b + 1) can.
n = int((time_window - time_frame) // delta_t) + 1
t1 = t0 + pd.Timedelta("1.5h")  # starting time for sepsis recover
print("The starting time is {}, we choose {} as time window, \nwhere each time frame rolls in {}, and therefore there is \n{} time frames in total ".format(
t0, time_window, delta_t, n))
n

The starting time is 2017-11-13 10:41:21.355800, we choose 0 days 01:00:00 as time window, 
where each time frame rolls in 0 days 00:01:00, and therefore there is 
51 time frames in total 


51

In [8]:
# Build one cesium sample per rolling time frame: frame i starts i steps of delta_t
# after t0 and spans time_frame.
cesium_times = []
cesium_values = []
for i in range(n):
    t_0_start = t0 + delta_t*i
    # Label slicing includes both end points, so the stop is pulled back by 0.5 s.
    # NOTE(review): 0.5 s looks like the sampling period of the recording - confirm.
    t_0_stop = t_0_start + time_frame - pd.Timedelta("0.5s")

    # Slice the frame once and read the time axis and both channels from that slice.
    cesium_df = data[t_0_start:t_0_stop]
    cesium_t = cesium_df.index.to_numpy()
    cesium_HR = cesium_df.HR.to_numpy()
    cesium_Resp = cesium_df.Resp.to_numpy()

    # Row 0 = HR, row 1 = Resp; both channels share the same float time axis.
    frame_times = cesium_t.astype("float")
    cesium_values.append(np.array([cesium_HR, cesium_Resp]))
    cesium_times.append(np.array([frame_times, frame_times]))

# Summarise the container type, the number of frames and the first frame.
first_frame = cesium_values[0]
print("Within the {} of cesium_values, there are {} objects, and each object is a {} with the shape of {}. Example: \n{}".format(
    type(cesium_values), len(cesium_values), type(first_frame), first_frame.shape, first_frame))

Within the <class 'list'> of cesium_values, there are 51 objects, and each object is a <class 'numpy.ndarray'> with the shape of (2, 1200). Example: 
[[ 76.69914   33.19289   33.19289  ... 171.1363   164.6466   163.6169  ]
 [  5.962028   6.459615   5.938771 ...   4.145388   4.027738   4.279074]]


In [9]:
#set the feature wants to extract
features_to_use = ["amplitude",
                   "percent_beyond_1_std",
                   "maximum",
                   "max_slope",
                   "median",
                   "median_absolute_deviation",
                   "percent_close_to_median",
                   "minimum",
                   "skew",
                   "std",
                   "weighted_average"]

fset_cesium = featurize.featurize_time_series(times = cesium_times,
                                              values = cesium_values,
                                              features_to_use=features_to_use)
fset_cesium

feature,amplitude,amplitude,percent_beyond_1_std,percent_beyond_1_std,maximum,maximum,max_slope,max_slope,median,median,...,percent_close_to_median,percent_close_to_median,minimum,minimum,skew,skew,std,std,weighted_average,weighted_average
channel,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
0,3096.71946,4.640335,0.005833,0.210833,6214.558,7.299869,1.20395e-05,3.541037e-09,172.78725,3.510196,...,0.996667,0.773333,21.11908,-1.980801,23.08135,0.348814,235.867822,0.935817,186.111328,3.673739
1,3096.71946,4.640335,0.005,0.185,6214.558,7.299869,1.20395e-05,3.541037e-09,170.8654,3.398261,...,0.998333,0.843333,21.11908,-1.980801,32.506157,0.229591,178.110347,0.794145,179.387787,3.504997
2,3096.71946,3.910178,0.003333,0.1975,6214.558,5.839556,1.20395e-05,3.541037e-09,169.40225,3.324583,...,0.998333,0.860833,21.11908,-1.980801,32.996929,-0.987469,177.216858,0.638025,177.935936,3.385825
3,3096.71946,3.910178,0.001667,0.186667,6214.558,5.839556,1.20395e-05,3.541037e-09,168.55905,3.324583,...,0.999167,0.874167,21.11908,-1.980801,33.822624,-1.042288,175.701636,0.623139,176.080177,3.368576
4,3096.71946,3.989851,0.000833,0.1725,6214.558,5.998902,1.20395e-05,3.541037e-09,168.1774,3.359297,...,0.999167,0.879167,21.11908,-1.980801,33.826939,-0.766593,175.699127,0.642961,175.869431,3.435898
5,3045.9181,3.989851,0.000833,0.171667,6214.558,5.998902,1.20395e-05,3.541037e-09,165.8394,3.432502,...,0.999167,0.883333,122.7218,-1.980801,34.154814,-0.970004,175.162695,0.630485,175.285913,3.477178
6,80.1698,2.396848,0.224167,0.2025,283.0614,5.998902,2.226582e-07,3.303224e-09,162.5304,3.448364,...,0.795833,0.773333,122.7218,1.205205,1.596555,1.126771,15.59841,0.528758,167.801534,3.538775
7,762.95759,2.396848,0.016667,0.196667,1564.887,5.998902,2.835513e-06,3.303224e-09,160.05655,3.443003,...,0.996667,0.795833,38.97182,1.205205,24.914998,1.204383,45.260677,0.510951,166.398597,3.526367
8,762.95759,3.245587,0.0175,0.17,1564.887,5.998902,2.835513e-06,6.340833e-09,159.3462,3.44284,...,0.996667,0.869167,38.97182,-0.492271,25.420931,0.334511,45.015901,0.547655,164.770906,3.523595
9,762.95759,3.245587,0.02,0.163333,1564.887,5.998902,2.835513e-06,6.731184e-09,157.6704,3.399628,...,0.995833,0.853333,38.97182,-0.492271,25.43021,0.381536,45.072772,0.611968,163.12938,3.501087


In [10]:
# Show the kernel's current working directory. An explicit call is used instead of the
# bare `pwd` automagic, which only works in IPython with automagic enabled.
os.getcwd()

'/Users/leeo/Desktop/KI2/7.master thesis/4.github/master-thesis-project/codes'