# Feature extraction
At this stage, the raw data should be transform into a standard time series format. This codes is to first transform the data into a "cesium" format, and then extract the feature from it.

In [6]:
#load libraries
from datetime import datetime, timedelta
from dateutil import parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
from cesium import featurize

#specify the data dir
data_file = '/Users/leeo/Desktop/KI2/7.master_thesis/1.data/4.ts_format/17273_ts/17273_14-11-17.csv'

In [None]:
data = pd.read_csv(data_file, header=0, index_col=0)
data.index = pd.to_datetime(data.index)
data.head()

## Data slice : slice with windows

In [None]:
#check the comments for label
data["Comments"][data["Comments"].isnull()==False]

<img src="pictures/data_slicing_concepts.jpg" width="800" height="400">

In [None]:
t0 = pd.to_datetime("2017-11-14 08:43:00.261200") #starting time for sepsis
delta_t = pd.Timedelta("1T")
time_frame = pd.Timedelta("10T")
time_window = pd.Timedelta("1h")
n = int((time_window - time_frame)/delta_t +1)
t1 = t0 + pd.Timedelta("1.5h") #starting time for sepsis recover

print("The starting time of sepsis status is {}, we choose {} \nas time window, where each time frame rolls in {}, and therefore there is \n{} time frames in total ".format(
t0, time_window, delta_t, n))

In [None]:
cesium_times = []
cesium_values = []
for i in range(n):
    t_0_start = t0 + delta_t*i
    t_0_stop = t_0_start + time_frame - pd.Timedelta("0.5s")
    cesium_df = data[t_0_start:t_0_stop]
    cesium_t = cesium_df.loc[t_0_start:t_0_stop].index.to_numpy()
    cesium_HR = cesium_df.loc[t_0_start:t_0_stop].HR.to_numpy()
    cesium_Resp = cesium_df.loc[t_0_start:t_0_stop].Resp.to_numpy()
    
    #list for labels?
    cesium_values.append(np.array([cesium_HR,cesium_Resp]))
    cesium_times.append(np.array([cesium_t.astype("float"),cesium_t.astype("float")]))
if len(cesium_values) != n:
    raise Exception('The time window is incontinous, please check the experiment data.')
if cesium_values[0].shape != (2, time_frame.total_seconds()/pd.Timedelta("0.5s").total_seconds()):
    raise Exception('Check the HR or Resp values, if there any missing values, or you forget to include one of them.')
    
print("Within the {} of cesium_values, there are {} objects, and each object is a {} with the shape of {}. Example: \n{}".format(
type(cesium_values), len(cesium_values), type(cesium_values[0]), cesium_values[0].shape,cesium_values[0]))

In [None]:
#set the feature wants to extract
features_to_use = ["amplitude",
                   "percent_beyond_1_std",
                   "maximum",
                   "max_slope",
                   "median",
                   "median_absolute_deviation",
                   "percent_close_to_median",
                   "minimum" ,
                   "skew",
                   "std",
                   "weighted_average"]

fset_cesium = featurize.featurize_time_series(times = cesium_times,
                                              values = cesium_values,
                                              features_to_use=features_to_use)
fset_cesium


## Extrac the features for control group --checkpoint 5

In [23]:
import os
from glob import glob
PATH = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/11.feature_label_intact/lps_saline/"
EXT = "*.csv"
all_csv_files = [file
                 for path, subdir, files in os.walk(PATH)
                 for file in glob(os.path.join(path, EXT))]

In [3]:
#check the data situation for the whole group
for f in all_csv_files:
    data = pd.read_csv(f, sep=",", header=0,
                           parse_dates=[0], index_col=0)
    #check the timeframe consistency
    interval_wrong=0
    #check the data situation
    for i in range(len(data)-1200):
        if data.index[i+1200]-data.index[i]!=pd.Timedelta("10T"):
            interval_wrong=interval_wrong+1
            #print("For no.{} interval, starts from {}, end at {}.".format(interval_wrong,data.index[i],data.index[i+1] ))
    percent=interval_wrong/len(data)
    print("The total wrong interval for {} is {}, account for {}.".format(f[-9:-4], interval_wrong, percent))

The total wrong interval for 18163 is 9600, account for 0.013608142205086044.


TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [26]:
#set the parameters
delta_t = pd.Timedelta("1T")
time_frame = pd.Timedelta("20T")  #5, 10, 20min

#calculate the iteration index
rolling_index = int(delta_t/pd.Timedelta("0.5S"))
time_frame_index = int(time_frame/pd.Timedelta("0.5S"))

#set the feature wants to extract
feature_genral= ["amplitude", "max_slope", "mean", "maximum", "median", "minimum", "skew", "std"]
feature_cadence= ["avg_err", "all_times_nhist_numpeaks", "med_err", "std_err", 
                  "all_times_nhist_peak1_bin", "all_times_nhist_peak2_bin",
                  "cad_probs_1", "cad_probs_10"]
feature_lomb_scargle =["fold2P_slope_10percentile", "fold2P_slope_90percentile", "freq1_amplitude1",
                      "freq1_amplitude2", "freq1_lambda", "freq1_signif", "linear_trend",
                      "scatter_res_raw"]
features_to_use = feature_genral#+feature_cadence +feature_lomb_scargle



#export the feature data
for f in all_csv_files:
    #load the data
    data = pd.read_csv(f, sep=",", header=0,
                           parse_dates=[0], index_col=0)
    #prepare the cesium format
    x=0
    cesium_times = []
    cesium_values = []
    start_time = []
    end_time = []
    while x+time_frame_index<len(data):
        if data.index[x+time_frame_index]-data.index[x]== time_frame:
            #store the time for labeling
            start_time.append(data.index[x])
            end_time.append(data.index[x+time_frame_index])
        
            #tranform the data into cesium format
            cesium_t = data.index[x:x+time_frame_index].to_numpy()
            cesium_HR = data.HR[x:x+time_frame_index].to_numpy()
            cesium_Resp = data.Resp[x:x+time_frame_index].to_numpy()
            #store into numpy array
            cesium_values.append(np.array([cesium_HR,cesium_Resp]))
            cesium_times.append(np.array([cesium_t.astype("float"),cesium_t.astype("float")]))
        x=x+rolling_index
    if cesium_values[0].shape != (2, time_frame.total_seconds()/pd.Timedelta("0.5s").total_seconds()):
        raise Exception('Check the HR or Resp values, if there any missing values, or you forget to include one of them.')
    
    #extract the features
    fset_cesium = featurize.featurize_time_series(times = cesium_times,
                                              values = cesium_values,
                                              features_to_use=features_to_use)
    fset_cesium["start_time"]=start_time
    fset_cesium["end_time"]=end_time
    fset_cesium['label']=0
    #export the dataset
    #fset_cesium.to_csv(f[:-9]+"general_fe_"+f[-9:], header=True)
    fset_cesium.to_csv("/Users/leeo/Desktop/KI2/7.master_thesis/1.data/15.feature_label_intact_20min/lps_saline_fe/"+f[-9:],
                       header=True)
    

## Extrac the features for LPS/Saline group

In [52]:
import os
from glob import glob
PATH = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/10.revised_object_data_combined_vc5/control_group/"
EXT = "*.csv"
all_csv_files = [file
                 for path, subdir, files in os.walk(PATH)
                 for file in glob(os.path.join(path, EXT))]

In [47]:
for f in all_csv_files:
    data = pd.read_csv(f, header=0, index_col=0)
    data.index = pd.to_datetime(data.index)
    #check the timeframe consistency
    interval_wrong=0
    #check the data situation
    for i in range(len(data)-1200):
        if data.index[i+1200]-data.index[i]!=pd.Timedelta("10T"):
            interval_wrong=interval_wrong+1
            #print("For no.{} interval, starts from {}, end at {}.".format(interval_wrong,data.index[i],data.index[i+1] ))
    percent=interval_wrong/len(data)
    print("The total wrong interval for {} is {}, account for {}.".format(f[-9:-4], interval_wrong, percent))
    

  interactivity=interactivity, compiler=compiler, result=result)


KeyboardInterrupt: 

In [53]:
#set the parameters
delta_t = pd.Timedelta("1T")
time_frame = pd.Timedelta("10T")

#calculate the iteration index
rolling_index = int(delta_t/pd.Timedelta("0.5S"))
time_frame_index = int(time_frame/pd.Timedelta("0.5S"))

#set the feature wants to extract
feature_genral= ["amplitude", "max_slope", "mean", "maximum", "median", "minimum", "skew", "std","flux_percentile_ratio_mid20",
                 "flux_percentile_ratio_mid35","flux_percentile_ratio_mid50","flux_percentile_ratio_mid65",
                 "flux_percentile_ratio_mid80","median_absolute_deviation", "percent_amplitude", 
                 "percent_beyond_1_std", "percent_close_to_median","percent_difference_flux_percentile",
                 "qso_log_chi2_qsonu",  "stetson_j", "stetson_k", "weighted_average"
                ]#"period_fast", "qso_log_chi2nuNULL_chi2nu", 

feature_cadence= ["avg_err", "all_times_nhist_numpeaks", "med_err"] #"std_err", "all_times_nhist_peak1_bin", "all_times_nhist_peak2_bin","cad_probs_1", "cad_probs_10"
feature_lomb_scargle =["fold2P_slope_10percentile", "fold2P_slope_90percentile", "freq1_amplitude1"]#,"freq1_amplitude2", "freq1_lambda", "freq1_signif", "linear_trend","scatter_res_raw"
features_to_use = feature_genral#+feature_cadence +feature_lomb_scargle

#export the feature data
for f in all_csv_files:
    #load the data
    data = pd.read_csv(f, sep=",", header=0,
                           parse_dates=[0], index_col=0)
    #prepare the cesium format
    x=0
    cesium_times = []
    cesium_values = []
    start_time = []
    end_time = []
    while x+time_frame_index<len(data):
        if data.index[x+time_frame_index]-data.index[x]== time_frame:
            #store the time for labeling
            start_time.append(data.index[x])
            end_time.append(data.index[x+time_frame_index])
        
            #tranform the data into cesium format
            cesium_t = data.index[x:x+time_frame_index].to_numpy()
            cesium_HR = data.HR[x:x+time_frame_index].to_numpy()
            cesium_Resp = data.Resp[x:x+time_frame_index].to_numpy()
            #store into numpy array
            cesium_values.append(np.array([cesium_HR,cesium_Resp]))
            cesium_times.append(np.array([cesium_t.astype("float"),cesium_t.astype("float")]))
        x=x+rolling_index
    if cesium_values[0].shape != (2, time_frame.total_seconds()/pd.Timedelta("0.5s").total_seconds()):
        raise Exception('Check the HR or Resp values, if there any missing values, or you forget to include one of them.')
    
    #extract the features
    fset_cesium = featurize.featurize_time_series(times = cesium_times,
                                              values = cesium_values,
                                              features_to_use=features_to_use)
    fset_cesium["start_time"]=start_time
    fset_cesium["end_time"]=end_time
    fset_cesium['label']=0
    #export the dataset
    fset_cesium.to_csv(f[:-9]+"general_fe_"+f[-9:], header=True)

  arg = dt*np.exp(-np.log(10)*ltau); ri = np.exp(-arg); ei = 1./(1./ri-ri)
  interactivity=interactivity, compiler=compiler, result=result)


## Extrac the features for LPS/Ibuprofen group

In [393]:
import os
from glob import glob
PATH = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/10.revised_object_data_combined_vc5/lps_ibuprofen/"
EXT = "*.csv"
all_csv_files = [file
                 for path, subdir, files in os.walk(PATH)
                 for file in glob(os.path.join(path, EXT))]

In [None]:
for f in all_csv_files:
    data = pd.read_csv(f, header=0, index_col=0)
    data.index = pd.to_datetime(data.index)
    #check the timeframe consistency
    interval_wrong=0
    #check the data situation
    for i in range(len(data)-1200):
        if data.index[i+1200]-data.index[i]!=pd.Timedelta("10T"):
            interval_wrong=interval_wrong+1
            #print("For no.{} interval, starts from {}, end at {}.".format(interval_wrong,data.index[i],data.index[i+1] ))
    percent=interval_wrong/len(data)
    print("The total wrong interval for {} is {}, account for {}.".format(f[-9:-4], interval_wrong, percent))
    

In [None]:
#set the parameters
delta_t = pd.Timedelta("1T")
time_frame = pd.Timedelta("10T")

#calculate the iteration index
rolling_index = int(delta_t/pd.Timedelta("0.5S"))
time_frame_index = int(time_frame/pd.Timedelta("0.5S"))

#set the feature wants to extract
feature_genral= ["amplitude", "max_slope", "mean", "maximum", "median", "minimum", "skew", "std"]
feature_cadence= ["avg_err", "all_times_nhist_numpeaks", "med_err", "std_err", 
                  "all_times_nhist_peak1_bin", "all_times_nhist_peak2_bin",
                  "cad_probs_1", "cad_probs_10"]
feature_lomb_scargle =["fold2P_slope_10percentile", "fold2P_slope_90percentile", "freq1_amplitude1",
                      "freq1_amplitude2", "freq1_lambda", "freq1_signif", "linear_trend",
                      "scatter_res_raw"]
features_to_use = feature_genral#+feature_cadence +feature_lomb_scargle



#export the feature data
for f in all_csv_files:
    #load the data
    data = pd.read_csv(f, sep=",", header=0,
                           parse_dates=[0], index_col=0)
    #prepare the cesium format
    x=0
    cesium_times = []
    cesium_values = []
    start_time = []
    end_time = []
    while x+time_frame_index<len(data):
        if data.index[x+time_frame_index]-data.index[x]== time_frame:
            #store the time for labeling
            start_time.append(data.index[x])
            end_time.append(data.index[x+time_frame_index])
        
            #tranform the data into cesium format
            cesium_t = data.index[x:x+time_frame_index].to_numpy()
            cesium_HR = data.HR[x:x+time_frame_index].to_numpy()
            cesium_Resp = data.Resp[x:x+time_frame_index].to_numpy()
            #store into numpy array
            cesium_values.append(np.array([cesium_HR,cesium_Resp]))
            cesium_times.append(np.array([cesium_t.astype("float"),cesium_t.astype("float")]))
        x=x+rolling_index
    if cesium_values[0].shape != (2, time_frame.total_seconds()/pd.Timedelta("0.5s").total_seconds()):
        raise Exception('Check the HR or Resp values, if there any missing values, or you forget to include one of them.')
    
    #extract the features
    fset_cesium = featurize.featurize_time_series(times = cesium_times,
                                              values = cesium_values,
                                              features_to_use=features_to_use)
    fset_cesium["start_time"]=start_time
    fset_cesium["end_time"]=end_time
    fset_cesium['label']=0
    #export the dataset
    fset_cesium.to_csv(f[:-9]+"general_fe_"+f[-9:], header=True)

# labeling

In [None]:
#label for lps+saline group
import os
from glob import glob
PATH = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/12.feature_label_4h_control/lps_ibuprofen/"
EXT = "*.csv"
all_csv_files = [file
                 for path, subdir, files in os.walk(PATH)
                 for file in glob(os.path.join(path, EXT))]


In [None]:
f= all_csv_files[8]
print(f)
data = pd.read_csv(f, sep=",", header=0,
                       parse_dates=[0], index_col=0)

In [None]:
#set the injection time
t1 = pd.to_datetime("12/2/2019 8:15:00 AM") 
t2 = pd.to_datetime("12/3/2019 9:45:00 AM") 
t3 = pd.to_datetime("12/4/2019 10:00:00 AM") 

#quick look at the data
print("The shape of data is: ", data.shape)
data["HR"].resample("10T").mean().plot(color="b",alpha=1)
(data["Resp"]*10).resample("1T").mean().plot(color="g",alpha=0.5)
plt.legend(["Heart Rate", "Respiration Rate"],
          loc="upper left")
plt.axvline(t1, alpha=0.8, color ="red")
plt.text(t1,-200,'1st injection', color ="red",rotation=90)
plt.axvline(t2, alpha=0.8, color ="red")
plt.text(t2,-200,'2nd injection', color ="red",rotation=90)
plt.axvline(t3, alpha=0.8, color ="red")
plt.text(t3,-200,'3rd injection', color ="red",rotation=90)
plt.axhline(-150, alpha=0.2, color ="black")
plt.text(data.index[0],-150,'-1.5mmHg', color ="black")

In [None]:
#calculate the baseline for 1st injection
baseline_HR_1 = data.loc[:t1].HR.mean()
print('The baseline for 1st injection is {}'.format(baseline_HR_1))
data = data.loc[t1-pd.to_timedelta("1D"):]

#find the label time for septic status #for only 1 peek
inflation = pd.DataFrame((data["HR"]/baseline_HR_1*100).resample("10T").mean())
print("The cut-off for 1st injection is {}".format(inflation[t1:t1+pd.to_timedelta("8H")].idxmax(axis=0)))

#find the label time for recovery status
injection_rec_1 = inflation[inflation[t1:t1+pd.to_timedelta("8H")].HR.idxmax(axis=0):t2]
print("The cut-off for 1st recovery is {}".format(injection_rec_1[injection_rec_1.HR<110].first_valid_index()))

In [None]:
#zoom in the experiment area
print("The shape of data is: ", data.shape)
(data["HR"]/baseline_HR_1*100).resample("10T").mean().plot(color="b",alpha=1)
plt.axvline(t1, alpha=0.8, color ="red")
plt.text(t1,85,'1st injection', color ="red",rotation=90)
plt.axvline(t2, alpha=0.8, color ="red")
plt.text(t2,85,'2nd injection', color ="red",rotation=90)
plt.axvline(t3, alpha=0.8, color ="red")
plt.text(t3,85,'3rd injection', color ="red",rotation=90)

plt.axhline(100, color ="green")
plt.axhline(110, color ="red")

#adjust the plot here:
t4 = pd.to_datetime("2019-07-08 11:10:00")
t5 = pd.to_datetime("2019-07-08 16:30:00")

plt.axvline(t4, alpha=0.8, color ="red")
plt.axvline(t5, alpha=0.8, color ="red")

In [None]:
#for those with 2 peeks
first_peek = pd.to_datetime("2019-09-16 13:40:00")+pd.to_timedelta("20T")
print(inflation[first_peek:t1+pd.to_timedelta("8H")].idxmax(axis=0))

#find the label time for recovery status
injection_rec_1 = inflation[inflation[first_peek:t1+pd.to_timedelta("8H")].HR.idxmax(axis=0):t2]
print(injection_rec_1[injection_rec_1.HR<110].first_valid_index())


In [None]:
#find the cut-off value for the 2nd injection
baseline_HR_2 = data.loc[t2-pd.to_timedelta("4H"):t2].HR.mean()
print('The baseline for 2nd injection is {}'.format(baseline_HR_2))

#find the label time for septic status
inflation = pd.DataFrame((data["HR"]/baseline_HR_2*100).resample("10T").mean())
print("The cut-off for 2nd injection is {}".format(inflation[t2:t2+pd.to_timedelta("8H")].idxmax(axis=0)))

#find the label time for recovery status
injection_rec_2 = inflation[inflation[t2:t2+pd.to_timedelta("8H")].HR.idxmax(axis=0):t3]
print("The cut-off for 2nd recovery is {}".format(injection_rec_2[injection_rec_2.HR<110].first_valid_index()))

In [None]:
#zoom in the experiment area
print("The shape of data is: ", data.shape)
(data["HR"]/baseline_HR_2*100).resample("10T").mean().plot(color="b",alpha=1)
plt.axvline(t1, alpha=0.8, color ="red")
plt.text(t1,85,'1st injection', color ="red",rotation=90)
plt.axvline(t2, alpha=0.8, color ="red")
plt.text(t2,85,'2nd injection', color ="red",rotation=90)
plt.axvline(t3, alpha=0.8, color ="red")
plt.text(t3,85,'3rd injection', color ="red",rotation=90)

plt.axhline(100, color ="green")
plt.axhline(110, color ="red")

#adjust the plot here:
t4 = pd.to_datetime("2019-07-09 11:20:00")
t5 = pd.to_datetime("2019-07-09 11:40:00")

plt.axvline(t4, alpha=0.8, color ="red")
plt.axvline(t5, alpha=0.8, color ="red")

In [None]:
#find the cut-off value for the 3rd injection
baseline_HR_3 = data.loc[t3-pd.to_timedelta("4H"):t3].HR.mean()
print('The baseline for 3rd injection is {}'.format(baseline_HR_3))

#find the label time for septic status
inflation = pd.DataFrame((data["HR"]/baseline_HR_3*100).resample("10T").mean())
print("The cut-off for 3rd injection is {}".format(inflation[t3:t3+pd.to_timedelta("8H")].idxmax(axis=0)))

#find the label time for recovery status
injection_rec_3 = inflation[inflation[t3:t3+pd.to_timedelta("8H")].HR.idxmax(axis=0):]
print("The cut-off for 3rd recovery is {}".format(injection_rec_3[injection_rec_3.HR<110].first_valid_index()))

In [None]:
#zoom in the experiment area
print("The shape of data is: ", data.shape)
(data["HR"]/baseline_HR_3*100).resample("10T").mean().plot(color="b",alpha=1)
plt.axvline(t1, alpha=0.8, color ="red")
plt.text(t1,85,'1st injection', color ="red",rotation=90)
plt.axvline(t2, alpha=0.8, color ="red")
plt.text(t2,85,'2nd injection', color ="red",rotation=90)
plt.axvline(t3, alpha=0.8, color ="red")
plt.text(t3,85,'3rd injection', color ="red",rotation=90)

plt.axhline(100, color ="green")
plt.axhline(110, color ="red")

#adjust the plot here:
t4 = pd.to_datetime("2019-07-10 15:20:00")
t5 = pd.to_datetime("2019-07-10 16:00:00")

plt.axvline(t4, alpha=0.8, color ="red")
plt.axvline(t5, alpha=0.8, color ="red")

## change the label

In [250]:
#label for lps+saline group
import os
from glob import glob
#PATH = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/14.feature_label_intact_5min/label_fe/lps_saline_fe/"
PATH = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/15.feature_label_intact_20min/label_fe/lps_saline_fe/"
EXT = "*.csv"
all_csv_files = [file
                 for path, subdir, files in os.walk(PATH)
                 for file in glob(os.path.join(path, EXT))]

In [257]:
f= all_csv_files[6]
print(f)
data = pd.read_csv(f, sep=",", header=0,
                       parse_dates=[0], index_col=0)
data.start_time =pd.to_datetime(data.start_time)
data.end_time =pd.to_datetime(data.end_time)
data["AnimalID"]=f[-9:-4] #add the animal ID here
print(data["label"].value_counts())
data.head()

/Users/leeo/Desktop/KI2/7.master_thesis/1.data/15.feature_label_intact_20min/label_fe/lps_saline_fe/17273.csv
0.0    8651
1.0     619
2.0     140
Name: label, dtype: int64


Unnamed: 0_level_0,amplitude,amplitude.1,max_slope,max_slope.1,mean,mean.1,maximum,maximum.1,median,median.1,minimum,minimum.1,skew,skew.1,std,std.1,start_time,end_time,label,AnimalID
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
channel,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,NaT,NaT,,17273
0,100.487495,11.831542,1.855535e-07,2.13903e-08,163.052319,0.226933,223.1421,15.21488,163.14085,0.438142,22.16711,-8.448203,-4.524016,-0.05653,12.234577,2.088825,2017-11-10 15:36:35.629100,2017-11-10 15:56:35.629100,0.0,17273
1,100.487495,11.831542,1.855535e-07,2.13903e-08,162.28201,0.447554,223.1421,15.21488,162.55265,0.553468,22.16711,-8.448203,-4.156465,0.290079,12.478502,1.840621,2017-11-10 15:37:35.629100,2017-11-10 15:57:35.629100,0.0,17273
2,100.487495,11.779478,1.855535e-07,2.13903e-08,161.625625,0.553211,223.1421,15.21488,161.89715,0.549768,22.16711,-8.344077,-3.896133,0.712706,12.535939,1.677892,2017-11-10 15:38:35.629100,2017-11-10 15:58:35.629100,0.0,17273
3,100.487495,14.980535,1.855535e-07,2.13903e-08,161.21921,0.568737,223.1421,15.21488,160.7817,0.566958,22.16711,-14.74619,-3.749656,0.30524,12.596248,1.628171,2017-11-10 15:39:35.629100,2017-11-10 15:59:35.629100,0.0,17273


In [252]:
#1st injection
injection_1 = pd.to_datetime("11/13/2017 10:30:00 AM")
septic_1    = pd.to_datetime("11/13/2017 6:10:00 PM")
recovery_1  = pd.to_datetime("11/13/2017 8:10:00 PM")
for i in range(len(data)):
    if injection_1<=data.iloc[i]["start_time"]<=septic_1:
        data.iloc[i, data.columns.get_loc('label')] = 1
    if septic_1<data.iloc[i]["start_time"]<=recovery_1:
        data.iloc[i, data.columns.get_loc('label')] = 2
        

In [253]:
#2nd injection
injection_2 = pd.to_datetime("11/14/2017 8:30:00 AM")
septic_2    = pd.to_datetime("11/14/2017 11:10:00 AM")
recovery_2  = pd.to_datetime("11/14/2017 11:30:00 AM")
for i in range(len(data)):
    if injection_2<=data.iloc[i]["start_time"]<=septic_2:
        data.iloc[i, data.columns.get_loc('label')] = 1
    if septic_2<data.iloc[i]["start_time"]<=recovery_2:
        data.iloc[i, data.columns.get_loc('label')] = 2
        

In [242]:
#3rd injection
injection_3 = pd.to_datetime("11/7/2018 9:30:00 AM")
septic_3    = pd.to_datetime("11/7/2018 2:30:00 PM")
recovery_3  = pd.to_datetime("11/7/2018 5:40:00 PM")
for i in range(len(data)):
    if injection_3<=data.iloc[i]["start_time"]<=septic_3:
        data.iloc[i, data.columns.get_loc('label')] = 1
    if septic_3<data.iloc[i]["start_time"]<=recovery_3:
        data.iloc[i, data.columns.get_loc('label')] = 2
        

In [254]:
data["label"].value_counts()

0.0    8651
1.0     619
2.0     140
Name: label, dtype: int64

In [255]:
#export the data
data.to_csv(f, header=True)
print("The data have been exported to {}.".format(f))

The data have been exported to /Users/leeo/Desktop/KI2/7.master_thesis/1.data/15.feature_label_intact_20min/label_fe/lps_saline_fe/17273.csv.


## Combine the data within the same group


In [18]:
PATH = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/16.more_feature_label_5min/control_group/"
EXT = "*.csv"
all_csv_files = [file
                 for path, subdir, files in os.walk(PATH)
                 for file in glob(os.path.join(path, EXT))]

In [19]:
combined_data = []
for f in all_csv_files:
    data = pd.read_csv(f, sep=",", header=0,
                   parse_dates=[0], index_col=0)
    #data["AnimalID"]=int(f[-9:-4])
    # store DataFrame in list
    combined_data.append(data)
combined_data = pd.concat(combined_data)
combined_data["label"].value_counts()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


0.0    107118
Name: label, dtype: int64

In [312]:
print(combined_data.columns)
combined_data.head()

Index(['amplitude', 'amplitude.1', 'max_slope', 'max_slope.1', 'mean',
       'mean.1', 'maximum', 'maximum.1', 'median', 'median.1', 'minimum',
       'minimum.1', 'skew', 'skew.1', 'std', 'std.1', 'label', 'AnimalID'],
      dtype='object')


Unnamed: 0,amplitude,amplitude.1,max_slope,max_slope.1,mean,mean.1,maximum,maximum.1,median,median.1,minimum,minimum.1,skew,skew.1,std,std.1,label,AnimalID
0,51.7823,4.311116,3.27514e-08,5.623065e-09,132.728523,1.19597,189.7301,6.008276,136.1924,1.031554,86.1655,-2.613956,0.27349,1.123438,18.330336,0.930758,0.0,18273
1,30.919325,4.311116,3.12086e-08,5.623065e-09,121.39134,0.932382,147.9414,6.008276,122.84665,0.78449,86.10275,-2.613956,-0.21371,1.333223,17.423666,0.920832,0.0,18273
2,30.304125,4.311116,3.12086e-08,5.623065e-09,114.701247,0.745753,146.711,6.008276,109.6847,0.561743,86.10275,-2.613956,0.425488,1.657374,15.383465,0.945924,0.0,18273
3,30.304125,3.722035,2.87514e-08,5.623065e-09,112.609022,0.504009,146.711,4.830114,109.55395,0.325447,86.10275,-2.613956,0.618781,1.306787,13.579618,0.750671,0.0,18273
4,40.914125,3.722035,3.79248e-08,5.623065e-09,109.906242,0.468317,167.931,4.830114,108.85435,0.296636,86.10275,-2.613956,0.845976,1.431374,10.690605,0.718383,0.0,18273


In [313]:
#combined_data = combined_data.drop(columns="start_time")
#combined_data = combined_data.drop(columns="end_time")
#combined_data = combined_data.drop("channel")
#combined_data = combined_data.drop(["channel","4104"])
df = combined_data.reset_index(drop=True)
df["label"].value_counts()

0.0    194828
1.0     10694
2.0      3935
Name: label, dtype: int64

In [314]:
#export the data
exf=PATH+"all_group.csv"
df.to_csv(exf, header=True)
print("The data have been exported to {}.".format(exf))

The data have been exported to /Users/leeo/Desktop/KI2/7.master_thesis/1.data/14.feature_label_intact_5min/label_fe/combine_fe/all_group.csv.


# Z-normalization

In [None]:
f = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/13.deal_withi_overfitting/combine_fe/testing_data.csv"
data = pd.read_csv(f, sep=",", header=0,
                   parse_dates=[0], index_col=0)
data["AnimalID"].value_counts()

In [None]:
combined_zn_data = []
for i in data["AnimalID"].value_counts().index:
    #z-normalize according to animal ID
    tmp_data = data[data["AnimalID"]==i]
    #exclude the last 2 columns: label and AnimalID
    zn_data = pd.DataFrame(stats.zscore(tmp_data.iloc[:,:-2]))
    zn_data[["label","AnimalID"]]= tmp_data.iloc[:,-2:].reset_index(drop=True)
    # store DataFrame in list
    combined_zn_data.append(zn_data)
combined_data = pd.concat(combined_zn_data)
combined_data.columns = data.columns
print(combined_data["label"].value_counts())
print(combined_data["AnimalID"].value_counts())
combined_data.head()

In [None]:
#export the data
combined_data.to_csv(f[:-4]+"_normalized"+f[-4:],header=True)

# Split the training and testing dataset

In [26]:
f = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/16.more_feature_label_5min/combine_fe/all_group.csv"
data = pd.read_csv(f, sep=",", header=0,
                   parse_dates=[0], index_col=0)
data["AnimalID"].value_counts()

19205    13768
18248    12448
19117    12089
19234    11434
18230    11397
18169    10837
19286    10697
18231    10226
18233    10077
18284     9809
18273     9806
19231     9790
19221     9780
17273     9545
18251     9435
18279     8919
19244     8726
18036     8342
19118     8268
18181     8230
18163     5834
Name: AnimalID, dtype: int64

In [27]:
#version2: saline>>17273 #ibuprofen>>19286, 19118
#version3: saline>>18231 #ibuprofen>>19221, 19231
#version4: saline>>18231 #ibuprofen>>19117, 19205
#version5: saline>>18231 #ibuprofen>>19234, 19244
#version6: saline>>18231 #ibuprofen>>19234, 19286
#version7: saline>>18231 #ibuprofen>>19234, 19231
#version8: saline>>18231 #ibuprofen>>19234, 19221
#training data:
training_data = data[data["AnimalID"]!=18231]
training_data = training_data[training_data["AnimalID"]!=19234]
training_data = training_data[training_data["AnimalID"]!=19221].reset_index(drop=True)
#export
training_data.to_csv(f[:-13]+"training_data.csv",header=True)

In [317]:
training_data["AnimalID"].value_counts()

19205    13453
18248    12208
19117    11849
18230    11114
18169    10626
19286    10488
18233     9852
18284     9599
18273     9596
19231     9565
17273     9410
18251     9169
18279     8694
19244     8681
18036     8136
19118     8118
18181     8028
18163     5699
Name: AnimalID, dtype: int64

In [28]:
#testing data:17273,19286, 19118
#testing_data = data[data["AnimalID"]==18231].reset_index(drop=True)
#testing data: combine 17273,19286, 19118
testing_data = pd.concat([data[data["AnimalID"]==18231],data[data["AnimalID"]==19234],data[data["AnimalID"]==19221]]).reset_index(drop=True)

#export
testing_data.to_csv(f[:-13]+"testing_data.csv",header=True)

# Class balancing: undersampling

In [40]:
f = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/16.more_feature_label_5min/combine_fe/training_data_binary.csv"
data = pd.read_csv(f, sep=",", header=0,
                   parse_dates=[0], index_col=0)
data["AnimalID"].value_counts()

19205    13768
18248    12448
19117    12089
18230    11397
18169    10837
19286    10697
18233    10077
18284     9809
18273     9806
19231     9790
17273     9545
18251     9435
18279     8919
19244     8726
18036     8342
19118     8268
18181     8230
18163     5834
Name: AnimalID, dtype: int64

In [41]:
under_sample=[]
for i in data["AnimalID"].value_counts().index:
    tmp = data[data["AnimalID"]==i]
    #pos_data = pd.concat([tmp[tmp["label"]==1],tmp[tmp["label"]==2]]).reset_index(drop=True)
    pos_data = tmp[tmp["label"]==1].reset_index(drop=True)
    neg_data = tmp[tmp["label"]==0].sample(n=len(pos_data), random_state=42)
    com_data = pd.concat([pos_data,neg_data]).reset_index(drop=True)
    under_sample.append(com_data)

combined_data = pd.concat(under_sample).reset_index(drop=True)

In [42]:
#export the undersampling data with only experiment groups
combined_data.to_csv(f[:-9]+"_undersample_exp"+f[-9:],header=True)

In [43]:
control_group = pd.concat([data[data["AnimalID"]==18036],data[data["AnimalID"]==18163],
                           data[data["AnimalID"]==18169],data[data["AnimalID"]==18181],
                           data[data["AnimalID"]==18230],data[data["AnimalID"]==18279]]).reset_index(drop=True)
con_sample = control_group.sample(n=combined_data["label"].value_counts()[0], random_state=42)
combined_all_data = pd.concat([con_sample,combined_data]).reset_index(drop=True)

In [44]:
#export the undersampling data with all groups
combined_all_data.to_csv(f[:-9]+"_undersample"+f[-9:],header=True)

#  Class balancing: oversampling

In [126]:
f = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/13.deal_withi_overfitting/combine_fe/training_data_binary.csv"
df = pd.read_csv(f, sep=",", header=0,
                   parse_dates=[0], index_col=0)
X=df[df.columns[:-2]]  # Features
y=df[df.columns[-2]]  # Labels

In [127]:
#Random OverSampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_sample(X, y)

In [128]:
combined_all_data = pd.concat([pd.DataFrame(X_resampled,columns=list(X.columns)),pd.DataFrame(y_resampled,columns=["label"])],axis=1).reset_index(drop=True)
#export
combined_all_data.to_csv(f[:-4]+"_RandomOverSampler"+f[-4:],header=True)

In [129]:
#SMOTE
from imblearn.over_sampling import SMOTE, ADASYN
X_resampled, y_resampled = SMOTE(random_state=42).fit_sample(X, y)

In [130]:
combined_all_data = pd.concat([pd.DataFrame(X_resampled,columns=list(X.columns)),pd.DataFrame(y_resampled,columns=["label"])],axis=1).reset_index(drop=True)
#export
combined_all_data.to_csv(f[:-4]+"_SMOTE"+f[-4:],header=True)

In [131]:
#ADASYN
X_resampled, y_resampled = ADASYN(random_state=42).fit_sample(X, y)

In [132]:
combined_all_data = pd.concat([pd.DataFrame(X_resampled,columns=list(X.columns)),pd.DataFrame(y_resampled,columns=["label"])],axis=1).reset_index(drop=True)
#export
combined_all_data.to_csv(f[:-4]+"_ADASYN"+f[-4:],header=True)

# Convert into a binary problem

In [32]:
f = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/16.more_feature_label_5min/combine_fe/testing_data.csv"
df = pd.read_csv(f, sep=",", header=0,
                   parse_dates=[0], index_col=0)
df["label"].replace(2,1,inplace=True)

In [33]:
df.to_csv(f[:-4]+"_binary"+f[-4:],header=True)

In [34]:
df["label"].value_counts()

0.0    27150
1.0     4290
Name: label, dtype: int64

# Add coefficient of variance

In [390]:
f = "/Users/leeo/Desktop/KI2/7.master_thesis/1.data/14.feature_label_intact_5min/label_fe/combine_fe/training_data_undersample_binary.csv"
df = pd.read_csv(f, sep=",", header=0,
                   parse_dates=[0], index_col=0)

In [391]:
df["cv"] = df["std"]/df["mean"]
df["cv.1"] = df["std.1"]/df["mean.1"]
cols = df.columns.tolist()
cols = cols[-2:] + cols[:-2]
df = df[cols]

In [392]:
df

Unnamed: 0,cv,cv.1,amplitude,amplitude.1,max_slope,max_slope.1,mean,mean.1,maximum,maximum.1,median,median.1,minimum,minimum.1,skew,skew.1,std,std.1,label,AnimalID
0,0.044023,0.471308,47.621700,3.802456,1.012212e-07,5.937356e-09,146.704140,2.295727,171.2486,6.944242,146.47020,2.022219,76.005200,-0.660671,-2.003606,1.438149,6.458329,1.081994,0.0,18169
1,0.043893,-0.056794,75.457350,4.648720,1.923728e-07,8.040640e-09,166.967905,-27.908063,255.9140,-22.638100,166.74640,-27.972495,104.999300,-31.935540,4.458491,0.464340,7.328766,1.585020,0.0,18036
2,0.336655,9.446419,141.242150,3.772247,4.739434e-07,8.666257e-09,208.483222,0.073657,399.7065,3.111552,166.58645,0.171498,117.222200,-4.432941,1.199952,-1.243524,70.186824,0.695793,0.0,18169
3,0.109705,0.702875,75.382160,7.821536,2.150480e-07,1.514790e-08,155.111164,2.599290,172.4790,9.233308,156.60285,2.805340,21.714680,-6.409764,-5.317346,-1.146275,17.016463,1.826977,0.0,18230
4,0.043935,-0.107007,22.311950,4.476875,1.868501e-08,2.135521e-09,187.295825,-20.294948,203.7394,-16.031360,186.83125,-19.976750,159.115500,-24.985110,-0.587621,-0.418856,8.228843,2.171703,0.0,18036
5,0.107746,0.205244,66.088550,13.699695,1.039764e-07,4.188272e-08,187.532351,8.701286,234.7938,14.458800,191.41195,8.716655,102.616700,-12.940590,-0.210784,-2.637085,20.205868,1.785886,0.0,18169
6,0.035788,0.317706,18.261950,3.088130,2.459520e-08,3.654908e-09,154.440700,2.586340,172.1037,7.133596,154.93995,2.249029,135.579800,0.957336,-0.768623,2.394980,5.527156,0.821695,0.0,18230
7,0.037128,-0.633614,56.765500,7.145187,2.015544e-07,1.444774e-08,167.602033,-2.096889,272.1047,3.435404,166.61400,-2.158288,158.573700,-10.854970,8.603833,-0.228006,6.222722,1.328618,0.0,18163
8,0.087130,1.233758,67.735280,7.659674,1.588124e-07,1.079869e-08,155.542491,2.241662,212.7120,7.330506,156.92460,3.198564,77.241440,-7.988841,-0.317933,-0.808183,13.552355,2.765669,0.0,18230
9,0.097424,1.270557,42.809500,16.302785,5.520040e-08,3.171118e-08,158.464320,1.527204,224.2610,14.083470,155.95135,1.347025,138.642000,-18.522100,1.127937,-0.915996,15.438239,1.940401,0.0,18181


In [380]:
cols

['cv',
 'cv.1',
 'amplitude',
 'amplitude.1',
 'max_slope',
 'max_slope.1',
 'mean',
 'mean.1',
 'maximum',
 'maximum.1',
 'median',
 'median.1',
 'minimum',
 'minimum.1',
 'skew',
 'skew.1',
 'std',
 'std.1',
 'label',
 'AnimalID']