# Prepare environment

In [1]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
## Library import
import pandas as pd
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import random

from warnings import simplefilter
simplefilter('ignore')


## Model
import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping()

In [3]:
## Environment setup
COLAB = True # False: local environment
PROJECT_DIR = "/content/drive/MyDrive/Project/CS565_IoT/"

In [4]:
if COLAB:
  ## Linkage Google Drive
  from google.colab import drive
  drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Set random seeds

tf.random.set_seed(1)

## Set Hyperparamenter for CNN Model

In [6]:
## Data hyperparameter ##

# Data information
original_path = PROJECT_DIR + "Data/Third collection/" # Data  path
total_csv_num = 12 # num of csv
each_emotion_csv_num = 6 # It must same count with negative.csv and positive.csv
minute = 3 # duration of recording (minute)
test_list = [5,6] # test data's name , e.g ) pos_5 is


# PPG signal Information
sampling_rate = 100 # sampling rate
chunk_num = minute*60*sampling_rate # num of ppg data per one csv
# Split data into chunk(train input)
window_sec = 10 # window size(second)
overlap_sec = 5 # overlap size(second)
total_chunk_per_csv = int((3*60-window_sec)/(window_sec-overlap_sec)+1) # The total number of chunks generated when extracting overlapping windows of a specified window size from each CSV.


## Model hyperparameter ##
cut_sample =  2*sampling_rate # using 2 seconds for prediction
class_num = 2
class_name = 'Valence'
activation_func = 'sigmoid'
loss_func = tf.keras.losses.BinaryCrossentropy(from_logits=False) # 'categorical_crossentropy'
epoch_num = 5
batch_num = 64
filter_num = 20
dense_num = 200
final_train_col = 'norm_outlier_removed_ppg' # final data for training

## Data Preprocessing

### Define Function

In [7]:
'''
Split data into input chunk
'''

def extract_data_with_overlap(audio_df_col,window_sec,overlap_sec,sampling_rate=100):
    data_list = []
    window_size = window_sec*sampling_rate  # window size
    overlap = overlap_sec*sampling_rate  # overlap size

    start = 0
    end = start + window_size

    while end <= len(audio_df_col):
        temp_list = list(audio_df_col[start:end])
        data_list.append(temp_list)
        start += window_size - overlap
        end = start + window_size

    return np.array(data_list)

In [8]:
'''
Eliminate noise from the PPG signal, which is outside the range of human heart frequency.
'''

###  Filter
'''
Use chebychef Filter
'''
order = 4  # filter's order
fs = 100  # sampling rate
lowcut = 0.8  # lowcut frequency
highcut = 3.5  # highcut frequency
rp = 1  # Maximum attenuation level of the passband(dB)
rs = 30  # Minimum attenuation level of the stopband (dB)
b, a = signal.cheby1(order, rp, [lowcut, highcut], fs=fs, btype='band')


### Outlier removal
'''
If value of |z-score| > 3.5 then, remove
'''

def outliers_modified_z(dataframe, threshold=3.5):
    data = dataframe.copy()
    median = np.median(data)
    median_absolute_deviation = np.median(np.abs(data - median))
    modified_z_scores = 0.6745 * (data - median) / median_absolute_deviation
    outliers = np.abs(modified_z_scores) > threshold
    data[outliers] = median
    return data

def test_outliers_modified_z(test_dataframe,train_dataframe_data ,threshold=3.5):
    data = test_dataframe.copy()
    median = np.median(train_dataframe_data)
    median_absolute_deviation = np.median(np.abs(data - median))
    modified_z_scores = 0.6745 * (data - median) / median_absolute_deviation
    outliers = np.abs(modified_z_scores) > threshold
    data[outliers] = median
    return data


### Normalization
'''
Apply Standard Normalization
'''
def standard_scaler(data):
    mean = np.mean(data) #
    std = np.std(data) #
    scaled_data = (data - mean) / std
    return scaled_data

def test_standard_scaler(data,audio_df_data):
    mean = np.mean(data) #
    std = np.std(data) #
    scaled_data = (data - mean) / std
    return scaled_data


### Final code
'''
apply Preprocessing to PPG
'''
def Preprocessing(dataframe):
  df = dataframe.copy()
  df['filtered_ppg'] = signal.filtfilt(b, a, df['ppg'])
  df['outlier_removed_ppg'] = outliers_modified_z(df['ppg'])
  df['filtered_and_outlier_removed_ppg'] = outliers_modified_z(df['filtered_ppg'])
  for col in df.columns:
    name = "norm_{}".format(col)
    df[name] = standard_scaler(df[col])

  return df

def Preprocessing_test(test_dataframe,train_dataframe):
  df = test_dataframe.copy()
  df['filtered_ppg'] = signal.filtfilt(b, a, df['ppg'])
  df['outlier_removed_ppg'] = test_outliers_modified_z(df['ppg'],train_dataframe['ppg'])
  df['filtered_and_outlier_removed_ppg'] = outliers_modified_z(df['filtered_ppg'])
  for col in df.columns:
    name = "norm_{}".format(col)
    df[name] =  test_standard_scaler(df[col],train_dataframe[col])

  return df

### Make train data

In [9]:
# Split Data into Train and Test data
train_list = list(range(1,each_emotion_csv_num+1,1))
for num in test_list:
  train_list.remove(num)

In [10]:
##### Train #####

audio_df = pd.DataFrame(columns=['ppg'])

for i in train_list:
  col = "audio_pos_{}".format(i)
  temp = pd.read_csv(original_path + col + '.csv')
  temp = temp.iloc[:3*60*100,1] #
  audio_df = pd.concat([audio_df,temp],axis=0)

for i in train_list:
  col = "audio_neg_{}".format(i)
  temp = pd.read_csv(original_path + col + '.csv')
  temp = temp.iloc[:3*60*100,1]
  audio_df = pd.concat([audio_df,temp],axis=0)

audio_df = audio_df[[0]]
audio_df.columns = ['ppg']

## Preprocessing
audio_df = Preprocessing(audio_df)

## Split Data into chunk
audio_total = pd.DataFrame(columns=audio_df.columns)
for col in audio_df.columns:
  idx = 0
  for i in range(0,chunk_num*(total_csv_num),chunk_num):
    audio_temp = extract_data_with_overlap(audio_df.iloc[i:(i+chunk_num),:][col].values,window_sec,overlap_sec,sampling_rate) ## 각 dataset마다 10초 간격의 데이터를 5초 ovelap하게 하나의 리스트로 뽑음

    for j in range(audio_temp.shape[0]):
      audio_total.loc[j + idx,col] = audio_temp[j]
    idx += audio_temp.shape[0]

## Attach label informatoin
audio_total['Valence'] =[0]*total_chunk_per_csv*len(train_list)+ [1]*total_chunk_per_csv*len(train_list)

## Split train and test set and random suffle
shuffled_train = audio_total.sample(frac=1, random_state=42).reset_index(drop=True)

## Train Model

In [11]:
## Evaluation Metric
from sklearn.metrics import roc_auc_score
def roc_auc(y_true, y_pred):
    auc = tf.py_function(roc_auc_score, (y_true, y_pred), tf.float32)
    return auc

### Model Structure
    - Input - 200 sample
    - Conv1
        - Conv1D, 3*1 convolutional filter, 20 feature maps, stride 1 , padding of size one
        - BN
        - Max-pooling layer with a 2*1 filter
    - Dense1
        - 200 nodes

In [12]:
import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping()

deep_model = {}
for col in audio_total.columns[:-1]:

    ## train set ##
    X_train = np.stack(shuffled_train[col].values,axis=0)
    y_train = to_categorical(shuffled_train[class_name])
    X_train = X_train[:,:cut_sample]

    model = keras.models.Sequential([
      keras.layers.Conv1D(
          input_shape=(cut_sample,1), # 4 rows and 1 columns
          kernel_size=3, # the size of the window or the receptive field
          strides=1, # Sliding size
          filters=filter_num, # The number of filters
          padding="valid",
          name = 'Conv1_CNN'
      ),
      keras.layers.BatchNormalization(name='Conv1_BN'),
      keras.layers.MaxPool1D(
          pool_size=2,
          strides=2,
          name = "Conv1_MaxPool"
      ),
      keras.layers.Flatten(),
      keras.layers.Dense(dense_num, activation='relu'),
      keras.layers.Dense(class_num, activation=activation_func)
    ])

    model.compile(loss = loss_func, optimizer = 'adam', metrics = ['accuracy',"AUC"])
    model.fit(X_train, y_train, batch_size = batch_num, epochs = epoch_num, verbose = 1)
    deep_model[col] = [model]



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Conv1_CNN (Conv1D)          (None, 198, 20)           80        
                                                                 
 Conv1_BN (BatchNormalizatio  (None, 198, 20)          80        
 n)                                                              
                                                                 
 Conv1_MaxPool (MaxPooling1D  (None, 99, 20)           0         
 )                                                               
                                                                 
 flatten_7 (Flatten)         (None, 1980)              0         
                                                                 
 dense_14 (Dense)            (None, 200)               396200    
                                                                 
 dense_15 (Dense)            (None, 2)                

# Test Model

In [14]:
from sklearn.metrics import accuracy_score

for i in test_list:
  print(i,"'s result")
  audio_test = pd.DataFrame(columns=['ppg'])

  col = "audio_pos_{}".format(i)
  temp = pd.read_csv(original_path + col + '.csv')
  temp = temp.iloc[:3*60*100,1] #
  audio_test = pd.concat([audio_test,temp],axis=0)

  col = "audio_neg_{}".format(i)
  temp = pd.read_csv(original_path + col + '.csv')
  temp = temp.iloc[:3*60*100,1]
  audio_test = pd.concat([audio_test,temp],axis=0)

  audio_test = audio_test[[0]]
  audio_test.columns = ['ppg']

  audio_test = Preprocessing_test(audio_test,audio_df)

  # label
  repeat_num = int(minute*60/window_sec)
  label_list = [0]*repeat_num + [1]*repeat_num
  label = pd.DataFrame(label_list,columns=['Valence_binary'])

  for col in deep_model.keys():
    X_test_real = audio_test[col].values.reshape(-1,1000)
    y_test_real = label['Valence_binary']
    y_test_real_onehot = to_categorical(y_test_real)
    # print(X_test_real.shape,y_test_real.shape,y_test_real_onehot.shape)

    model = deep_model[col][0]
    X_test = X_test_real[:,:cut_sample]
    y_test = y_test_real_onehot

    y_predict = model.predict(X_test)

    print('##############',col,'################')
    print("acc : ", accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_predict, axis=1)))
    print("auc : ",roc_auc(y_test,y_predict))
    print("origin label : " ,np.argmax(y_test, axis=1))
    print("predict label : ",np.argmax(y_predict, axis=1))
    print()

  print()
  print()

5 's result
############## ppg ################
acc :  0.5
auc :  tf.Tensor(0.5, shape=(), dtype=float32)
origin label :  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
predict label :  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

############## filtered_ppg ################
acc :  0.3611111111111111
auc :  tf.Tensor(0.33487654, shape=(), dtype=float32)
origin label :  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
predict label :  [1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 0 0 1 0 1 0 1 1 1 0 1 0 0]

############## outlier_removed_ppg ################
acc :  0.5
auc :  tf.Tensor(0.5, shape=(), dtype=float32)
origin label :  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
predict label :  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

############## filtered_and_outlier_removed_ppg ################
acc :  0.3611111111111111
auc :  tf.Tensor(0.253



############## norm_ppg ################
acc :  0.3611111111111111
auc :  tf.Tensor(0.31635803, shape=(), dtype=float32)
origin label :  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
predict label :  [1 1 1 1 1 1 0 0 1 0 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 1]





############## norm_filtered_ppg ################
acc :  0.3333333333333333
auc :  tf.Tensor(0.35339507, shape=(), dtype=float32)
origin label :  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
predict label :  [1 1 1 1 1 1 1 0 1 1 0 1 0 0 1 1 0 1 1 1 0 1 0 0 0 1 0 1 0 1 1 0 0 0 0 0]

############## norm_outlier_removed_ppg ################
acc :  0.3888888888888889
auc :  tf.Tensor(0.3904321, shape=(), dtype=float32)
origin label :  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
predict label :  [1 1 0 1 0 0 1 0 0 0 1 0 1 1 0 0 0 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0]

############## norm_filtered_and_outlier_removed_ppg ################
acc :  0.3055555555555556
auc :  tf.Tensor(0.23765433, shape=(), dtype=float32)
origin label :  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
predict label :  [1 1 1 1 0 0 1 0 1 1 1 0 0 1 1 1 1 1 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0]



6 's result
############## ppg #####

In [15]:
## Results ##

'''

all is not Good..

But, we finalize the system

'''

'\n\nall is not Good..\n\nBut, we finalize the system\n\n'

# Make Train model for Arduino

## Update hyperparamaters for Arduino

In [16]:
filter_num = 5 # before: 20
dense_num = 50 # before: 200

## Prepare dataset and train a model

In [17]:
# Split Data into Train and Test data

train_list = list(range(1,each_emotion_csv_num+1,1))

##### Train #####

audio_df = pd.DataFrame(columns=['ppg'])

for i in train_list:
  col = "audio_pos_{}".format(i)
  temp = pd.read_csv(original_path + col + '.csv')
  temp = temp.iloc[:3*60*100,1] #
  audio_df = pd.concat([audio_df,temp],axis=0)

for i in train_list:
  col = "audio_neg_{}".format(i)
  temp = pd.read_csv(original_path + col + '.csv')
  temp = temp.iloc[:3*60*100,1]
  audio_df = pd.concat([audio_df,temp],axis=0)

audio_df = audio_df[[0]]
audio_df.columns = ['ppg']

## Preprocessing
audio_df = Preprocessing(audio_df)

## Split Data into chunk
audio_total = pd.DataFrame(columns=audio_df.columns)
for col in audio_df.columns:
  idx = 0
  for i in range(0,chunk_num*(total_csv_num),chunk_num):
    audio_temp = extract_data_with_overlap(audio_df.iloc[i:(i+chunk_num),:][col].values,window_sec,overlap_sec,sampling_rate) ## 각 dataset마다 10초 간격의 데이터를 5초 ovelap하게 하나의 리스트로 뽑음

    for j in range(audio_temp.shape[0]):
      audio_total.loc[j + idx,col] = audio_temp[j]
    idx += audio_temp.shape[0]

## Attach label informatoin
audio_total['Valence'] =[0]*total_chunk_per_csv*len(train_list)+ [1]*total_chunk_per_csv*len(train_list)

## Split train and test set and random suffle
shuffled_train = audio_total.sample(frac=1, random_state=42).reset_index(drop=True)



import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping()


## train set ##

X_train = np.stack(shuffled_train[final_train_col].values,axis=0)
y_train = to_categorical(shuffled_train[class_name])
X_train = X_train[:,:cut_sample]

model = keras.models.Sequential([
  keras.layers.Conv1D(
          input_shape=(cut_sample,1), # 4 rows and 1 columns
          kernel_size=3, # the size of the window or the receptive field
          strides=1, # Sliding size
          filters=filter_num, # The number of filters
          padding="valid",
          name = 'Conv1_CNN'
      ),
      keras.layers.BatchNormalization(name='Conv1_BN'),
      keras.layers.MaxPool1D(
          pool_size=2,
          strides=2,
          name = "Conv1_MaxPool"
      ),
      keras.layers.Flatten(),
      keras.layers.Dense(dense_num, activation='relu'),
      keras.layers.Dense(class_num, activation=activation_func)
    ])

model.compile(loss = loss_func, optimizer = 'adam', metrics = ['accuracy',"AUC"])
model.fit(X_train, y_train, batch_size = batch_num, epochs = epoch_num, verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb6bbeca950>

# Save the model

In [18]:
import os

MODELS_DIR = PROJECT_DIR + "Models/"
MODEL_TF = MODELS_DIR + "model.keras"

os.makedirs(MODELS_DIR, exist_ok=True)
model.save(MODEL_TF)

In [19]:
mean = np.mean(audio_df['ppg'])
std = np.std(audio_df['ppg'])
median = np.median(audio_df['ppg'])
median_absolute_deviation = np.median(np.abs(audio_df['ppg'] - median))

print("median : ", median) # for outlier removal
print("MAD : ", median_absolute_deviation)
print("mean : ", mean) #
print("standard deviation: ", std)

median :  507.0
MAD :  3.0
mean :  508.96049074074074
standard deviation:  7.95560041149137
