<a href="https://colab.research.google.com/github/Locrian24/csc421-project-stress-classification/blob/main/csc421_project_dataset_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset Generation for Stress Detection with CNN


## Retrieving dataset

In [None]:
# Download the WESAD archive (the server reports roughly 2.1 GB) as wesad.zip,
# then extract it; the archive unpacks into a ./WESAD directory with one
# sub-folder per subject.
!wget -O wesad.zip https://uni-siegen.sciebo.de/s/HGdUkoNlW1Ub0Gx/download
!unzip wesad.zip

--2022-04-08 22:52:00--  https://uni-siegen.sciebo.de/s/HGdUkoNlW1Ub0Gx/download
Resolving uni-siegen.sciebo.de (uni-siegen.sciebo.de)... 128.176.1.2
Connecting to uni-siegen.sciebo.de (uni-siegen.sciebo.de)|128.176.1.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2249444501 (2.1G) [application/zip]
Saving to: ‘wesad.zip’


2022-04-08 22:53:17 (28.1 MB/s) - ‘wesad.zip’ saved [2249444501/2249444501]

Archive:  wesad.zip
   creating: WESAD/
   creating: WESAD/S10/
  inflating: WESAD/S10/S10.pkl       
  inflating: WESAD/S10/S10_E4_Data.zip  
  inflating: WESAD/S10/S10_quest.csv  
  inflating: WESAD/S10/S10_readme.txt  
  inflating: WESAD/S10/S10_respiban.txt  
   creating: WESAD/S11/
  inflating: WESAD/S11/S11.pkl       
  inflating: WESAD/S11/S11_E4_Data.zip  
  inflating: WESAD/S11/S11_quest.csv  
  inflating: WESAD/S11/S11_readme.txt  
  inflating: WESAD/S11/S11_respiban.txt  
   creating: WESAD/S13/
  inflating: WESAD/S13/S13.pkl       
  inflating: WESAD

In [None]:
!pip install webdataset

Collecting webdataset
  Downloading webdataset-0.2.5-py3-none-any.whl (46 kB)
[?25l[K     |███████                         | 10 kB 19.4 MB/s eta 0:00:01[K     |██████████████                  | 20 kB 11.2 MB/s eta 0:00:01[K     |█████████████████████           | 30 kB 9.3 MB/s eta 0:00:01[K     |████████████████████████████    | 40 kB 4.4 MB/s eta 0:00:01[K     |████████████████████████████████| 46 kB 2.3 MB/s 
[?25hCollecting braceexpand
  Downloading braceexpand-0.1.7-py2.py3-none-any.whl (5.9 kB)
Installing collected packages: braceexpand, webdataset
Successfully installed braceexpand-0.1.7 webdataset-0.2.5


In [None]:
# RespiBAN (chest) signals are used for the CNN dataset; Empatica E4 signals are used for the MLP-NN.

In [None]:
import numpy as np
import pandas as pd
import webdataset as wds
import os

In [None]:
from scipy import stats

def generate_sample(data, i, window=3500):
  """Extract the i-th fixed-length window of chest signals and its label.

  Args:
    data: Mapping that provides data['signal']['chest'][key] arrays for the
      keys 'ECG', 'EDA', 'EMG', 'Temp', 'Resp' and 'ACC' (the first three
      columns of 'ACC' are read), plus a per-row data['label'] array.
    i: Zero-based window index; rows [i * window, (i + 1) * window) are used.
    window: Number of rows per sample. Defaults to 3500.

  Returns:
    A (sample, label) tuple. `sample` is an (8, window) float array whose rows
    are ECG, EDA, EMG, Temp, Resp, ACC-x, ACC-y, ACC-z. `label` is the most
    frequent label value below 5 inside the window (the smallest value wins a
    tie), or 999 when the window holds no label below 5.
  """
  scalar_keys = ['ECG', 'EDA', 'EMG', 'Temp', 'Resp']
  chest = data['signal']['chest']

  start = i * window
  stop = start + window

  sample = np.empty((len(scalar_keys) + 3, window))
  for row, key in enumerate(scalar_keys):
    sample[row] = np.squeeze(chest[key][start:stop])

  # The three accelerometer axes fill the last three rows: (window, 3) -> (3, window).
  sample[len(scalar_keys):] = chest['ACC'][start:stop, 0:3].T

  window_labels = np.asarray(data['label'][start:stop])
  valid = window_labels[window_labels < 5]

  if valid.size == 0:
    # Sentinel for "no usable label in this window"; callers drop these samples.
    return sample, 999

  # Majority vote via np.unique instead of scipy.stats.mode, whose return shape
  # differs between SciPy releases. np.unique sorts the values, so argmax picks
  # the smallest value on ties.
  values, counts = np.unique(valid, return_counts=True)
  label = values[np.argmax(counts)]

  return sample, label

In [None]:
all_samples = []
labels = []
# Sort the subject folders so the sample order does not depend on the order in
# which the filesystem happens to list them.
dirs = sorted(next(os.walk('./WESAD'))[1])
for subject in dirs:
  # NOTE: read_pickle unpickles the file, which can execute arbitrary code;
  # only run this on the dataset archive downloaded from the trusted source.
  data = pd.read_pickle(f'./WESAD/{subject}/{subject}.pkl')
  # Integer division drops the trailing partial window of each recording.
  num_samples = data['label'].shape[0] // 3500

  for i in range(num_samples):
    sample, label = generate_sample(data, i)

    # generate_sample returns 999 when a window has no label below 5.
    if label >= 5: # Ignore meaningless samples
      continue

    labels.append(label)
    all_samples.append(sample)

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 split. The fixed random_state makes the shuffle, and
# therefore the contents of the written train/test archives, reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    all_samples,
    labels,
    test_size=.3,
    train_size=.7,
    shuffle=True,
    stratify=labels,
    random_state=42,
)

In [None]:
# Sanity check: display the label of the first training sample.
train_y[0]

0

In [None]:
def write_shard(path, samples, targets):
  """Write paired samples and labels to a WebDataset tar archive.

  Args:
    path: Destination .tar file name.
    samples: Sequence of arrays, stored under the "input.npy" entry.
    targets: Sequence of class labels, stored under the "label.cls" entry;
      must line up one-to-one with `samples`.
  """
  # The context manager closes the archive even if a write raises.
  with wds.TarWriter(path) as sink:
    for i, (sample, target) in enumerate(zip(samples, targets)):
      sink.write({
          "__key__": "sample%06d" % i,
          "input.npy": sample,
          "label.cls": target,
      })


write_shard("WESAD_RespiBAN_train.tar", train_X, train_y)
write_shard("WESAD_RespiBAN_test.tar", test_X, test_y)

Run the following cells to copy the generated train/test archives to your Google Drive, if needed.

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package,
# i.e. a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy both generated archives into the root of the mounted Drive.
!cp WESAD_RespiBAN_train.tar /content/drive/MyDrive/
!cp WESAD_RespiBAN_test.tar /content/drive/MyDrive/

In [None]:
# Clean up: delete the extracted WESAD directory and the downloaded archive.
# The generated WESAD_RespiBAN_*.tar files are not touched.
!rm -r WESAD
!rm wesad.zip

In [None]:
# Re-open the training archive and expose each record as an (input, label) tuple.
# NOTE(review): "pil" names an image decoder; presumably the .npy and .cls
# entries are handled by webdataset's built-in decoders regardless — confirm.
dataset = wds.WebDataset("./WESAD_RespiBAN_train.tar").decode("pil").to_tuple("input.npy", "label.cls")

In [None]:
from collections import Counter

# Summarise the labels read back from the archive instead of printing one line
# per sample, which floods the cell output. The loop variable also no longer
# shadows the builtin `input`.
label_counts = Counter(label for _, label in dataset)
label_counts

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1
1
4
4
0
1
2
4
2
0
2
1
4
0
0
0
0
1
2
0
2
0
1
2
0
1
4
3
1
0
3
4
4
0
0
1
4
0
1
2
1
0
4
0
0
4
0
1
0
1
0
1
1
0
4
1
0
1
2
1
3
0
4
4
1
0
0
1
1
0
0
1
0
0
0
2
2
4
1
1
2
1
0
4
0
0
2
1
1
0
0
4
1
4
0
0
0
0
1
4
2
0
4
1
3
1
0
0
4
0
0
0
0
0
2
0
1
1
1
1
0
0
0
1
1
0
0
0
0
1
0
2
1
1
4
2
0
3
2
1
0
0
1
0
0
0
0
0
4
0
0
3
1
1
0
4
0
4
0
2
0
1
0
0
4
0
3
0
3
4
3
1
1
2
0
0
0
1
0
0
1
0
2
0
4
4
2
4
0
0
1
0
0
0
1
0
4
3
0
4
0
4
1
3
0
2
3
1
0
4
2
2
4
1
0
0
0
1
1
4
0
0
0
4
4
0
0
1
3
1
4
0
0
0
4
0
1
3
3
0
0
1
0
0
2
0
1
0
0
0
2
2
3
0
4
0
4
0
0
0
0
0
0
4
1
0
4
0
1
1
1
0
0
4
0
1
4
1
4
0
4
2
0
0
3
1
1
0
0
1
1
4
1
2
0
0
0
0
2
4
0
0
0
1
0
1
4
2
0
4
0
1
0
0
0
2
0
2
0
0
2
0
2
0
2
2
3
0
1
0
1
0
0
3
2
2
0
0
0
0
3
0
0
0
4
4
1
4
0
0
1
0
1
0
2
1
0
0
0
2
0
0
2
0
2
2
2
1
1
0
4
3
4
0
4
4
0
0
0
0
1
0
0
1
4
0
0
0
2
1
3
0
2
2
4
1
0
2
1
4
0
3
1
1
2
0
3
4
0
2
0
0
2
0
0
1
0
4
1
0
1
1
2
0
1
0
0
4
0
1
0
0
1
2
0
0
1
0
4
0
1
1
3
4
0
2
0
0
2
3
1
3
0
0
0
0
0
2
4
1
1
0
0
1
0
0
2
3