<a href="https://colab.research.google.com/github/Locrian24/csc421-project-stress-classification/blob/main/csc421_project_dataset_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset Generation for Stress Detection with CNN


## Retrieving dataset

In [1]:
!wget -O wesad.zip https://uni-siegen.sciebo.de/s/HGdUkoNlW1Ub0Gx/download
!unzip wesad.zip

--2022-04-09 02:14:10--  https://uni-siegen.sciebo.de/s/HGdUkoNlW1Ub0Gx/download
Resolving uni-siegen.sciebo.de (uni-siegen.sciebo.de)... 128.176.1.2
Connecting to uni-siegen.sciebo.de (uni-siegen.sciebo.de)|128.176.1.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2249444501 (2.1G) [application/zip]
Saving to: ‘wesad.zip’


2022-04-09 02:15:57 (20.1 MB/s) - ‘wesad.zip’ saved [2249444501/2249444501]

Archive:  wesad.zip
   creating: WESAD/
   creating: WESAD/S10/
  inflating: WESAD/S10/S10.pkl       
  inflating: WESAD/S10/S10_E4_Data.zip  
  inflating: WESAD/S10/S10_quest.csv  
  inflating: WESAD/S10/S10_readme.txt  
  inflating: WESAD/S10/S10_respiban.txt  
   creating: WESAD/S11/
  inflating: WESAD/S11/S11.pkl       
  inflating: WESAD/S11/S11_E4_Data.zip  
  inflating: WESAD/S11/S11_quest.csv  
  inflating: WESAD/S11/S11_readme.txt  
  inflating: WESAD/S11/S11_respiban.txt  
   creating: WESAD/S13/
  inflating: WESAD/S13/S13.pkl       
  inflating: WESAD

In [2]:
%%capture
!pip install webdataset

In [3]:
# RespiBAN signals are used for the CNN data; Empatica E4 signals are used for the MLP-NN

In [4]:
import numpy as np
import pandas as pd
import webdataset as wds
import os
from sklearn.model_selection import train_test_split

In [5]:
from scipy import stats

def generate_sample(data, i, ignore_labels=(), window=3500):
  """Cut the i-th fixed-length window out of one subject's chest recording.

  Args:
    data: Mapping where data['signal']['chest'] holds the arrays 'ECG', 'EDA',
      'EMG', 'Temp', 'Resp' and 'ACC' (ACC is indexed as [time, axis] and its
      first three axes are used), and data['label'] holds one label per time step.
    i: Zero-based window index; the window covers time steps
      [i * window, (i + 1) * window). The caller must make sure the window
      lies fully inside the recording.
    ignore_labels: Label values that are discarded before the majority vote.
    window: Number of time steps per sample. Defaults to 3500
      (presumably 5 s at the chest device's sampling rate -- TODO confirm).

  Returns:
    A (sample, label) tuple. `sample` is a float array of shape (8, window)
    whose rows are ECG, EDA, EMG, Temp, Resp, ACC-x, ACC-y, ACC-z. `label` is
    the most frequent non-ignored label in the window (the smallest value wins
    a tie), or the sentinel 999 when every label in the window is ignored.
  """
  chest = data['signal']['chest']
  start = i * window
  stop = start + window

  sample = np.empty((8, window))
  for row, key in enumerate(('ECG', 'EDA', 'EMG', 'Temp', 'Resp')):
    sample[row] = np.squeeze(chest[key][start:stop])
  # ACC is a single [time, axis] array; its x/y/z columns become the last three rows.
  sample[5:8] = chest['ACC'][start:stop, :3].T

  window_labels = np.asarray(data['label'][start:stop])
  kept = window_labels[np.isin(window_labels, ignore_labels, invert=True)]
  if kept.size == 0:
    return sample, 999

  # Majority vote without scipy.stats.mode, whose return shape differs between
  # SciPy versions. np.unique returns sorted values and argmax picks the first
  # maximum, so ties resolve to the smallest label, exactly as mode() does.
  values, counts = np.unique(kept, return_counts=True)
  return sample, values[np.argmax(counts)].item()

In [6]:
# Generate binary stress classification dataset
# Every subject recording is cut into 3500-step windows, each window is labelled by
# generate_sample, and the windows are split 70/30 (stratified by label) into two
# WebDataset archives.
# NOTE(review): only labels 5/6/7 are dropped here, so every other label value is kept;
# presumably the consumer collapses them into stress / non-stress -- confirm.

all_samples = []
labels = []
ignore_labels = [5,6,7] # Used for binary classification

# Sorted so that the sample order (and therefore the seeded split below) does not
# depend on the order in which the filesystem lists the subject folders.
dirs = sorted(next(os.walk('./WESAD'))[1])
for subject in dirs: # named `subject` because `dir` would shadow the builtin
  # NOTE: unpickling can execute arbitrary code; this relies on the downloaded archive being trusted.
  data = pd.read_pickle(f'./WESAD/{subject}/{subject}.pkl')
  num_samples = data['label'].shape[0] // 3500 # a trailing partial window is dropped

  for i in range(num_samples):
    sample, label = generate_sample(data, i, ignore_labels)

    if label in ignore_labels or label == 999: # Ignore meaningless samples (999 = no usable label in window)
      continue

    labels.append(label)
    all_samples.append(sample)

# Fixed random_state so that re-running the notebook reproduces the same train/test membership.
train_X, test_X, train_y, test_y = train_test_split(
    all_samples, labels, test_size=.3, train_size=.7,
    shuffle=True, stratify=labels, random_state=42)

# Write WebDataset tar files
for tar_path, split_X, split_y in (
    ("WESAD_RespiBAN_binary_train.tar", train_X, train_y),
    ("WESAD_RespiBAN_binary_test.tar", test_X, test_y),
):
  # The context manager closes the archive even if a write raises.
  with wds.TarWriter(tar_path) as sink:
    for i, (split_sample, split_label) in enumerate(zip(split_X, split_y)):
      sink.write({
          "__key__": "sample%06d" % i,
          "input.npy": split_sample,
          "label.cls": split_label
      })

In [7]:
# Generate 3-class emotion classification dataset
# Every subject recording is cut into 3500-step windows, each window is labelled by
# generate_sample, and the windows are split 70/30 (stratified by label) into two
# WebDataset archives.

all_samples = []
labels = []
ignore_labels = [0,4,5,6,7] # Only 1, 2, 3 useful for emotion classification

# Sorted so that the sample order (and therefore the seeded split below) does not
# depend on the order in which the filesystem lists the subject folders.
dirs = sorted(next(os.walk('./WESAD'))[1])
for subject in dirs: # named `subject` because `dir` would shadow the builtin
  # NOTE: unpickling can execute arbitrary code; this relies on the downloaded archive being trusted.
  data = pd.read_pickle(f'./WESAD/{subject}/{subject}.pkl')
  num_samples = data['label'].shape[0] // 3500 # a trailing partial window is dropped

  for i in range(num_samples):
    sample, label = generate_sample(data, i, ignore_labels)

    if label in ignore_labels or label == 999: # Ignore meaningless samples (999 = no usable label in window)
      continue

    labels.append(label)
    all_samples.append(sample)

# Fixed random_state so that re-running the notebook reproduces the same train/test membership.
train_X, test_X, train_y, test_y = train_test_split(
    all_samples, labels, test_size=.3, train_size=.7,
    shuffle=True, stratify=labels, random_state=42)

# Write WebDataset tar files
for tar_path, split_X, split_y in (
    ("WESAD_RespiBAN_multiclass_train.tar", train_X, train_y),
    ("WESAD_RespiBAN_multiclass_test.tar", test_X, test_y),
):
  # The context manager closes the archive even if a write raises.
  with wds.TarWriter(tar_path) as sink:
    for i, (split_sample, split_label) in enumerate(zip(split_X, split_y)):
      sink.write({
          "__key__": "sample%06d" % i,
          "input.npy": split_sample,
          "label.cls": split_label
      })

Run the following cells to copy the generated dataset archives to your Google Drive, if needed.

In [8]:
# Colab-specific: mount Google Drive under /content/drive so the archives can be copied there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Copy the four generated WebDataset archives into /content/drive/MyDrive/
# (requires Google Drive to be mounted at /content/drive).
!cp WESAD_RespiBAN_binary_train.tar /content/drive/MyDrive/
!cp WESAD_RespiBAN_binary_test.tar /content/drive/MyDrive/
!cp WESAD_RespiBAN_multiclass_train.tar /content/drive/MyDrive/
!cp WESAD_RespiBAN_multiclass_test.tar /content/drive/MyDrive/

In [10]:
# Remove the extracted WESAD folder and the downloaded zip.
# The dataset-generation cells read from ./WESAD, so the download cell must be
# re-run before they can be executed again.
!rm -r WESAD
!rm wesad.zip

In [11]:
# Re-open the multiclass training archive so the written samples can be inspected.
# Each item is yielded as an ("input.npy", "label.cls") tuple, in that order.
multiclass_train_tar = 'WESAD_RespiBAN_multiclass_train.tar'
decoded_samples = wds.WebDataset(multiclass_train_tar).decode()
dataset = decoded_samples.to_tuple("input.npy", "label.cls")

In [12]:
# Sanity check: every label stored in the multiclass training archive should be 1, 2 or 3.
# Any other label is printed, and the number of offenders is shown as the cell result
# (expected: 0).
valid_labels = (1, 2, 3)
unexpected_count = 0
# `signals` rather than `input`, which would shadow the builtin for the rest of the session.
for signals, label in dataset:
  if label not in valid_labels:
    unexpected_count += 1
    print(label)

unexpected_count

0