In [39]:

import os
import pickle
import time
import numpy as np
import pandas as pd
import neurokit2 as nk #Special library for bio-signal processing (Pip install this before running the notebook)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Point to one subject .pkl
base_dir = "C:\FILE IQBAL\KULIAH_MDSI\MDSI\SEMESTER 4\CAPSTONE PROJECT\Modelling Stage\CAPSTONE-MODEL\data\WESAD" ## replace this directory to your base directory for WESAD Folder
# subject_id = "S2"
# pkl_path = os.path.join(base_dir, subject_id, f"{subject_id}.pkl")

LABEL_MAP = {
    0: "Undefined/Transition",
    1: "Baseline",
    2: "Stress",
    3: "Amusement",
    4: "Meditation",
    5: "ignored_1",
    6: "ignored_2",
    7: "ignored_3"
}


In [40]:
# load all the data

all_data = {}
counter_rows = 0


for a in range(2, 18):
    
    if a == 12:
        continue
    
    subject = f"S{a}"
    pkl_path = os.path.join(base_dir, subject, f"{subject}.pkl")
    
    print(f"Loading data for {subject} from {pkl_path}")
    
    with open(pkl_path, "rb") as file:
        data = pickle.load(file, encoding='latin1')
        all_data[subject] = data
    
    print(f"Data for {subject} loaded. Keys: {list(data.keys())} ; Devices: {list(data['signal'].keys())}; Labels shape: {data['label'].shape}")

    print("*" * 40)
    
    counter_rows += data['label'].shape[0]
    
print(f"Total rows across all subjects (excluding S12): {counter_rows}")

Loading data for S2 from C:\FILE IQBAL\KULIAH_MDSI\MDSI\SEMESTER 4\CAPSTONE PROJECT\Modelling Stage\CAPSTONE-MODEL\data\WESAD\S2\S2.pkl
Data for S2 loaded. Keys: ['signal', 'label', 'subject'] ; Devices: ['chest', 'wrist']; Labels shape: (4255300,)
****************************************
Loading data for S3 from C:\FILE IQBAL\KULIAH_MDSI\MDSI\SEMESTER 4\CAPSTONE PROJECT\Modelling Stage\CAPSTONE-MODEL\data\WESAD\S3\S3.pkl
Data for S3 loaded. Keys: ['signal', 'label', 'subject'] ; Devices: ['chest', 'wrist']; Labels shape: (4545100,)
****************************************
Loading data for S4 from C:\FILE IQBAL\KULIAH_MDSI\MDSI\SEMESTER 4\CAPSTONE PROJECT\Modelling Stage\CAPSTONE-MODEL\data\WESAD\S4\S4.pkl
Data for S4 loaded. Keys: ['signal', 'label', 'subject'] ; Devices: ['chest', 'wrist']; Labels shape: (4496100,)
****************************************
Loading data for S5 from C:\FILE IQBAL\KULIAH_MDSI\MDSI\SEMESTER 4\CAPSTONE PROJECT\Modelling Stage\CAPSTONE-MODEL\data\WESAD\S5\S

In [41]:
# required functions to build dataframe from pickle data

def safe_get_signal(sig_entry):
    """Return np.ndarray regardless of dict/wrapper structure."""
    if isinstance(sig_entry, dict) and "signal" in sig_entry:
        return np.asarray(sig_entry["signal"])
    return np.asarray(sig_entry)

def to_series(arr, fs, name):
    t = pd.to_timedelta(np.arange(len(arr)) / fs, unit="s")
    s = pd.Series(np.asarray(arr).squeeze(), index=t, name=name)
    s.index.name = "time"
    return s

def extract_wrist_df(data) -> pd.DataFrame:
    e4 = data["signal"]["wrist"]
    parts = []
    if "EDA" in e4:
        arr = safe_get_signal(e4["EDA"]); fs = e4["EDA"]["sampling_rate"] if isinstance(e4["EDA"], dict) else 4
        parts.append(to_series(arr, fs, "W_EDA"))
    if "TEMP" in e4:
        arr = safe_get_signal(e4["TEMP"]); fs = e4["TEMP"]["sampling_rate"] if isinstance(e4["TEMP"], dict) else 4
        parts.append(to_series(arr, fs, "W_TEMP"))
    if "BVP" in e4:
        arr = safe_get_signal(e4["BVP"]); fs = e4["BVP"]["sampling_rate"] if isinstance(e4["BVP"], dict) else 64
        parts.append(to_series(arr, fs, "W_BVP"))
    if "ACC" in e4:
        arr = safe_get_signal(e4["ACC"]); fs = e4["ACC"]["sampling_rate"] if isinstance(e4["ACC"], dict) else 32
        for i, ax in enumerate(["X","Y","Z"]):
            parts.append(to_series(arr[:, i], fs, f"W_ACC_{ax}"))
    return pd.concat(parts, axis=1).sort_index()

def resample_df(df: pd.DataFrame, target_fs: float) -> pd.DataFrame:
    """Resample each column to target_fs using NeuroKit2's signal_resample.
       Keeps a TimedeltaIndex (elapsed time)."""
    if df.empty:
        return df
    # infer original fs from TimedeltaIndex
    dt = (df.index[1] - df.index[0]).total_seconds()
    orig_fs = 1.0 / dt
    target_length = int(np.round(len(df) * (target_fs / orig_fs)))
    out_cols = {}
    for col in df.columns:
        x = df[col].interpolate(limit_direction="both").to_numpy()
        y = nk.signal_resample(
            x,
            sampling_rate=orig_fs,
            desired_length=target_length,
            method="numpy"
        )
        out_cols[col] = y
    # TimedeltaIndex (elapsed time)
    t = pd.to_timedelta(np.arange(target_length) / target_fs, unit="s")
    return pd.DataFrame(out_cols, index=t).rename_axis("time")

In [42]:
# load all the data for wrist device in a dataframe

wrist_data = {}

for subject in all_data.keys():
    print(f"Processing wrist data for {subject}...")
    start_time = time.time()
    wrist_df = extract_wrist_df(all_data[subject])
    wrist_df = resample_df(wrist_df, target_fs=4.0)  # Resample to 4 Hz
    wrist_data[subject] = wrist_df
    end_time = time.time()
    print(f"Processed {wrist_df.shape[0]} rows for {subject} in {end_time - start_time:.2f} seconds.")
    print("-" * 40)

Processing wrist data for S2...
Processed 24316 rows for S2 in 0.47 seconds.
----------------------------------------
Processing wrist data for S3...
Processed 25972 rows for S3 in 0.50 seconds.
----------------------------------------
Processing wrist data for S4...
Processed 25692 rows for S4 in 0.51 seconds.
----------------------------------------
Processing wrist data for S5...
Processed 25032 rows for S5 in 0.48 seconds.
----------------------------------------
Processing wrist data for S6...
Processed 28284 rows for S6 in 0.55 seconds.
----------------------------------------
Processing wrist data for S7...
Processed 20952 rows for S7 in 0.41 seconds.
----------------------------------------
Processing wrist data for S8...
Processed 21864 rows for S8 in 0.43 seconds.
----------------------------------------
Processing wrist data for S9...
Processed 20892 rows for S9 in 0.40 seconds.
----------------------------------------
Processing wrist data for S10...
Processed 21984 rows fo

In [43]:
# get the labels for each subject

def labels_series(data, fs_guess=700):
    y = np.asarray(data["label"]).squeeze()
    s = to_series(y, fs_guess, "label_raw")
    return s.astype(int)

def resample_series(s: pd.Series, target_fs: float) -> pd.Series:
    """Resample a series to target_fs; keeps TimedeltaIndex (elapsed time)."""
    if s.empty:
        return s
    dt = (s.index[1] - s.index[0]).total_seconds()
    orig_fs = 1.0 / dt
    target_length = int(np.round(len(s) * (target_fs / orig_fs)))
    x = s.interpolate(limit_direction="both").to_numpy()
    y = nk.signal_resample(
        x,
        sampling_rate=orig_fs,
        desired_length=target_length,
        method="numpy"
    )
    t = pd.to_timedelta(np.arange(target_length) / target_fs, unit="s")
    out = pd.Series(y, index=t, name=s.name)
    out.index.name = "time"
    return out

In [44]:
# get the labels for each subject and then merge it with the wrist data in the wrist_data dictionary

for subject in all_data.keys():
    print(f"Processing labels for {subject}...")
    start_time = time.time()
    label_s = labels_series(all_data[subject], fs_guess=700)
    label_s = resample_series(label_s, target_fs=4.0)  # Resample to 4 Hz
    # Merge with wrist data
    if subject in wrist_data:
        wrist_df = wrist_data[subject]
        merged_df = wrist_df.join(label_s, how="inner")
        wrist_data[subject] = merged_df
        print(f"Merged data shape for {subject}: {merged_df.shape}")
    else:
        print(f"No wrist data found for {subject}, skipping merge.")
    end_time = time.time()
    print(f"Processed labels for {subject} in {end_time - start_time:.2f} seconds.")
    print("-" * 40)

Processing labels for S2...
Merged data shape for S2: (24306, 7)
Processed labels for S2 in 0.84 seconds.
----------------------------------------
Processing labels for S3...
Merged data shape for S3: (25962, 7)
Processed labels for S3 in 0.88 seconds.
----------------------------------------
Processing labels for S4...
Merged data shape for S4: (25682, 7)
Processed labels for S4 in 0.89 seconds.
----------------------------------------
Processing labels for S5...
Merged data shape for S5: (25022, 7)
Processed labels for S5 in 0.88 seconds.
----------------------------------------
Processing labels for S6...
Merged data shape for S6: (28273, 7)
Processed labels for S6 in 0.99 seconds.
----------------------------------------
Processing labels for S7...
Merged data shape for S7: (20944, 7)
Processed labels for S7 in 0.74 seconds.
----------------------------------------
Processing labels for S8...
Merged data shape for S8: (21855, 7)
Processed labels for S8 in 0.81 seconds.
------------

In [45]:
# extract all the dataframe from the dictionary and add the subject id as a column and concatenate them into a single dataframe

final_df = pd.concat([df.assign(subject=subject) for subject, df in wrist_data.items()], ignore_index=False)

In [46]:
# rename and convert the label raw to integer

final_df = final_df.rename(columns={"label_raw": "label"})
final_df['label'] = final_df['label'].astype(int)

In [47]:
final_df.to_csv("final_wesad_data.csv", index=True)

In [48]:
# get the statistics descriptive of the final dataframe based on the label

final_df.groupby('label').describe().T

Unnamed: 0,label,0,1,2,3,4,5,6,7
W_EDA,count,157967.0,70416.0,39850.0,22291.0,47206.0,3154.0,3158.0,3293.0
W_EDA,mean,1.794536,1.329581,3.413854,1.365635,1.398314,1.11244,2.433374,1.545004
W_EDA,std,2.23214,1.743986,3.656135,1.690812,1.68607,1.132837,2.826772,1.927473
W_EDA,min,0.05152,0.074584,0.288098,0.129681,0.142495,0.115392,0.311125,0.110275
W_EDA,25%,0.371227,0.276568,1.035374,0.320079,0.3188,0.344633,0.829307,0.306505
W_EDA,50%,0.884404,0.396544,2.135508,0.445499,0.473646,0.494831,0.949654,0.566975
W_EDA,75%,2.245584,1.576861,3.859958,1.794991,1.54284,1.538684,2.670214,2.046943
W_EDA,max,14.241592,9.165894,15.921474,8.226128,7.844654,4.111705,10.628916,8.128229
W_TEMP,count,157967.0,70416.0,39850.0,22291.0,47206.0,3154.0,3158.0,3293.0
W_TEMP,mean,32.423741,33.348293,32.641449,32.587728,32.007695,33.459204,32.217312,32.511251


In [49]:
# drop 5, 6, 7 from the label

final_df = final_df[~final_df['label'].isin([5, 6, 7,0,4])]

In [50]:
# decide the columns to be used as features and the target

feature_columns = ['W_EDA',
 'W_TEMP',
 'W_BVP',
 'W_ACC_X',
 'W_ACC_Y',
 'W_ACC_Z']
target_column = 'label'

In [51]:
# get the X and y for the model

X = final_df[feature_columns]
y = final_df[target_column]

In [52]:
# dict to store the experiment results

experiment_results = {}

In [53]:
# build a function to evaluate the model
# the evaluation metrics will be accuracy, precision, recall, f1-score, and confusion matrix
# the function will take the model, X, and y
# the function will first split the data into train, val and test set using train_test_split and stratify based on the label
# the function will also get name input to define the expeirment name for saving the model and the report
# all of the result will be stored in a dictionary

def evaluate_model(model, X, y, name):
    # get model name
    
    model_name = model.__class__.__name__
    
    # Split the data into train (70%), val (15%), and test (15%)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)
    
    # Fit the model on the training data
    model.fit(X_train, y_train)
    
    # predict on y_train, y_val, and y_test
    
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)
    
    # calculate evaluation metrics for each of the train, val, and test set

    experiment_results[f"{name}_{model_name}"] = {
        "train_metrics": {
            "accuracy": accuracy_score(y_train, y_train_pred),
            "precision": precision_score(y_train, y_train_pred, average='weighted', zero_division=0),
            "recall": recall_score(y_train, y_train_pred, average='weighted', zero_division=0),
            "f1_score": f1_score(y_train, y_train_pred, average='weighted', zero_division=0),
            "confusion_matrix": confusion_matrix(y_train, y_train_pred).tolist()
    } ,
        "val_metrics": {
            "accuracy": accuracy_score(y_val, y_val_pred),
            "precision": precision_score(y_val, y_val_pred, average='weighted', zero_division=0),
            "recall": recall_score(y_val, y_val_pred, average='weighted', zero_division=0),
            "f1_score": f1_score(y_val, y_val_pred, average='weighted', zero_division=0),
            "confusion_matrix": confusion_matrix(y_val, y_val_pred).tolist()
    } ,
        "test_metrics": {
            "accuracy": accuracy_score(y_test, y_test_pred),
            "precision": precision_score(y_test, y_test_pred, average='weighted', zero_division=0),
            "recall": recall_score(y_test, y_test_pred, average='weighted', zero_division=0),
            "f1_score": f1_score(y_test, y_test_pred, average='weighted', zero_division=0),
            "confusion_matrix": confusion_matrix(y_test, y_test_pred).tolist()
    }
    }
    
    # print the confusion matrix for each of the train, val, and test set
    
    print(f"Confusion Matrix for {name} using {model_name}:")
    print("Train Set:")
    print(confusion_matrix(y_train, y_train_pred))
    print("Validation Set:")
    print(confusion_matrix(y_val, y_val_pred))
    print("Test Set:")
    print(confusion_matrix(y_test, y_test_pred))
    print("-" * 40)
    
    # print the metrics for each of the train, val, and test set
    
    print(f"Metrics for {name} using {model_name}:")
    print("Train Set:")
    print(experiment_results[f"{name}_{model_name}"]["train_metrics"])
    print("Validation Set:")
    print(experiment_results[f"{name}_{model_name}"]["val_metrics"])
    print("Test Set:")
    print(experiment_results[f"{name}_{model_name}"]["test_metrics"])

In [54]:
from lightgbm import LGBMClassifier

# create an instance of the model

lgbm_model = LGBMClassifier(random_state=42)

In [55]:
evaluate_model(lgbm_model, X, y, "EXP1_LGBM_WESAD")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000553 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1213
[LightGBM] [Info] Number of data points in the train set: 84836, number of used features: 6
[LightGBM] [Info] Start training from score -0.632592
[LightGBM] [Info] Start training from score -1.201885
[LightGBM] [Info] Start training from score -1.782841
Confusion Matrix for EXP1_LGBM_WESAD using LGBMClassifier:
Train Set:
[[45066     0     0]
 [    0 25504     0]
 [    0     0 14266]]
Validation Set:
[[11266     0     0]
 [    5  6371     0]
 [    0     1  3566]]
Test Set:
[[14083     1     0]
 [    3  7967     0]
 [    0     0  4458]]
----------------------------------------
Metrics for EXP1_LGBM_WESAD using LGBMClassifier:
Train Set:
{'accuracy': 1.0, 'precision': np.float64(1.0), 'recall': np.float64(1.0), 'f1_score': np.float64(1.0), 'confusion_matrix': [[45066, 0, 0], [0, 25504, 0], [0, 0

In [56]:
# save the results dictionary into a dataframe
list_of_results = []
for exp_index, metrics in experiment_results.items():
    for split, values in metrics.items():
        row = {"experiment": exp_index, "dataset": split}
        row.update(values)
        list_of_results.append(row)

df_results = pd.DataFrame(list_of_results)
df_results

Unnamed: 0,experiment,dataset,accuracy,precision,recall,f1_score,confusion_matrix
0,EXP1_LGBM_WESAD_LGBMClassifier,train_metrics,1.0,1.0,1.0,1.0,"[[45066, 0, 0], [0, 25504, 0], [0, 0, 14266]]"
1,EXP1_LGBM_WESAD_LGBMClassifier,val_metrics,0.999717,0.999717,0.999717,0.999717,"[[11266, 0, 0], [5, 6371, 0], [0, 1, 3566]]"
2,EXP1_LGBM_WESAD_LGBMClassifier,test_metrics,0.999849,0.999849,0.999849,0.999849,"[[14083, 1, 0], [3, 7967, 0], [0, 0, 4458]]"
