In [150]:

import os
import pickle
import time
import numpy as np
import pandas as pd
import neurokit2 as nk #Special library for bio-signal processing (Pip install this before running the notebook)

# Point to one subject .pkl
base_dir = "C:\FILE IQBAL\KULIAH_MDSI\MDSI\SEMESTER 4\CAPSTONE PROJECT\Modelling Stage\CAPSTONE-MODEL\data\WESAD" ## replace this directory to your base directory for WESAD Folder
# subject_id = "S2"
# pkl_path = os.path.join(base_dir, subject_id, f"{subject_id}.pkl")

LABEL_MAP = {
    0: "Undefined/Transition",
    1: "Baseline",
    2: "Stress",
    3: "Amusement",
    4: "Meditation",
    5: "ignored_1",
    6: "ignored_2",
    7: "ignored_3"
}


In [151]:
# load all the data

all_data = {}
counter_rows = 0


for a in range(2, 18):
    
    if a == 12:
        continue
    
    subject = f"S{a}"
    pkl_path = os.path.join(base_dir, subject, f"{subject}.pkl")
    
    print(f"Loading data for {subject} from {pkl_path}")
    
    with open(pkl_path, "rb") as file:
        data = pickle.load(file, encoding='latin1')
        all_data[subject] = data
    
    print(f"Data for {subject} loaded. Keys: {list(data.keys())} ; Devices: {list(data['signal'].keys())}; Labels shape: {data['label'].shape}")

    print("*" * 40)
    
    counter_rows += data['label'].shape[0]
    
print(f"Total rows across all subjects (excluding S12): {counter_rows}")

Loading data for S2 from C:\FILE IQBAL\KULIAH_MDSI\MDSI\SEMESTER 4\CAPSTONE PROJECT\Modelling Stage\CAPSTONE-MODEL\data\WESAD\S2\S2.pkl
Data for S2 loaded. Keys: ['signal', 'label', 'subject'] ; Devices: ['chest', 'wrist']; Labels shape: (4255300,)
****************************************
Loading data for S3 from C:\FILE IQBAL\KULIAH_MDSI\MDSI\SEMESTER 4\CAPSTONE PROJECT\Modelling Stage\CAPSTONE-MODEL\data\WESAD\S3\S3.pkl
Data for S3 loaded. Keys: ['signal', 'label', 'subject'] ; Devices: ['chest', 'wrist']; Labels shape: (4545100,)
****************************************
Loading data for S4 from C:\FILE IQBAL\KULIAH_MDSI\MDSI\SEMESTER 4\CAPSTONE PROJECT\Modelling Stage\CAPSTONE-MODEL\data\WESAD\S4\S4.pkl
Data for S4 loaded. Keys: ['signal', 'label', 'subject'] ; Devices: ['chest', 'wrist']; Labels shape: (4496100,)
****************************************
Loading data for S5 from C:\FILE IQBAL\KULIAH_MDSI\MDSI\SEMESTER 4\CAPSTONE PROJECT\Modelling Stage\CAPSTONE-MODEL\data\WESAD\S5\S

In [152]:
# required functions to build dataframe from pickle data

def safe_get_signal(sig_entry):
    """Return np.ndarray regardless of dict/wrapper structure."""
    if isinstance(sig_entry, dict) and "signal" in sig_entry:
        return np.asarray(sig_entry["signal"])
    return np.asarray(sig_entry)

def to_series(arr, fs, name):
    t = pd.to_timedelta(np.arange(len(arr)) / fs, unit="s")
    s = pd.Series(np.asarray(arr).squeeze(), index=t, name=name)
    s.index.name = "time"
    return s

def extract_wrist_df(data) -> pd.DataFrame:
    e4 = data["signal"]["wrist"]
    parts = []
    if "EDA" in e4:
        arr = safe_get_signal(e4["EDA"]); fs = e4["EDA"]["sampling_rate"] if isinstance(e4["EDA"], dict) else 4
        parts.append(to_series(arr, fs, "W_EDA"))
    if "TEMP" in e4:
        arr = safe_get_signal(e4["TEMP"]); fs = e4["TEMP"]["sampling_rate"] if isinstance(e4["TEMP"], dict) else 4
        parts.append(to_series(arr, fs, "W_TEMP"))
    if "BVP" in e4:
        arr = safe_get_signal(e4["BVP"]); fs = e4["BVP"]["sampling_rate"] if isinstance(e4["BVP"], dict) else 64
        parts.append(to_series(arr, fs, "W_BVP"))
    if "ACC" in e4:
        arr = safe_get_signal(e4["ACC"]); fs = e4["ACC"]["sampling_rate"] if isinstance(e4["ACC"], dict) else 32
        for i, ax in enumerate(["X","Y","Z"]):
            parts.append(to_series(arr[:, i], fs, f"W_ACC_{ax}"))
    return pd.concat(parts, axis=1).sort_index()

def resample_df(df: pd.DataFrame, target_fs: float) -> pd.DataFrame:
    """Resample each column to target_fs using NeuroKit2's signal_resample.
       Keeps a TimedeltaIndex (elapsed time)."""
    if df.empty:
        return df
    # infer original fs from TimedeltaIndex
    dt = (df.index[1] - df.index[0]).total_seconds()
    orig_fs = 1.0 / dt
    target_length = int(np.round(len(df) * (target_fs / orig_fs)))
    out_cols = {}
    for col in df.columns:
        x = df[col].interpolate(limit_direction="both").to_numpy()
        y = nk.signal_resample(
            x,
            sampling_rate=orig_fs,
            desired_length=target_length,
            method="numpy"
        )
        out_cols[col] = y
    # TimedeltaIndex (elapsed time)
    t = pd.to_timedelta(np.arange(target_length) / target_fs, unit="s")
    return pd.DataFrame(out_cols, index=t).rename_axis("time")

In [153]:
# load all the data for wrist device in a dataframe

wrist_data = {}

for subject in all_data.keys():
    print(f"Processing wrist data for {subject}...")
    start_time = time.time()
    wrist_df = extract_wrist_df(all_data[subject])
    wrist_df = resample_df(wrist_df, target_fs=4.0)  # Resample to 4 Hz
    wrist_data[subject] = wrist_df
    end_time = time.time()
    print(f"Processed {wrist_df.shape[0]} rows for {subject} in {end_time - start_time:.2f} seconds.")
    print("-" * 40)

Processing wrist data for S2...
Processed 24316 rows for S2 in 0.48 seconds.
----------------------------------------
Processing wrist data for S3...
Processed 25972 rows for S3 in 0.50 seconds.
----------------------------------------
Processing wrist data for S4...
Processed 25692 rows for S4 in 0.50 seconds.
----------------------------------------
Processing wrist data for S5...
Processed 25032 rows for S5 in 0.49 seconds.
----------------------------------------
Processing wrist data for S6...
Processed 28284 rows for S6 in 0.61 seconds.
----------------------------------------
Processing wrist data for S7...
Processed 20952 rows for S7 in 0.40 seconds.
----------------------------------------
Processing wrist data for S8...
Processed 21864 rows for S8 in 0.44 seconds.
----------------------------------------
Processing wrist data for S9...
Processed 20892 rows for S9 in 0.39 seconds.
----------------------------------------
Processing wrist data for S10...
Processed 21984 rows fo

In [154]:
# get the labels for each subject

def labels_series(data, fs_guess=700):
    y = np.asarray(data["label"]).squeeze()
    s = to_series(y, fs_guess, "label_raw")
    return s.astype(int)

def resample_series(s: pd.Series, target_fs: float) -> pd.Series:
    """Resample a series to target_fs; keeps TimedeltaIndex (elapsed time)."""
    if s.empty:
        return s
    dt = (s.index[1] - s.index[0]).total_seconds()
    orig_fs = 1.0 / dt
    target_length = int(np.round(len(s) * (target_fs / orig_fs)))
    x = s.interpolate(limit_direction="both").to_numpy()
    y = nk.signal_resample(
        x,
        sampling_rate=orig_fs,
        desired_length=target_length,
        method="numpy"
    )
    t = pd.to_timedelta(np.arange(target_length) / target_fs, unit="s")
    out = pd.Series(y, index=t, name=s.name)
    out.index.name = "time"
    return out

In [155]:
# get the labels for each subject and then merge it with the wrist data in the wrist_data dictionary

for subject in all_data.keys():
    print(f"Processing labels for {subject}...")
    start_time = time.time()
    label_s = labels_series(all_data[subject], fs_guess=700)
    label_s = resample_series(label_s, target_fs=4.0)  # Resample to 4 Hz
    # Merge with wrist data
    if subject in wrist_data:
        wrist_df = wrist_data[subject]
        merged_df = wrist_df.join(label_s, how="inner")
        wrist_data[subject] = merged_df
        print(f"Merged data shape for {subject}: {merged_df.shape}")
    else:
        print(f"No wrist data found for {subject}, skipping merge.")
    end_time = time.time()
    print(f"Processed labels for {subject} in {end_time - start_time:.2f} seconds.")
    print("-" * 40)

Processing labels for S2...
Merged data shape for S2: (24306, 7)
Processed labels for S2 in 0.84 seconds.
----------------------------------------
Processing labels for S3...
Merged data shape for S3: (25962, 7)
Processed labels for S3 in 0.88 seconds.
----------------------------------------
Processing labels for S4...
Merged data shape for S4: (25682, 7)
Processed labels for S4 in 0.90 seconds.
----------------------------------------
Processing labels for S5...
Merged data shape for S5: (25022, 7)
Processed labels for S5 in 0.86 seconds.
----------------------------------------
Processing labels for S6...
Merged data shape for S6: (28273, 7)
Processed labels for S6 in 1.02 seconds.
----------------------------------------
Processing labels for S7...
Merged data shape for S7: (20944, 7)
Processed labels for S7 in 0.74 seconds.
----------------------------------------
Processing labels for S8...
Merged data shape for S8: (21855, 7)
Processed labels for S8 in 0.76 seconds.
------------

In [156]:
# extract all the dataframe from the dictionary and add the subject id as a column and concatenate them into a single dataframe

final_df = pd.concat([df.assign(subject=subject) for subject, df in wrist_data.items()], ignore_index=False)

In [157]:
# rename and convert the label raw to integer

final_df = final_df.rename(columns={"label_raw": "label"})
final_df['label'] = final_df['label'].astype(int)

In [158]:
final_df.to_csv("final_wesad_data.csv", index=True)

In [162]:
# get the statistics descriptive of the final dataframe based on the label

final_df.groupby('label').describe().T

Unnamed: 0,label,1,2,3
W_EDA,count,70416.0,39850.0,22291.0
W_EDA,mean,1.329581,3.413854,1.365635
W_EDA,std,1.743986,3.656135,1.690812
W_EDA,min,0.074584,0.288098,0.129681
W_EDA,25%,0.276568,1.035374,0.320079
W_EDA,50%,0.396544,2.135508,0.445499
W_EDA,75%,1.576861,3.859958,1.794991
W_EDA,max,9.165894,15.921474,8.226128
W_TEMP,count,70416.0,39850.0,22291.0
W_TEMP,mean,33.348293,32.641449,32.587728


In [161]:
# drop 5, 6, 7 from the label

final_df = final_df[~final_df['label'].isin([5, 6, 7,0,4])]

In [167]:
# decide the columns to be used as features and the target

feature_columns = 'W_BVP'
target_column = 'label'

In [174]:
# get the X and y for the model

X = final_df[[feature_columns]]
y = final_df[target_column]

In [175]:
# split the data into train and test set using stratified sampling based on the label

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

In [176]:
# decision tree classifier model

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score

dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

In [177]:
# evaluate the model

y_train_pred = dtc.predict(X_train)
y_val_pred = dtc.predict(X_val)

print("Training Set Evaluation:")
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))
print(f"F1 Score: {f1_score(y_train, y_train_pred, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Precision: {precision_score(y_train, y_train_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_train, y_train_pred, average='weighted'):.4f}")
print("-" * 40)

print("Validation Set Evaluation:")
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print(f"F1 Score: {f1_score(y_val, y_val_pred, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
print(f"Precision: {precision_score(y_val, y_val_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_val, y_val_pred, average='weighted'):.4f}")

Training Set Evaluation:
[[41133  3062   871]
 [14039 10717   748]
 [ 9439  2277  2550]]
              precision    recall  f1-score   support

           1       0.64      0.91      0.75     45066
           2       0.67      0.42      0.52     25504
           3       0.61      0.18      0.28     14266

    accuracy                           0.64     84836
   macro avg       0.64      0.50      0.51     84836
weighted avg       0.64      0.64      0.60     84836

F1 Score: 0.6000
Accuracy: 0.6412
Precision: 0.6417
Recall: 0.6412
----------------------------------------
Validation Set Evaluation:
[[8760 1932  574]
 [4490 1508  378]
 [2669  677  221]]
              precision    recall  f1-score   support

           1       0.55      0.78      0.64     11266
           2       0.37      0.24      0.29      6376
           3       0.19      0.06      0.09      3567

    accuracy                           0.49     21209
   macro avg       0.37      0.36      0.34     21209
weighted avg  

In [178]:
# try the KNN classifier model

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=100)
knn.fit(X_train, y_train)

In [179]:
# evaluate the KNN model

y_train_pred_knn = knn.predict(X_train)
y_val_pred_knn = knn.predict(X_val)

print("KNN Training Set Evaluation:")
print(confusion_matrix(y_train, y_train_pred_knn))
print(classification_report(y_train, y_train_pred_knn))
print(f"F1 Score: {f1_score(y_train, y_train_pred_knn, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_train, y_train_pred_knn):.4f}")
print(f"Precision: {precision_score(y_train, y_train_pred_knn, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_train, y_train_pred_knn, average='weighted'):.4f}")

print("-" * 40)

print("KNN Validation Set Evaluation:")
print(confusion_matrix(y_val, y_val_pred_knn))
print(classification_report(y_val, y_val_pred_knn))
print(f"F1 Score: {f1_score(y_val, y_val_pred_knn, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred_knn):.4f}")
print(f"Precision: {precision_score(y_val, y_val_pred_knn, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_val, y_val_pred_knn, average='weighted'):.4f}")

KNN Training Set Evaluation:
[[40552  4344   170]
 [19757  5605   142]
 [12203  1882   181]]
              precision    recall  f1-score   support

           1       0.56      0.90      0.69     45066
           2       0.47      0.22      0.30     25504
           3       0.37      0.01      0.02     14266

    accuracy                           0.55     84836
   macro avg       0.47      0.38      0.34     84836
weighted avg       0.50      0.55      0.46     84836

F1 Score: 0.4608
Accuracy: 0.5462
Precision: 0.5012
Recall: 0.5462
----------------------------------------
KNN Validation Set Evaluation:
[[10047  1163    56]
 [ 5040  1293    43]
 [ 3049   485    33]]
              precision    recall  f1-score   support

           1       0.55      0.89      0.68     11266
           2       0.44      0.20      0.28      6376
           3       0.25      0.01      0.02      3567

    accuracy                           0.54     21209
   macro avg       0.41      0.37      0.33     212

In [180]:
# random forest classifier model

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [181]:
# evaluate the random forest model

y_train_pred_rfc = rfc.predict(X_train)
y_val_pred_rfc = rfc.predict(X_val)

print("Random Forest Training Set Evaluation:")
print(confusion_matrix(y_train, y_train_pred_rfc))
print(classification_report(y_train, y_train_pred_rfc))
print(f"F1 Score: {f1_score(y_train, y_train_pred_rfc, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_train, y_train_pred_rfc):.4f}")
print(f"Precision: {precision_score(y_train, y_train_pred_rfc, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_train, y_train_pred_rfc, average='weighted'):.4f}")
print("-" * 40)

print("Random Forest Validation Set Evaluation:")
print(confusion_matrix(y_val, y_val_pred_rfc))
print(classification_report(y_val, y_val_pred_rfc))
print(f"F1 Score: {f1_score(y_val, y_val_pred_rfc, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred_rfc):.4f}")
print(f"Precision: {precision_score(y_val, y_val_pred_rfc, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_val, y_val_pred_rfc, average='weighted'):.4f}")


Random Forest Training Set Evaluation:
[[39226  4414  1426]
 [12390 11924  1190]
 [ 8542  2478  3246]]
              precision    recall  f1-score   support

           1       0.65      0.87      0.75     45066
           2       0.63      0.47      0.54     25504
           3       0.55      0.23      0.32     14266

    accuracy                           0.64     84836
   macro avg       0.61      0.52      0.54     84836
weighted avg       0.63      0.64      0.61     84836

F1 Score: 0.6121
Accuracy: 0.6412
Precision: 0.6300
Recall: 0.6412
----------------------------------------
Random Forest Validation Set Evaluation:
[[8190 2303  773]
 [4176 1700  500]
 [2484  784  299]]
              precision    recall  f1-score   support

           1       0.55      0.73      0.63     11266
           2       0.36      0.27      0.30      6376
           3       0.19      0.08      0.12      3567

    accuracy                           0.48     21209
   macro avg       0.37      0.36      0

In [182]:
# use LDA for modeling

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

In [183]:
# evaluate the LDA model

y_train_pred_lda = lda.predict(X_train)
y_val_pred_lda = lda.predict(X_val)

print("LDA Training Set Evaluation:")
print(confusion_matrix(y_train, y_train_pred_lda))
print(classification_report(y_train, y_train_pred_lda))
print(f"F1 Score: {f1_score(y_train, y_train_pred_lda, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_train, y_train_pred_lda):.4f}")
print(f"Precision: {precision_score(y_train, y_train_pred_lda, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_train, y_train_pred_lda, average='weighted'):.4f}")
print("-" * 40)

print("LDA Validation Set Evaluation:")
print(confusion_matrix(y_val, y_val_pred_lda))
print(classification_report(y_val, y_val_pred_lda))
print(f"F1 Score: {f1_score(y_val, y_val_pred_lda, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred_lda):.4f}")
print(f"Precision: {precision_score(y_val, y_val_pred_lda, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_val, y_val_pred_lda, average='weighted'):.4f}") 

LDA Training Set Evaluation:
[[45066     0     0]
 [25504     0     0]
 [14266     0     0]]
              precision    recall  f1-score   support

           1       0.53      1.00      0.69     45066
           2       0.00      0.00      0.00     25504
           3       0.00      0.00      0.00     14266

    accuracy                           0.53     84836
   macro avg       0.18      0.33      0.23     84836
weighted avg       0.28      0.53      0.37     84836

F1 Score: 0.3686
Accuracy: 0.5312
Precision: 0.2822
Recall: 0.5312
----------------------------------------
LDA Validation Set Evaluation:
[[11266     0     0]
 [ 6376     0     0]
 [ 3567     0     0]]
              precision    recall  f1-score   support

           1       0.53      1.00      0.69     11266
           2       0.00      0.00      0.00      6376
           3       0.00      0.00      0.00      3567

    accuracy                           0.53     21209
   macro avg       0.18      0.33      0.23     212

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [184]:
# use adaboost classifier model

from sklearn.ensemble import AdaBoostClassifier

adb = AdaBoostClassifier(random_state=42)
adb.fit(X_train, y_train)



In [185]:
# evaluate the adaboost model

y_train_pred_adb = adb.predict(X_train)
y_val_pred_adb = adb.predict(X_val)

print("AdaBoost Training Set Evaluation:")
print(confusion_matrix(y_train, y_train_pred_adb))
print(classification_report(y_train, y_train_pred_adb))
print(f"F1 Score: {f1_score(y_train, y_train_pred_adb, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_train, y_train_pred_adb):.4f}")
print(f"Precision: {precision_score(y_train, y_train_pred_adb, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_train, y_train_pred_adb, average='weighted'):.4f}")
print("-" * 40)

print("AdaBoost Validation Set Evaluation:")
print(confusion_matrix(y_val, y_val_pred_adb))
print(classification_report(y_val, y_val_pred_adb))
print(f"F1 Score: {f1_score(y_val, y_val_pred_adb, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred_adb):.4f}")
print(f"Precision: {precision_score(y_val, y_val_pred_adb, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_val, y_val_pred_adb, average='weighted'):.4f}")

AdaBoost Training Set Evaluation:
[[42213  2853     0]
 [21888  3616     0]
 [13610   656     0]]
              precision    recall  f1-score   support

           1       0.54      0.94      0.69     45066
           2       0.51      0.14      0.22     25504
           3       0.00      0.00      0.00     14266

    accuracy                           0.54     84836
   macro avg       0.35      0.36      0.30     84836
weighted avg       0.44      0.54      0.43     84836

F1 Score: 0.4319
Accuracy: 0.5402
Precision: 0.4411
Recall: 0.5402
----------------------------------------
AdaBoost Validation Set Evaluation:
[[10553   713     0]
 [ 5450   926     0]
 [ 3404   163     0]]
              precision    recall  f1-score   support

           1       0.54      0.94      0.69     11266
           2       0.51      0.15      0.23      6376
           3       0.00      0.00      0.00      3567

    accuracy                           0.54     21209
   macro avg       0.35      0.36      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [135]:
# use LGBM classifier model

import lightgbm as lgb
lgbm = lgb.LGBMClassifier(random_state=42)
lgbm.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000622 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1282
[LightGBM] [Info] Number of data points in the train set: 216147, number of used features: 6
[LightGBM] [Info] Start training from score -0.759868
[LightGBM] [Info] Start training from score -1.567831
[LightGBM] [Info] Start training from score -2.137123
[LightGBM] [Info] Start training from score -2.718010
[LightGBM] [Info] Start training from score -1.967720


In [136]:
# evaluate the lgbm model

y_train_pred_lgbm = lgbm.predict(X_train)
y_val_pred_lgbm = lgbm.predict(X_val)

print("LGBM Training Set Evaluation:")
print(confusion_matrix(y_train, y_train_pred_lgbm))
print(classification_report(y_train, y_train_pred_lgbm))
print(f"F1 Score: {f1_score(y_train, y_train_pred_lgbm, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_train, y_train_pred_lgbm):.4f}")
print(f"Precision: {precision_score(y_train, y_train_pred_lgbm, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_train, y_train_pred_lgbm, average='weighted'):.4f}")
print("-" * 40)

print("LGBM Validation Set Evaluation:")
print(confusion_matrix(y_val, y_val_pred_lgbm))
print(classification_report(y_val, y_val_pred_lgbm))
print(f"F1 Score: {f1_score(y_val, y_val_pred_lgbm, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred_lgbm):.4f}")
print(f"Precision: {precision_score(y_val, y_val_pred_lgbm, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_val, y_val_pred_lgbm, average='weighted'):.4f}")


LGBM Training Set Evaluation:
[[97782  1112   719   437  1048]
 [  906 44135     5     1    19]
 [  627    14 24863     0     0]
 [  243     5     0 14014     5]
 [  587    17     1     0 29607]]
              precision    recall  f1-score   support

           0       0.98      0.97      0.97    101098
           1       0.97      0.98      0.98     45066
           2       0.97      0.97      0.97     25504
           3       0.97      0.98      0.98     14267
           4       0.97      0.98      0.97     30212

    accuracy                           0.97    216147
   macro avg       0.97      0.98      0.97    216147
weighted avg       0.97      0.97      0.97    216147

F1 Score: 0.9734
Accuracy: 0.9734
Precision: 0.9735
Recall: 0.9734
----------------------------------------
LGBM Validation Set Evaluation:
[[24387   304   199   128   257]
 [  270 10992     2     0     3]
 [  179     5  6192     0     0]
 [   77     1     0  3487     1]
 [  184     7     1     0  7361]]
         

In [186]:
# use XGBoost classifier model

import xgboost as xgb
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_clf.fit(X_train, y_train)

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [1 2 3]

In [138]:
# evaluate the XGBoost model

y_train_pred_xgb = xgb_clf.predict(X_train)
y_val_pred_xgb = xgb_clf.predict(X_val)

print("XGBoost Training Set Evaluation:")
print(confusion_matrix(y_train, y_train_pred_xgb))
print(classification_report(y_train, y_train_pred_xgb))
print(f"F1 Score: {f1_score(y_train, y_train_pred_xgb, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_train, y_train_pred_xgb):.4f}")
print(f"Precision: {precision_score(y_train, y_train_pred_xgb, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_train, y_train_pred_xgb, average='weighted'):.4f}")

print("-" * 40)

print("XGBoost Validation Set Evaluation:")
print(confusion_matrix(y_val, y_val_pred_xgb))
print(classification_report(y_val, y_val_pred_xgb))
print(f"F1 Score: {f1_score(y_val, y_val_pred_xgb, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred_xgb):.4f}")
print(f"Precision: {precision_score(y_val, y_val_pred_xgb, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_val, y_val_pred_xgb, average='weighted'):.4f}")


XGBoost Training Set Evaluation:
[[98873   771   400   341   713]
 [  532 44519     6     0     9]
 [  306     5 25193     0     0]
 [  176     4     0 14086     1]
 [  414    15     1     0 29782]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.98    101098
           1       0.98      0.99      0.99     45066
           2       0.98      0.99      0.99     25504
           3       0.98      0.99      0.98     14267
           4       0.98      0.99      0.98     30212

    accuracy                           0.98    216147
   macro avg       0.98      0.99      0.98    216147
weighted avg       0.98      0.98      0.98    216147

F1 Score: 0.9829
Accuracy: 0.9829
Precision: 0.9829
Recall: 0.9829
----------------------------------------
XGBoost Validation Set Evaluation:
[[24585   236   141   105   208]
 [  187 11075     2     0     3]
 [  129     6  6241     0     0]
 [   69     1     0  3496     0]
 [  138     3     1     0  7411]]
   

In [143]:
# try to use only W_BVP and W_EDA as features

feature_columns_reduced = ['W_BVP']
X_reduced = final_df[feature_columns_reduced]
y_reduced = final_df[target_column]

In [144]:
# split the reduced data into train and test set using stratified sampling based on the label

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_reduced, test_size=0.2, stratify=y_reduced, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

In [145]:
# train the lgbm model on the reduced feature set

lgbm_reduced = lgb.LGBMClassifier(random_state=42)
lgbm_reduced.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 216147, number of used features: 1
[LightGBM] [Info] Start training from score -0.759868
[LightGBM] [Info] Start training from score -1.567831
[LightGBM] [Info] Start training from score -2.137123
[LightGBM] [Info] Start training from score -2.718010
[LightGBM] [Info] Start training from score -1.967720


In [146]:
# evaluate the lgbm model

y_train_pred_lgbm = lgbm_reduced.predict(X_train)
y_val_pred_lgbm = lgbm_reduced.predict(X_val)

print("LGBM Reduced Feature Set Training Set Evaluation:")
print(confusion_matrix(y_train, y_train_pred_lgbm))
print(classification_report(y_train, y_train_pred_lgbm))
print(f"F1 Score: {f1_score(y_train, y_train_pred_lgbm, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_train, y_train_pred_lgbm):.4f}")
print(f"Precision: {precision_score(y_train, y_train_pred_lgbm, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_train, y_train_pred_lgbm, average='weighted'):.4f}")
print("-" * 40)

print("LGBM Reduced Feature Set Validation Set Evaluation:")
print(confusion_matrix(y_val, y_val_pred_lgbm))
print(classification_report(y_val, y_val_pred_lgbm))
print(f"F1 Score: {f1_score(y_val, y_val_pred_lgbm, average='weighted'):.4f}")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred_lgbm):.4f}")
print(f"Precision: {precision_score(y_val, y_val_pred_lgbm, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_val, y_val_pred_lgbm, average='weighted'):.4f}")

LGBM Reduced Feature Set Training Set Evaluation:
[[101098      0      0      0      0]
 [ 45066      0      0      0      0]
 [ 25504      0      0      0      0]
 [ 14267      0      0      0      0]
 [ 30212      0      0      0      0]]
              precision    recall  f1-score   support

           0       0.47      1.00      0.64    101098
           1       0.00      0.00      0.00     45066
           2       0.00      0.00      0.00     25504
           3       0.00      0.00      0.00     14267
           4       0.00      0.00      0.00     30212

    accuracy                           0.47    216147
   macro avg       0.09      0.20      0.13    216147
weighted avg       0.22      0.47      0.30    216147

F1 Score: 0.2981
Accuracy: 0.4677
Precision: 0.2188
Recall: 0.4677
----------------------------------------
LGBM Reduced Feature Set Validation Set Evaluation:
[[25275     0     0     0     0]
 [11267     0     0     0     0]
 [ 6376     0     0     0     0]
 [ 3566    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
