In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dreamer/DREAMER.mat
/kaggle/input/dreamer/dreamer/DREAMER.mat


In [5]:
import numpy as np
import pandas as pd
from scipy.io import loadmat
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
import joblib
from tqdm import tqdm

# =======================
# 1. Load dataset
# =======================
print("Loading DREAMER dataset...")

# struct_as_record=False exposes MATLAB struct fields as Python attributes
# (dreamer.Data, dreamer.noOfSubjects, ...); squeeze_me drops singleton dimensions.
mat = loadmat(
    "/kaggle/input/dreamer/DREAMER.mat",
    struct_as_record=False,
    squeeze_me=True,
)
dreamer = mat["DREAMER"]

# Dataset-level metadata, coerced to plain Python ints.
n_subjects, n_videos, fs = (
    int(dreamer.noOfSubjects),
    int(dreamer.noOfVideoSequences),
    int(dreamer.ECG_SamplingRate),
)

print(f"Subjects: {n_subjects}, Videos per subject: {n_videos}, ECG Sampling Rate: {fs} Hz")

# =======================
# 2. Extract ECG + Labels
# =======================
# Builds one record per (subject, trial) holding the stimulus ECG samples and the
# three self-reported scores, then collects the records into the DataFrame `df`.

# Trials with fewer samples than this are treated as empty and skipped.
MIN_ECG_SAMPLES = 100

data = []

for s in tqdm(range(n_subjects), desc="Extracting ECG trials"):
    subj = dreamer.Data[s]

    # Per-subject stimulus recordings and scores, each indexed by trial number.
    ecg_stimuli = subj.ECG.stimuli
    valence = subj.ScoreValence
    arousal = subj.ScoreArousal
    dominance = subj.ScoreDominance

    for t in range(n_videos):
        try:
            # NOTE(review): flatten() collapses every axis in row-major order. If a
            # trial is stored as (samples, channels), this interleaves the channels
            # into one sequence - confirm the per-trial array shape before relying
            # on per-signal statistics.
            ecg_trial = np.array(ecg_stimuli[t]).flatten()

            # Skip if trial empty
            if ecg_trial.size < MIN_ECG_SAMPLES:
                continue

            # Store record (subject/trial are 1-based identifiers)
            data.append({
                "subject": s + 1,
                "trial": t + 1,
                "ecg": ecg_trial,
                "valence": float(valence[t]),
                "arousal": float(arousal[t]),
                "dominance": float(dominance[t]),
            })

        except Exception as e:
            # Best effort: report the bad trial and keep going. The ids printed here
            # use the same 1-based numbering as the stored records so a failure can
            # be matched against `df` directly.
            print(f"⚠️ Error at subject {s + 1}, trial {t + 1}: {e}")

df = pd.DataFrame(data)
print(f"\nFinal dataset shape: {df.shape}")
print(df.head())

# =======================
# 3. Feature Extraction
# (simple statistics per trial)
# =======================
def extract_features(signal):
    """Summarise one signal as seven descriptive statistics.

    Returns a list in the fixed order
    [mean, std, min, max, median, 25th percentile, 75th percentile].
    """
    lower_q, upper_q = np.percentile(signal, [25, 75])
    centre = np.mean(signal)
    spread = np.std(signal)
    lowest, highest = np.min(signal), np.max(signal)
    middle = np.median(signal)
    return [centre, spread, lowest, highest, middle, lower_q, upper_q]

# One feature vector and one binary label per trial row of `df`.
trial_rows = list(df.itertuples(index=False))

features = [extract_features(rec.ecg) for rec in trial_rows]
# Example: classify high/low valence (valence >= 3 -> 1, otherwise 0)
labels = [int(rec.valence >= 3) for rec in trial_rows]

X, y = np.array(features), np.array(labels)

print("Feature matrix shape:", X.shape)
print("Labels shape:", y.shape)

# =======================
# 4. Train/Test Split
# =======================
# Split BEFORE scaling so the held-out rows never influence the scaler's mean/std.
# The split itself depends only on y, the sample count and random_state, so the
# same rows land in train/test as when the split was applied to pre-scaled data.
# NOTE(review): rows are split per trial, so trials from one subject can appear on
# both sides of the split - consider a subject-grouped split if cross-subject
# generalisation is what should be measured.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# =======================
# 5. Preprocessing
# =======================
# Fit the scaler on the training rows only, then apply it unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the train-only statistics; kept under its original name
# for any later code that still refers to X_scaled.
X_scaled = scaler.transform(X)

# =======================
# 6. Train Classifier
# =======================
print("\nTraining XGBoost model...")
xgb_params = {
    "n_estimators": 200,
    "max_depth": 4,
    "learning_rate": 0.05,
    "use_label_encoder": False,
    "eval_metric": "logloss",
}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# =======================
# 7. Evaluation
# =======================
# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)

print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# =======================
# 8. Save Model + Scaler
# =======================
# Persist both artefacts to the working directory.
artifacts = (
    (model, "xgb_final_model.pkl"),
    (scaler, "hrv_scaler.pkl"),
)
for artifact, out_path in artifacts:
    joblib.dump(artifact, out_path)

print("\n✅ Pipeline complete: model and scaler saved.")


Loading DREAMER dataset...
Subjects: 23, Videos per subject: 18, ECG Sampling Rate: 256 Hz


Extracting ECG trials: 100%|██████████| 23/23 [00:00<00:00, 93.71it/s]



Final dataset shape: (414, 6)
   subject  trial                                                ecg  valence  \
0        1      1  [2046, 2056, 2042, 2063, 2039, 2059, 2039, 205...      4.0   
1        1      2  [2054, 2061, 2036, 2041, 2036, 2041, 2035, 203...      3.0   
2        1      3  [2018, 2026, 2022, 2026, 2025, 2024, 2027, 202...      5.0   
3        1      4  [2055, 2051, 2052, 2051, 2053, 2054, 2054, 205...      4.0   
4        1      5  [2080, 2080, 2038, 2052, 2043, 2062, 2044, 206...      4.0   

   arousal  dominance  
0      3.0        2.0  
1      3.0        1.0  
2      4.0        4.0  
3      3.0        2.0  
4      4.0        4.0  
Feature matrix shape: (414, 7)
Labels shape: (414,)

Training XGBoost model...

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.31      0.38        32
           1       0.65      0.80      0.72        51

    accuracy                           0.61        83
   macro avg     