In [1]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt

import jax
import jax.numpy as jnp
import numpy as onp  # Only for parts not supported in jax

from scipy.signal import resample  # Used until JAX-native resampling is implemented
from scipy.ndimage import zoom

!pip install neurokit2
import neurokit2 as nk

from scipy.interpolate import interp1d

def full_resolution_hr(r_peaks, fs=700, signal_len=None, win_sec=8, step_sec=2):
    """Build a per-sample heart-rate trace (bpm) from R-peak sample indices.

    The signal is scanned with a sliding window of ``win_sec`` seconds moved in
    ``step_sec`` second steps. For every window holding at least two R-peaks the
    mean of the instantaneous rates ``60 / RR`` is taken; windows with fewer
    peaks are marked NaN and ignored. The surviving window values, placed at the
    window centres, are linearly interpolated (and extrapolated at both ends)
    onto every sample index ``0 .. signal_len - 1``.

    Args:
        r_peaks: 1-D array-like of R-peak positions, in samples.
        fs: Sampling rate (Hz) the peak indices refer to.
        signal_len: Number of samples in the output trace. When omitted (or 0)
            the last R-peak index is used.
        win_sec: Window length in seconds.
        step_sec: Window step in seconds.

    Returns:
        ``jnp`` array of length ``signal_len`` with one heart-rate value per sample.

    Raises:
        ValueError: If ``r_peaks`` is empty while ``signal_len`` is not given, or
            if fewer than two windows yield a valid heart-rate value.
    """
    r_peaks = onp.asarray(r_peaks).ravel()
    win_size = int(win_sec * fs)
    step_size = int(step_sec * fs)

    if not signal_len:
        if r_peaks.size == 0:
            raise ValueError("r_peaks is empty; cannot infer signal_len.")
        signal_len = r_peaks[-1]
    # r_peaks[-1] is a NumPy integer, which is not an instance of ``int``;
    # normalise once so range() and the sample axis below both get a plain int.
    signal_len = int(signal_len)

    times, hr = [], []

    for start in range(0, signal_len - win_size + 1, step_size):
        end = start + win_size
        r_win = r_peaks[(r_peaks >= start) & (r_peaks < end)]

        if len(r_win) >= 2:
            rr = jnp.diff(r_win) / fs  # RR intervals in seconds
            bpm = 60 / rr
            hr.append(jnp.mean(bpm))
        else:
            # Too few beats to form an interval; dropped before interpolation.
            hr.append(jnp.nan)

        # Window centre expressed in seconds.
        times.append((start + end) / 2 / fs)

    times = jnp.array(times)
    hr = jnp.array(hr)

    valid = ~jnp.isnan(hr)
    if jnp.sum(valid) < 2:
        raise ValueError("Not enough valid heart rate points to interpolate.")

    # Interpolation fallback using numpy
    interp_fn = interp1d(onp.array(times[valid]), onp.array(hr[valid]), kind='linear', fill_value='extrapolate')
    time_axis = onp.arange(signal_len) / fs
    hr_per_sample = interp_fn(time_axis)

    return jnp.array(hr_per_sample)

def resample_to_target(signal, original_fs, target_fs, target_len):
    """Resample ``signal`` to exactly ``target_len`` samples.

    The data is copied to a NumPy array, passed through ``scipy.signal.resample``
    and handed back as a ``jnp`` array.

    Args:
        signal: 1-D array-like to resample.
        original_fs: Accepted for call-site readability; not used.
        target_fs: Accepted for call-site readability; not used.
        target_len: Number of samples in the result.

    Returns:
        ``jnp`` array with ``target_len`` samples.
    """
    # NOTE(review): scipy.signal.resample is the Fourier-domain resampler, so
    # non-periodic inputs may show edge ringing — confirm this is acceptable.
    host_samples = onp.array(signal)
    resampled = resample(host_samples, target_len)
    return jnp.array(resampled)

def resample_discrete(signal, original_fs, target_fs):
    """Change the rate of a label-like signal without blending values.

    The signal is rescaled by ``target_fs / original_fs`` using
    ``scipy.ndimage.zoom`` with ``order=0``, so every output sample is a value
    taken from the input rather than an interpolated one.

    Args:
        signal: Array-like of discrete values.
        original_fs: Sampling rate of ``signal``.
        target_fs: Desired sampling rate.

    Returns:
        ``jnp`` array holding the rescaled signal.
    """
    host_labels = onp.array(signal)
    rescaled = zoom(host_labels, target_fs / original_fs, order=0)
    return jnp.array(rescaled)

# ---------------------------------------------------------------------------
# Configuration: identical for every subject, so it is defined once up front.
# ---------------------------------------------------------------------------
BASE = '/kaggle/input/ppg-dalia-dataset/PPG_FieldStudy'
OUTPUT_PATH = "/kaggle/working/Data_final.npy"

# Sampling rates (Hz) this notebook assumes for each stream.
# NOTE(review): taken from the literals used throughout this cell — confirm
# against the dataset documentation.
FS_CHEST = 700      # chest Resp / chest ACC / R-peak indices
FS_WRIST_ACC = 32   # wrist ACC
FS_SLOW = 4         # wrist TEMP and activity labels

# Length (in 700 Hz samples) of the HR trace that is built before cropping.
# NOTE(review): hard-coded and shared by every subject; it only has to reach
# past end_time * FS_CHEST — confirm it is long enough for all recordings.
HR_SIGNAL_LEN = 6448400

# Analysis window in seconds and the common output rate.
start_time, end_time = 500, 7900
target_fs = 1
duration_sec = end_time - start_time
target_length = int(duration_sec * target_fs)


def _crop(signal, fs):
    """Return the [start_time, end_time) second window of ``signal`` sampled at ``fs`` Hz."""
    return signal[int(start_time * fs):int(end_time * fs)]


all_data = {}
subject_list = [2,3,4,7,8,11,12,13,15]

# `subject` is the dataset ID (selects the file); `subj` is the 1-based running
# index used for the output keys "S1".."S9".
for subj, subject in enumerate(subject_list, start=1):
    # Report the real dataset ID; the running index alone misreports which
    # recording is being processed.
    print(f"Processing subject S{subject} (stored as S{subj})...")

    pkl_path = os.path.join(BASE, f"S{subject}", f"S{subject}.pkl")
    # SECURITY: pickle.load can execute arbitrary code; only use on trusted files.
    with open(pkl_path, 'rb') as f:
        data = pickle.load(f, encoding='latin1')

    # Extract raw signals
    eda_signal = data['signal']['wrist']['EDA']
    resp_signal = data['signal']['chest']['Resp']
    temp_signal = data['signal']['wrist']['TEMP']
    acc_chest_signal = data['signal']['chest']['ACC']
    acc_wrist_signal = data['signal']['wrist']['ACC']
    activity_signal = data['activity']

    # HR: per-sample trace from R-peaks, cropped and brought to target_fs.
    hr_signal = full_resolution_hr(data['rpeaks'], fs=FS_CHEST, signal_len=HR_SIGNAL_LEN)
    hr_cropped = _crop(hr_signal, FS_CHEST)
    hr_resampled = resample_to_target(hr_cropped, FS_CHEST, target_fs, target_length)
    hr_resampled = onp.array(hr_resampled)

    # RESP: clean, derive the breathing rate, then smooth over 3500 samples.
    # NOTE(review): dropna() + reset_index() discards the leading NaNs of each
    # rolling window, which shifts the series relative to the raw timeline
    # before it is cropped — confirm this offset is intended.
    rsp_clean = nk.rsp_clean(resp_signal, sampling_rate=FS_CHEST).flatten()
    rsp_rate_onsets = nk.rsp_rate(rsp_clean, sampling_rate=FS_CHEST, method="trough")
    rsp_avg = pd.Series(rsp_rate_onsets.flatten()).rolling(3500).mean().dropna().reset_index(drop=True)
    resp_final = jnp.array(rsp_avg)
    resp_cropped = _crop(resp_final, FS_CHEST)
    resp_resampled = resample_to_target(resp_cropped, FS_CHEST, target_fs, target_length)
    resp_resampled = onp.array(resp_resampled)

    # TEMP: 100-sample moving average and its gradient (sample spacing 1/4 s).
    temp_avg = pd.Series(temp_signal.flatten()).rolling(4*25).mean().dropna().reset_index(drop=True)
    slope = onp.gradient(temp_avg.values, 1/4)
    temp_avg_final = jnp.array(temp_avg)
    temp_grad_final = jnp.array(slope)
    temp_avg_resampled = resample_to_target(_crop(temp_avg_final, FS_SLOW), FS_SLOW, target_fs, target_length)
    temp_avg_resampled = onp.array(temp_avg_resampled)

    temp_grad_resampled = resample_to_target(_crop(temp_grad_final, FS_SLOW), FS_SLOW, target_fs, target_length)
    temp_grad_resampled = onp.array(temp_grad_resampled)

    # ACC Chest: magnitude minus 1, smoothed, then mean absolute deviation
    # over a 3500-sample rolling window.
    mag = jnp.linalg.norm(acc_chest_signal, axis=1) - 1
    mag_ma = pd.Series(onp.array(mag)).rolling(100).mean().dropna().reset_index(drop=True)
    mad_series = mag_ma.rolling(3500).apply(lambda x: onp.mean(onp.abs(x - x.mean())), raw=True).dropna()
    acc_chest_final = jnp.array(mad_series)
    acc_chest_resampled = resample_to_target(_crop(acc_chest_final, FS_CHEST), FS_CHEST, target_fs, target_length)
    acc_chest_resampled = onp.array(acc_chest_resampled)

    # ACC Wrist: magnitude minus 1, smoothed (no deviation step here).
    # NOTE(review): the same 100-sample window is used at 32 Hz and at 700 Hz,
    # i.e. very different durations — confirm that is intended.
    mag = jnp.linalg.norm(acc_wrist_signal, axis=1) - 1
    mag_ma = pd.Series(onp.array(mag)).rolling(100).mean().dropna().reset_index(drop=True)
    mag_ma = onp.array(mag_ma)
    acc_wrist_final = jnp.array(mag_ma)
    acc_wrist_resampled = resample_to_target(_crop(acc_wrist_final, FS_WRIST_ACC), FS_WRIST_ACC, target_fs, target_length)
    acc_wrist_resampled = onp.array(acc_wrist_resampled)
    # Clamp negative values produced upstream to zero.
    acc_wrist_resampled[acc_wrist_resampled < 0] = 0

    # Activity: discrete labels, resampled without interpolation.
    activity_signal = activity_signal.flatten()
    activity_final = jnp.array(activity_signal)
    activity_cropped = _crop(activity_final, FS_SLOW)
    activity_resampled = resample_discrete(activity_cropped, FS_SLOW, target_fs)
    activity_resampled = onp.array(activity_resampled)

    # Time axis: one index per resampled activity sample.
    time = onp.arange(len(activity_resampled))

    # Collect features
    features = {
        'hr': hr_resampled,
        'resp': resp_resampled,
        'temp_avg': temp_avg_resampled,
        'temp_grad': temp_grad_resampled,
        'acc_chest': acc_chest_resampled,
        'acc_wrist': acc_wrist_resampled,
        'activity': activity_resampled,
        'time' : time
    }

    all_data[f"S{subj}"] = {
        'info': data['questionnaire'],
        'signal': features
    }

# The dict is stored as a pickled object array; loading it back requires
# allow_pickle=True.
onp.save(OUTPUT_PATH, all_data, allow_pickle=True)

Collecting neurokit2
  Downloading neurokit2-0.2.12-py2.py3-none-any.whl.metadata (37 kB)
Downloading neurokit2-0.2.12-py2.py3-none-any.whl (708 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m708.4/708.4 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: neurokit2
Successfully installed neurokit2-0.2.12
Processing subject 1...


INFO:2025-08-02 12:25:20,306:jax._src.xla_bridge:924: Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'
INFO:2025-08-02 12:25:20,322:jax._src.xla_bridge:924: Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory


Processing subject 2...
Processing subject 3...
Processing subject 4...
Processing subject 5...
Processing subject 6...
Processing subject 7...
Processing subject 8...
Processing subject 9...


In [2]:
# Echo the assembled per-subject dictionary as this cell's output.
# NOTE(review): this renders every subject's arrays; a per-feature shape
# summary would be easier to scan.
all_data

{'S1': {'info': {'WEIGHT': 80.0,
   'Gender': ' m',
   'AGE': 28,
   'HEIGHT': 189.0,
   'SKIN': 3,
   'SPORT': 5},
  'signal': {'hr': array([68.71896 , 61.61614 , 63.691166, ..., 78.74448 , 76.83269 ,
          77.19613 ], dtype=float32),
   'resp': array([17.49154 , 17.722227, 17.86367 , ..., 18.26911 , 17.554789,
          17.252462], dtype=float32),
   'temp_avg': array([32.98664 , 31.384535, 31.840214, ..., 35.271862, 35.00394 ,
          35.46141 ], dtype=float32),
   'temp_grad': array([0.00181751, 0.00115689, 0.00145678, ..., 0.00203339, 0.00240881,
          0.00249353], dtype=float32),
   'acc_chest': array([0.00192742, 0.00106887, 0.0012672 , ..., 0.00152724, 0.00142718,
          0.00246491], dtype=float32),
   'acc_wrist': array([0.00440492, 0.        , 0.        , ..., 0.00985762, 0.00943961,
          0.01133285], dtype=float32),
   'activity': array([1., 1., 1., ..., 8., 8., 8.], dtype=float32),
   'time': array([   0,    1,    2, ..., 7397, 7398, 7399])}},
 'S2': {'inf