## imports

In [None]:
import neurokit2 as nk
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt 
import xgboost as xgb
import shap
import scipy.stats as stats
from datetime import datetime, timedelta
import re
import flirt
import warnings


# Specify the path to the desired directory
parent_dir = r'<<< PLACE HERE DIRECTORY WITH DATASET >>>'

# Change the current working directory to the specified directory
os.chdir(parent_dir)

mastertimesheet = pd.read_excel("mastertimesheet-4.xlsx")

# Add leading zero to p_id values below 10
mastertimesheet['p_id'] = mastertimesheet['p_id'].apply(lambda x: str(x).zfill(2))

# Verify that the working directory has been changed
print("Current working directory:", os.getcwd())

# Function to load file into a DataFrame
def load_file_into_dataframe(folder_path, var, filetype, sep=','):
    var_files = [f for f in os.listdir(folder_path) if f.endswith(filetype) and var in f]
    
    if var_files:
        file_path = os.path.join(folder_path, var_files[0])
        try:
            df = pd.read_csv(file_path, sep=sep)
            print(f"Loaded file: {file_path}")
            return df
        except pd.errors.EmptyDataError:
            print(f"The file {file_path} is empty.")
            return None
        except Exception as e:
            print(f"Error occurred while reading the file {file_path}: {e}")
            return None
    else:
        print(f"No file with '{var}' in its name found in folder {folder_path}.")
        return None

  from .autonotebook import tqdm as notebook_tqdm


Current working directory: C:\Users\BootMR\Documents\data_export


## for all participants per 30s

In [12]:
'''

Simple Summary of What the Script Does:

    Clean and Prepare Data:
        Remove bad columns: The clean_features() function is designed to remove any columns in the DataFrame that have too many missing values (more than 5%) or contain infinite values.
        Ensure numeric data: The ensure_numeric() function converts all columns to numeric types, forcing any non-numeric values (like strings) to become NaN (missing data).

    Process Each Subfolder:
        The script looks through a parent directory and processes each subfolder one by one.
        For each subfolder, it checks if the folder contains a .zip file.

    Extract Features from the Zip File:
        If a zip file is found in the subfolder, the script uses a tool (probably flirt.with_empatica) to compute features from the zip file. These features could include:
            HRV (Heart Rate Variability) features
            EDA (Electrodermal Activity) features
            Accelerometer (ACC) features
        The features are computed using windows of 30 seconds with a step size of 1 second between windows.

    Clean and Save Data:
        The script ensures that all features are numeric (using ensure_numeric()).
        It then saves the computed features to a CSV file in the same subfolder where the zip file was located, with a filename based on the subfolder name and zip file name.

    Error Handling:
        If anything goes wrong while processing a zip file, it catches the error and prints a message without crashing the entire script.

Purpose:

    The script processes zip files in subfolders, extracts time-based features (HRV, EDA, ACC), cleans the data, ensures it's numeric, and saves the results to CSV files.

'''

# Function to clean the DataFrame
def clean_features(df):
    threshold = 0.05  # 5% threshold
    columns_to_remove = [col for col in df.columns if df[col].isna().mean() > threshold or df[col].isin([float('inf'), float('-inf')]).mean() > threshold]
    if columns_to_remove:
        print(f"Removing columns due to high NaN/Inf values: {columns_to_remove}")
        df = df.drop(columns=columns_to_remove)
    return df

# Function to check and convert data to numeric
def ensure_numeric(data):
    for col in data.columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')
    return data

# Iterate over each subfolder in the parent directory
for subfolder_name in os.listdir(parent_dir):

    subfolder_path = os.path.join(parent_dir, subfolder_name)
    
    if os.path.isdir(subfolder_path):
        # Find the zip file in the subfolder
        zip_file = None
        for file_name in os.listdir(subfolder_path):
            if file_name.endswith('.zip'):
                zip_file = file_name
                break
        
        if zip_file:
            zip_file_path = os.path.join(subfolder_path, zip_file)
            
            try:
                # Compute features using flirt.with_empatica
                features_30s = flirt.with_.empatica(zip_file_path,
                                                   window_length=30,
                                                   window_step_size=1,
                                                   hrv_features=True,
                                                   eda_features=True,
                                                   acc_features=True)
                
                # Clean the DataFrame
                #features_30s = clean_features(features_30s)
                
                # Ensure all data is numeric
                features_30s = ensure_numeric(features_30s)
                
                # Define the file path where you want to save the CSV file
                output_csv_path = os.path.join(subfolder_path, f'features_30s_{subfolder_name}_{zip_file}.csv')
                
                # Write the DataFrame to a CSV file
                features_30s.to_csv(output_csv_path, index=True)
                print(f'Features saved to {output_csv_path}')
            
            except Exception as e:
                print(f"Error processing {zip_file_path}: {e}")


HRV features: 100%|██████████| 3689/3689 [00:48<00:00, 76.56it/s] 


Unable to remove memmapped file


EDA features: 100%|██████████| 5250/5250 [00:20<00:00, 251.92it/s]


Unable to remove memmapped file


  features_30s = flirt.with_.empatica(zip_file_path,
  features_30s = flirt.with_.empatica(zip_file_path,
ACC features: 100%|██████████| 5252/5252 [00:24<00:00, 210.61it/s]


Unable to remove memmapped file
Features saved to C:\Users\BootMR\Documents\data_export\03\features_30s_03_1716369725_a02601.zip.csv


HRV features: 100%|██████████| 4624/4624 [00:01<00:00, 3211.75it/s]


Unable to remove memmapped file


EDA features: 100%|██████████| 5043/5043 [00:23<00:00, 218.83it/s]


Unable to remove memmapped file


  features_30s = flirt.with_.empatica(zip_file_path,
ACC features: 100%|██████████| 5045/5045 [00:28<00:00, 177.28it/s]


Unable to remove memmapped file
Features saved to C:\Users\BootMR\Documents\data_export\04\features_30s_04_1714996826_a02601.zip.csv


## baseline calc 1

In [13]:
### start with computing features per minute

'''

Simple Summary of What the Script Does:

    Clean and Prepare Data:
        Remove bad columns: The clean_features() function is designed to remove any columns in the DataFrame that have too many missing values (more than 5%) or contain infinite values.
        Ensure numeric data: The ensure_numeric() function converts all columns to numeric types, forcing any non-numeric values (like strings) to become NaN (missing data).

    Process Each Subfolder:
        The script looks through a parent directory and processes each subfolder one by one.
        For each subfolder, it checks if the folder contains a .zip file.

    Extract Features from the Zip File:
        If a zip file is found in the subfolder, the script uses a tool (probably flirt.with_empatica) to compute features from the zip file. These features could include:
            HRV (Heart Rate Variability) features
            EDA (Electrodermal Activity) features
            Accelerometer (ACC) features
        The features are computed using windows of 30 seconds with a step size of 1 second between windows.

    Clean and Save Data:
        The script ensures that all features are numeric (using ensure_numeric()).
        It then saves the computed features to a CSV file in the same subfolder where the zip file was located, with a filename based on the subfolder name and zip file name.

    Error Handling:
        If anything goes wrong while processing a zip file, it catches the error and prints a message without crashing the entire script.

Purpose:

    The script processes zip files in subfolders, extracts time-based features (HRV, EDA, ACC), cleans the data, ensures it's numeric, and saves the results to CSV files.

'''

# Iterate over each subfolder in the parent directory
for subfolder_name in os.listdir(parent_dir):
    subfolder_path = os.path.join(parent_dir, subfolder_name)
    
    if os.path.isdir(subfolder_path):
        print(subfolder_path)
        # Find the zip file in the subfolder
        zip_file = None
        for file_name in os.listdir(subfolder_path):
            if file_name.endswith('.zip'):
                zip_file = file_name
                break
        
        if zip_file:
            zip_file_path = os.path.join(subfolder_path, zip_file)
            
            try:
                # Compute features using flirt.with_empatica
                features_60s = flirt.with_.empatica(zip_file_path,
                                                   window_length=60,
                                                   window_step_size=60,
                                                   hrv_features=True,
                                                   eda_features=True,
                                                   acc_features=True)
                
                # Clean the DataFrame
                #features_60s = clean_features(features_60s)
                
                # Ensure all data is numeric
                features_60s = ensure_numeric(features_60s)
                
                # Write the DataFrame to a CSV file
                output_csv_path = os.path.join(subfolder_path, f'{subfolder_name}_baseline_FlirtFeatures.csv')
                features_60s.to_csv(output_csv_path, index=True)
                print(f'Features saved to {output_csv_path}')
            
            except Exception as e:
                print(f"Error processing {zip_file_path}: {e}")


C:\Users\BootMR\Documents\data_export\00-code_export
C:\Users\BootMR\Documents\data_export\03


HRV features:   0%|          | 0/61 [00:00<?, ?it/s]

HRV features: 100%|██████████| 61/61 [00:00<00:00, 752.99it/s]


Unable to remove memmapped file


EDA features: 100%|██████████| 88/88 [00:00<00:00, 174.43it/s]
  features_60s = flirt.with_.empatica(zip_file_path,
  features_60s = flirt.with_.empatica(zip_file_path,


Unable to remove memmapped file


ACC features: 100%|██████████| 88/88 [00:00<00:00, 128.00it/s]


Unable to remove memmapped file
Features saved to C:\Users\BootMR\Documents\data_export\03\03_baseline_FlirtFeatures.csv
C:\Users\BootMR\Documents\data_export\04


HRV features: 100%|██████████| 77/77 [00:00<00:00, 936.26it/s]


Unable to remove memmapped file


EDA features: 100%|██████████| 85/85 [00:00<00:00, 123.61it/s]


Unable to remove memmapped file


  features_60s = flirt.with_.empatica(zip_file_path,
ACC features: 100%|██████████| 85/85 [00:01<00:00, 64.18it/s]


Unable to remove memmapped file
Features saved to C:\Users\BootMR\Documents\data_export\04\04_baseline_FlirtFeatures.csv


## baseline calc 2

In [14]:
# worked well 30th Jan. computes baseline descriptive stats and HRV features for all pids
#succesfully svaed 46 baseline files


skip_p_ids = [f"{i:02}" for i in range(1)]

# Iterate through each subfolder in the root folder
for p_id in os.listdir(parent_dir):

    if p_id in skip_p_ids:
        print(f"Skipping folder as instructed: {folder_path} (p_id {p_id})")
        continue

    print(p_id)
    folder_path = os.path.join(parent_dir, p_id)

    if os.path.isdir(folder_path):

        # Initialize an empty DataFrame to store the results for the current p_id
        button_features = pd.DataFrame()
        
        # Load ECG and buttons data
        baseline_FlirtFeatures = load_file_into_dataframe(folder_path, 'baseline_FlirtFeatures', '.csv', ',')

        if baseline_FlirtFeatures is None:
            print(f"baseline_FlirtFeatures missing or empty in folder {folder_path}.")
            continue

        baseline_FlirtFeatures.rename(columns={baseline_FlirtFeatures.columns[0]: 'timestamp'}, inplace=True)
        baseline_FlirtFeatures['timestamp'] = pd.to_datetime(baseline_FlirtFeatures['timestamp']).dt.tz_localize(None)
        baseline_FlirtFeatures['timestamp'] = pd.to_datetime(baseline_FlirtFeatures['timestamp'])
        baseline_FlirtFeatures['timestamp'] += pd.Timedelta(hours=2)

        if baseline_FlirtFeatures is not None:
            # Convert 'timestamp' columns to datetime format
            baseline_FlirtFeatures['timestamp'] = pd.to_datetime(baseline_FlirtFeatures['timestamp'])

            ######### select baseline data by timestamps

            mask = mastertimesheet['p_id'] == p_id
            if mask.any():
                idx = mastertimesheet.index[mask][0]
                # Check if both start and end times are present in the mastertimesheet
                startt0 = mastertimesheet.loc[idx, 'startt0']
                startt1 = mastertimesheet.loc[idx, 'startt1']
                
                if pd.isna(startt0) or pd.isna(startt1):
                    print(f"Missing start or end time for p_id {p_id}. Skipping.")
                    continue
                
                # Set start and end time based on startt0 and startt1
                start_time = pd.to_datetime(startt0)
                end_time = pd.to_datetime(startt1)
            else:
                print(f"No matching entry found in mastertimesheet for p_id {p_id}")
                continue

            # Filter DataFrames based on the time range

            filtered_baseline = baseline_FlirtFeatures[(baseline_FlirtFeatures['timestamp'] >= start_time) & (baseline_FlirtFeatures['timestamp'] <= end_time)]

            averages = filtered_baseline.mean()

            # Convert the Series to a DataFrame and rename it
            flirt_baseline = pd.DataFrame(averages).T  # Transpose to keep column headers

            # Display the new dataframe
            #print(flirt_baseline)
        
            # Save the results to a CSV file named after the p_id
            output_file_path = os.path.join(folder_path, f"{p_id}_baseline_FlirtFeatures_means.csv")
            flirt_baseline.to_csv(output_file_path, index=None)
            print(f"{output_file_path} successfully SAVED for {p_id}")
        else:
            print(f"Skipping folder {folder_path} due to missing data files.")

00-code_export
No file with 'baseline_FlirtFeatures' in its name found in folder C:\Users\BootMR\Documents\data_export\00-code_export.
baseline_FlirtFeatures missing or empty in folder C:\Users\BootMR\Documents\data_export\00-code_export.
03
Loaded file: C:\Users\BootMR\Documents\data_export\03\03_baseline_FlirtFeatures.csv
C:\Users\BootMR\Documents\data_export\03\03_baseline_FlirtFeatures_means.csv successfully SAVED for 03
04
Loaded file: C:\Users\BootMR\Documents\data_export\04\04_baseline_FlirtFeatures.csv
C:\Users\BootMR\Documents\data_export\04\04_baseline_FlirtFeatures_means.csv successfully SAVED for 04
all_baseline_FlirtFeatures_means.csv
all_ratingswHRV.csv
mastertimesheet-4.xlsx
responses-full-cleaned.xlsx


## baseline correction

In [15]:
# Suppress all FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [18]:
## worked well 3rd Feb

skip_p_ids = [f"{i:02}" for i in range(1)]


# Iterate through each subfolder in the root folder
for p_id in os.listdir(parent_dir):
    folder_path = os.path.join(parent_dir, p_id)
    
    if p_id in skip_p_ids:
        print(f"Skipping folder as instructed: {folder_path} (p_id {p_id})")
        continue
    
    if os.path.isdir(folder_path):
        
        print(f"Processing folder: {folder_path}")
        
        # Initialize an empty DataFrame to store the results for the current p_id
        button_features = pd.DataFrame()
        
        # Load ECG and buttons data
        baseline_FlirtFeatures_means = load_file_into_dataframe(folder_path, 'baseline_FlirtFeatures_means', '.csv', ',')      
        flirtFeatures_30s = load_file_into_dataframe(folder_path, 'features_30s', '.csv', ',')

        if baseline_FlirtFeatures_means is None or baseline_FlirtFeatures_means.empty:
            print(f"baseline_Flirtfeatures missing or empty in folder {folder_path}.")
            continue
        
        if flirtFeatures_30s is None or flirtFeatures_30s.empty:
            print(f"flirtFeatures_30s missing or empty in folder {folder_path}.")
            continue

        # give header to 1st col

        baseline_FlirtFeatures_means.rename(columns={baseline_FlirtFeatures_means.columns[0]: 'timestamp'}, inplace=True)
        baseline_FlirtFeatures_means['timestamp'] = pd.to_datetime(baseline_FlirtFeatures_means['timestamp']).dt.tz_localize(None)

        flirtFeatures_30s.rename(columns={flirtFeatures_30s.columns[0]: 'timestamp'}, inplace=True)
        flirtFeatures_30s['timestamp'] = pd.to_datetime(flirtFeatures_30s['timestamp']).dt.tz_localize(None)
        
        if baseline_FlirtFeatures_means is not None and flirtFeatures_30s is not None:

            # Ensure all columns in baseline_Flirtfeatures exist in flirtFeatures_30s
            for col in baseline_FlirtFeatures_means.columns:
                if col not in flirtFeatures_30s.columns:
                    flirtFeatures_30s[col] = float('nan')  # Fill missing columns with Nans

            # Ignore the timestamp column (assumed to be the first column)
            flirt_columns = baseline_FlirtFeatures_means.columns[1:]

            # Convert baseline features to a NumPy array and subtract it row-wise
            baseline_values = baseline_FlirtFeatures_means.iloc[0, 1:].values  # Extract numeric row as array

            flirtFeatures_30s.loc[:, flirt_columns] = flirtFeatures_30s.loc[:, flirt_columns] - baseline_values

            # Z-standardize each HRV feature
            #flirtFeatures_30s[flirt_columns] = (flirtFeatures_30s[flirt_columns] - flirtFeatures_30s[flirt_columns].mean()) / flirtFeatures_30s[flirt_columns].std()

            # Save the concatenated DataFrame to CSV
            output_file_path = os.path.join(folder_path, f"{p_id}_flirtFeatures_30s_baselinecorrected.csv")
            flirtFeatures_30s.to_csv(output_file_path, index=None)
            print(f"{output_file_path} successfully saved for {p_id}")
        else:
            print(f"Skipping folder {folder_path} due to missing data files.")



Processing folder: C:\Users\BootMR\Documents\data_export\00-code_export
No file with 'baseline_FlirtFeatures_means' in its name found in folder C:\Users\BootMR\Documents\data_export\00-code_export.
No file with 'features_30s' in its name found in folder C:\Users\BootMR\Documents\data_export\00-code_export.
baseline_Flirtfeatures missing or empty in folder C:\Users\BootMR\Documents\data_export\00-code_export.
Processing folder: C:\Users\BootMR\Documents\data_export\03
Loaded file: C:\Users\BootMR\Documents\data_export\03\03_baseline_FlirtFeatures_means.csv
Loaded file: C:\Users\BootMR\Documents\data_export\03\features_30s_03_1716369725_a02601.zip.csv
C:\Users\BootMR\Documents\data_export\03\03_flirtFeatures_30s_baselinecorrected.csv successfully saved for 03
Processing folder: C:\Users\BootMR\Documents\data_export\04
Loaded file: C:\Users\BootMR\Documents\data_export\04\04_baseline_FlirtFeatures_means.csv
Loaded file: C:\Users\BootMR\Documents\data_export\04\features_30s_04_1714996826_a

## merge baselines into one

In [19]:

# Initialize an empty list to store DataFrames
dfs = []

# Iterate through each subfolder in the parent directory
for subdir, dirs, files in os.walk(parent_dir):
    # Check each file in the subfolder
    for file in files:
        if 'baseline_FlirtFeatures_means' in file:
            file_path = os.path.join(subdir, file)
            print(f"Processing file: {file_path}")
            
            # Load the CSV file into a DataFrame
            df = pd.read_csv(file_path)
            
            # Append the DataFrame to the list
            dfs.append(df)

# Concatenate all DataFrames in the list
if dfs:
    all_baseline_HRVfeatures = pd.concat(dfs, ignore_index=True)
    
    # Define the output file path
    output_file_path = os.path.join(parent_dir, 'all_baseline_FlirtFeatures_means.csv')
    
    # Save the concatenated DataFrame to CSV
    all_baseline_HRVfeatures.to_csv(output_file_path, index=False)
    print(f"All files merged into: {output_file_path}")
else:
    print("No files found with 'baseline_HRVfeatures' in the filename.")

Processing file: C:\Users\BootMR\Documents\data_export\all_baseline_FlirtFeatures_means.csv
Processing file: C:\Users\BootMR\Documents\data_export\03\03_baseline_FlirtFeatures_means.csv
Processing file: C:\Users\BootMR\Documents\data_export\04\04_baseline_FlirtFeatures_means.csv
All files merged into: C:\Users\BootMR\Documents\data_export\all_baseline_FlirtFeatures_means.csv
