## imports

In [None]:
import neurokit2 as nk
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt 
import xgboost as xgb
import shap
import scipy.stats as stats
from datetime import datetime, timedelta
import re
from scipy.stats import linregress, ttest_rel, wilcoxon, zscore
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Root directory of the dataset. The literal below is a placeholder and must be
# replaced with a real path before running; the later cells treat every
# subfolder of this directory as one participant folder.
parent_dir = r'<<< PLACE HERE DIRECTORY WITH DATASET >>>'

# Change the current working directory to the dataset root.
# NOTE: this is a process-wide side effect and raises if the path does not exist.
os.chdir(parent_dir)

# Function to load file into a DataFrame
def load_file_into_dataframe(folder_path, var, filetype, sep=','):
    """Load one matching delimited text file from a folder into a DataFrame.

    A file matches when its name ends with ``filetype`` and contains ``var``.
    If several files match, the alphabetically first name is used, so the
    selection is reproducible across platforms.

    Args:
        folder_path: Directory to search (not searched recursively).
        var: Substring that must occur in the file name.
        filetype: Required file-name suffix, e.g. ``'.csv'``.
        sep: Column separator passed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when no file matches, the file is
        empty, or reading fails. A message is printed in every case.
    """
    # os.listdir returns names in arbitrary order; sort so that "first match"
    # always means the same file.
    var_files = sorted(
        f for f in os.listdir(folder_path) if f.endswith(filetype) and var in f
    )

    if not var_files:
        print(f"No file with '{var}' in its name found in folder {folder_path}.")
        return None

    file_path = os.path.join(folder_path, var_files[0])
    try:
        df = pd.read_csv(file_path, sep=sep)
    except pd.errors.EmptyDataError:
        print(f"The file {file_path} is empty.")
        return None
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort batch processing.
        print(f"Error occurred while reading the file {file_path}: {e}")
        return None

    print(f"Loaded file: {file_path}")
    return df

# Print the current working directory so the effect of os.chdir above can be
# checked in the cell output.
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\BootMR\Documents\data_export


In [4]:
##### Notebook display setting: with ast_node_interactivity = "all", IPython
##### displays the value of every top-level expression in a cell, not only the
##### last one. The setting applies to the whole session from this cell onward.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Merge NeuroKit and Flirt features

In [4]:
########### adapted to new filenames
## started 3rd Feb, using script that worked on at 28th Jan
## this seems to be the correct script for merging buttons with flirt and nk features
#MB15-3: worked well when using again to merge and retain all features

''' EXPLANATION

This script merges data from two different feature sets (likely ECG-related and button press-related data) by aligning their timestamps, cleaning the data, and then saving the merged result into a new file. Here's a step-by-step explanation:
1. Setup

    Root directory: The script starts by defining a folder path where participant data is stored.
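    Skip list: A list (skip_p_ids) specifies which participant folders should be skipped during processing. It is currently built from a numeric range as zero-padded two-digit IDs and is compared against the folder name.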

2. Function to load files

    The function load_file_into_dataframe searches for a file that matches a given keyword (var) and file type (filetype).
    If a matching file is found, it is read into a Pandas DataFrame and returned. If the file is missing or empty, the function returns None and prints a message.

3. Timestamp cleaning functions

    remove_milliseconds: This function removes the milliseconds from a timestamp column and formats it to YYYY-MM-DD HH:MM:SS.
    remove_timezone: This function removes the timezone information from the timestamp column.

4. Processing each subfolder

    The script uses os.walk to go through each participant folder (subfolder).
    For each subfolder:
        It checks if the folder should be skipped (if it's in the skip_p_ids list).
        It looks for two specific files in the folder:
            a .csv file whose name contains ratings_HRV_baselinecorrected (button ratings with ECG/HRV features).
            a .csv file whose name contains flirtFeatures_30s_baselinecorrected (feature data from the FLIRT package).

5. Data Cleaning and Merging

    If both files are found:
        The script converts timestamps in both files into datetime format to ensure they can be matched.
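        It shifts the timestamps in the FLIRT features file forward by 2 hours to align them with the ratings file.
        It creates window_start_time / window_end_time (button timestamp minus / plus 15 seconds) when the ratings file does not already contain these columns.
        The helper functions remove_milliseconds and remove_timezone are defined, but the current version of the loop does not call them.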

6. Merging the DataFrames

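    The two data files are not joined on an exact key: for each rating row, the FLIRT row whose timestamp is closest to that rating's window_start_time is selected and appended column-wise.
    The merged result:
        Contains an extra 'timediff_flirtnk' column holding the absolute difference between window_start_time and the matched FLIRT timestamp.
        Is not reordered or renamed here any more; the old column-cleaning code was moved to extendDataset.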

7. Save the Merged Data

    The final merged DataFrame is saved as a new CSV file in the same participant folder with the naming convention p_id_ratingsFeatures_baselcorr_17-3.csv.

In Simple Terms:

    The script goes through each participant folder.
    For each participant, it looks for two specific data files (ECG/button features and FLIRT features).
    It aligns the timestamps, cleans them, merges the two datasets, and reorganizes some columns.
    Finally, it saves the merged data into a new file for each participant.

This ensures the two different data sources are synchronized and combined into a single dataset for analysis.

'''


## merge features from both packages

# List of p_ids to skip. Entries are compared for exact equality with the
# participant folder name (os.path.basename of the folder) in the loop below.
#skip_p_ids = []  # Add any p_ids you want to skip
# range(1) yields only 0, so this evaluates to ['00'].
# NOTE: because the match is exact, a folder such as '00-code_export' is NOT skipped by '00'.
skip_p_ids = [f"{i:02}" for i in range(1)] # skip range

### load file function deleted

# Functions to clean timestamps
def remove_milliseconds(df, column_name):
    """Truncate the timestamps in ``df[column_name]`` to whole seconds, in place.

    The column is parsed as datetimes, rendered as ``YYYY-MM-DD HH:MM:SS`` text
    (which drops the sub-second part) and parsed back to datetimes.
    The DataFrame is modified directly; nothing is returned.
    """
    parsed = pd.to_datetime(df[column_name])
    second_resolution_text = parsed.dt.strftime('%Y-%m-%d %H:%M:%S')
    df[column_name] = pd.to_datetime(second_resolution_text)

def remove_timezone(df, column_name):
    """Strip timezone information from ``df[column_name]``, in place.

    The column is parsed as datetimes and ``tz_localize(None)`` is applied, so
    the wall-clock values are kept and only the timezone is dropped.
    The DataFrame is modified directly; nothing is returned.
    """
    as_datetime = pd.to_datetime(df[column_name])
    naive = as_datetime.dt.tz_localize(None)
    df[column_name] = naive

# Iterate through each subfolder in the root folder and, per participant folder,
# attach to every rating row the FLIRT feature row that is closest in time.
# The result is written to <p_id>_ratingsFeatures_baselcorr_17-3.csv in that folder.

# Shift added to the FLIRT timestamps so they align with the rating timestamps.
# NOTE(review): a fixed 2 h offset looks like a timezone correction; confirm it
# holds for every recording (e.g. across daylight-saving changes).
FLIRT_TIME_SHIFT = pd.Timedelta(hours=2)

# Half-width of the window placed around a button press when the ratings file
# has no window_start_time / window_end_time columns of its own.
WINDOW_HALF_WIDTH = pd.Timedelta(seconds=15)

for subdir, _, files in os.walk(parent_dir):
    # The root itself is not a participant folder.
    if subdir == parent_dir:
        continue

    p_id = os.path.basename(subdir)

    if p_id in skip_p_ids:
        print(f"Skipping folder as instructed: {subdir} (p_id {p_id})")
        continue

    print(f"Processing folder: {p_id}")

    # Load ratings (with HRV features) and FLIRT features
    ratings_HRV_baselinecorrected = load_file_into_dataframe(subdir, 'ratings_HRV_baselinecorrected', '.csv', ',')
    flirtFeatures_30s_baselinecorrected = load_file_into_dataframe(subdir, 'flirtFeatures_30s_baselinecorrected', '.csv', ',')

    if ratings_HRV_baselinecorrected is None or ratings_HRV_baselinecorrected.empty:
        print(f"ratings_HRV_baselinecorrected is missing or empty in folder {subdir}.")
        continue

    if flirtFeatures_30s_baselinecorrected is None or flirtFeatures_30s_baselinecorrected.empty:
        print(f"flirtFeatures_30s_baselinecorrected is missing or empty in folder {subdir}.")
        continue

    # Correct and check timestamps
    flirtFeatures_30s_baselinecorrected.rename(columns={'timestamp': 'timestamp_flirt'}, inplace=True)
    flirtFeatures_30s_baselinecorrected['timestamp_flirt'] = (
        pd.to_datetime(flirtFeatures_30s_baselinecorrected['timestamp_flirt']) + FLIRT_TIME_SHIFT
    )
    ratings_HRV_baselinecorrected['timestamp_button'] = pd.to_datetime(ratings_HRV_baselinecorrected['timestamp_button'])

    # Check and create window_start_time if missing
    if 'window_start_time' not in ratings_HRV_baselinecorrected.columns:
        ratings_HRV_baselinecorrected['window_start_time'] = ratings_HRV_baselinecorrected['timestamp_button'] - WINDOW_HALF_WIDTH
    else:
        ratings_HRV_baselinecorrected['window_start_time'] = pd.to_datetime(ratings_HRV_baselinecorrected['window_start_time'])

    # Check and create window_end_time if missing
    if 'window_end_time' not in ratings_HRV_baselinecorrected.columns:
        ratings_HRV_baselinecorrected['window_end_time'] = ratings_HRV_baselinecorrected['timestamp_button'] + WINDOW_HALF_WIDTH
    else:
        ratings_HRV_baselinecorrected['window_end_time'] = pd.to_datetime(ratings_HRV_baselinecorrected['window_end_time'])

    ### start merge
    # Both frames are guaranteed to be loaded and non-empty here (see the
    # `continue` checks above), so no further None check is needed.
    merged_rows = []  # one single-row DataFrame per matched rating row

    for idx, button_row in ratings_HRV_baselinecorrected.iterrows():
        timestamp = button_row['window_start_time']

        # Absolute distance of every FLIRT row to this rating's window start.
        # NOTE: this helper column is overwritten on each iteration and is
        # carried into the merged output as 'time_diff'.
        flirtFeatures_30s_baselinecorrected['time_diff'] = (flirtFeatures_30s_baselinecorrected['timestamp_flirt'] - timestamp).abs()

        # Without any usable difference (missing window start, or no valid
        # FLIRT timestamp) there is no nearest row; skip this rating instead of
        # aborting the whole batch.
        if flirtFeatures_30s_baselinecorrected['time_diff'].isna().all():
            print(f"No valid timestamps to match rating row {idx} in folder {subdir}; row skipped.")
            continue

        # Find the row with the smallest time difference
        closest_row = flirtFeatures_30s_baselinecorrected.loc[flirtFeatures_30s_baselinecorrected['time_diff'].idxmin()]

        # Turn both rows into single-row DataFrames with index 0 so that the
        # column-wise concat lines them up
        button_row = button_row.to_frame().T.reset_index(drop=True)
        closest_row = closest_row.to_frame().T.reset_index(drop=True)

        merged_row = pd.concat([button_row, closest_row], axis=1)

        # Calculate the time difference (timediff_flirtnk) in a new column
        merged_row['timediff_flirtnk'] = (merged_row['window_start_time'] - merged_row['timestamp_flirt']).abs()

        merged_rows.append(merged_row)

    # pd.concat raises on an empty list, so bail out if nothing was matched.
    if not merged_rows:
        print(f"No rating rows could be matched in folder {subdir}.")
        continue

    # Stack the single-row frames into one DataFrame
    merged_features = pd.concat(merged_rows, ignore_index=True)

    # Column cleanup (dropping helper columns, reordering, H10_/E4_ prefixes)
    # is not done here; that code was moved to extendDataset.

    # Save the merged DataFrame to a new CSV file in the same folder
    output_file_path = os.path.join(subdir, f"{p_id}_ratingsFeatures_baselcorr_17-3.csv")
    merged_features.to_csv(output_file_path, index=None)

    print(f"Saved merged features to: {output_file_path}")


Processing folder: 00-code_export
No file with 'ratings_HRV_baselinecorrected' in its name found in folder C:\Users\BootMR\Documents\data_export\00-code_export.
No file with 'flirtFeatures_30s_baselinecorrected' in its name found in folder C:\Users\BootMR\Documents\data_export\00-code_export.
ratings_HRV_baselinecorrected is missing or empty in folder C:\Users\BootMR\Documents\data_export\00-code_export.
Processing folder: 03
Loaded file: C:\Users\BootMR\Documents\data_export\03\03_ratings_HRV_baselinecorrected.csv
Loaded file: C:\Users\BootMR\Documents\data_export\03\03_flirtFeatures_30s_baselinecorrected.csv
Saved merged features to: C:\Users\BootMR\Documents\data_export\03\03_ratingsFeatures_baselcorr_17-3.csv
Processing folder: 04
No file with 'ratings_HRV_baselinecorrected' in its name found in folder C:\Users\BootMR\Documents\data_export\04.
Loaded file: C:\Users\BootMR\Documents\data_export\04\04_flirtFeatures_30s_baselinecorrected.csv
ratings_HRV_baselinecorrected is missing or

## Merge into one

In [8]:
## merge all individual per-participant files into 1 large file

# Output file, written directly into the root directory
output_file_path = os.path.join(parent_dir, "all_ratingsflirtneurokit.csv")

# Substring that identifies the per-participant merged feature files
feature_file_tag = "ratingsFeatures_baselcorr"

# Columns that are moved to the front of every loaded DataFrame
leading_cols = ['p_id', 'rating', 'timestamp_button']

# Initialize a list to store DataFrames
dataframes = []

# Iterate over all subfolders
for subdir, dirs, files in os.walk(parent_dir):
    # os.walk yields entries in arbitrary order; sorting the directory list in
    # place makes the traversal (and thus the row order of the merged file)
    # reproducible.
    dirs.sort()

    # Skip the root directory itself
    if subdir == parent_dir:
        continue

    # Extract the subfolder name; it is stored as the participant id
    subfolder_name = os.path.basename(subdir)

    # Check for merged feature files in the current subfolder
    for file in sorted(files):
        if feature_file_tag not in file:
            continue

        file_path = os.path.join(subdir, file)
        try:
            # Read the file into a DataFrame
            df = pd.read_csv(file_path)

            # Add the subfolder name as a new column
            df['p_id'] = subfolder_name

            # Reorder the columns: leading columns first, the rest in file order.
            # A file lacking 'rating' or 'timestamp_button' raises KeyError here
            # and is reported by the except branch below.
            cols = df.columns.tolist()
            reordered_cols = leading_cols + [col for col in cols if col not in leading_cols]
            df = df[reordered_cols]

            # Append the DataFrame to the list
            dataframes.append(df)
            print(f"Loaded file: {file_path}, Rows: {len(df)}")
        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole merge.
            print(f"Error reading file {file_path}: {e}")

# Concatenate all DataFrames
if dataframes:
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Save the merged DataFrame to a CSV file
    merged_df.to_csv(output_file_path, index=False)
    print(f"Saved merged DataFrame to: {output_file_path}, Total Rows: {len(merged_df)}")
else:
    print(f"No files with '{feature_file_tag}' found in any subfolder.")


Loaded file: C:\Users\BootMR\Documents\data_export\03\03_ratingsFeatures_baselcorr_17-3.csv, Rows: 8
Saved merged DataFrame to: C:\Users\BootMR\Documents\data_export\all_ratingsflirtneurokit.csv, Total Rows: 8
