## imports

In [None]:
import neurokit2 as nk
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt 
import xgboost as xgb
import shap
import scipy.stats as stats
from datetime import datetime, timedelta
import re
import flirt
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer

# Root directory of the dataset. The placeholder below must be replaced with a
# real path before running: os.chdir() fails if the directory does not exist.
parent_dir = r'<<< PLACE HERE DIRECTORY WITH DATASET >>>'

# Make the dataset root the current working directory
os.chdir(parent_dir)

# Print the resulting working directory as a visual check that the change took effect
print("Current working directory:", os.getcwd())

# Function to load file into a DataFrame
def load_file_into_dataframe(folder_path, var, filetype, sep=','):
    """Load the first matching delimited text file of a folder into a DataFrame.

    A file matches when its name ends with ``filetype`` and contains ``var``.
    Candidates are sorted by name before the first one is taken, because
    ``os.listdir`` returns entries in arbitrary order and the selected file
    would otherwise not be reproducible when several files match.

    Args:
        folder_path: Directory to search (not recursive).
        var: Substring that must occur in the file name.
        filetype: Required file-name ending, e.g. ``'.csv'``.
        sep: Column separator passed to ``pandas.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when no file matches, the file is
        empty, or reading it fails. A message is printed in every case.
    """
    var_files = sorted(
        f for f in os.listdir(folder_path) if f.endswith(filetype) and var in f
    )

    if not var_files:
        print(f"No file with '{var}' in its name found in folder {folder_path}.")
        return None

    file_path = os.path.join(folder_path, var_files[0])
    try:
        df = pd.read_csv(file_path, sep=sep)
    except pd.errors.EmptyDataError:
        print(f"The file {file_path} is empty.")
        return None
    except Exception as e:
        # Best-effort loader: report the problem and let the caller skip this folder
        print(f"Error occurred while reading the file {file_path}: {e}")
        return None

    print(f"Loaded file: {file_path}")
    return df

# Suppress warnings of category UserWarning (and its subclasses) from here on;
# other warning categories are not affected by this filter.
warnings.simplefilter("ignore", UserWarning)

  from .autonotebook import tqdm as notebook_tqdm


Current working directory: C:\Users\BootMR\Documents\data_export


In [2]:
####### worked well at 12th March

'''

Simple Summary of What the Script Does:

    Clean and Prepare Data:
        Remove bad columns: The clean_features() function removes any column of the DataFrame in which more than 5% of the values are missing (NaN) or more than 5% of the values are infinite. Note: it is defined in this cell but not called by the processing loop below.
        Ensure numeric data: The ensure_numeric() function converts all columns to numeric types, forcing any non-numeric values (like strings) to become NaN (missing data).

    Process Each Subfolder:
        The script looks through a parent directory and processes each subfolder one by one.
        For each subfolder, it checks if the folder contains a .zip file.

    Extract Features from the Zip File:
        If a zip file is found in the subfolder, the script uses flirt.with_.empatica to compute features from the zip file. The call can produce:
            HRV (Heart Rate Variability) features
            EDA (Electrodermal Activity) features
            Accelerometer (ACC) features
        In this script only the EDA features are enabled. They are computed for window lengths of 30, 60 and 120 seconds, each with a step size of 1 second between windows.

    Clean and Save Data:
        The script ensures that all features are numeric (using ensure_numeric()).
        It then saves the computed features to a CSV file in the same subfolder where the zip file was located, named <subfolder>_FlirtFeatures_<window>s.csv (one file per window length). Output files that already exist are skipped.

    Error Handling:
        If anything goes wrong while processing a zip file, it catches the error and prints a message without crashing the entire script.

Purpose:

    The script processes zip files in subfolders, extracts windowed features (currently EDA only; HRV and ACC are switched off), converts all values to numeric, and saves the results to CSV files.

'''

# Function to clean the DataFrame
def clean_features(df):
    """Drop columns whose share of NaN values or of +/-inf values exceeds 5%.

    The input DataFrame is not modified. When no column qualifies for
    removal, the same object is returned unchanged; otherwise a new
    DataFrame without the offending columns is returned and the removed
    column names are printed.
    """
    threshold = 0.05  # 5% threshold
    infinities = [float('inf'), float('-inf')]

    columns_to_remove = []
    for col in df.columns:
        series = df[col]
        nan_share = series.isna().mean()
        inf_share = series.isin(infinities).mean()
        if nan_share > threshold or inf_share > threshold:
            columns_to_remove.append(col)

    if not columns_to_remove:
        return df

    print(f"Removing columns due to high NaN/Inf values: {columns_to_remove}")
    return df.drop(columns=columns_to_remove)

# Function to check and convert data to numeric
def ensure_numeric(data):
    """Coerce every column of ``data`` to a numeric dtype, in place.

    Values that cannot be parsed as numbers become NaN. The DataFrame passed
    in is modified and also returned, so the call can be used in an
    assignment.
    """
    for column_name in data.columns:
        coerced = pd.to_numeric(data[column_name], errors='coerce')
        data[column_name] = coerced
    return data

# Iterate over each subfolder in the parent directory and compute FLIRT EDA
# features from the first .zip file found in it, once per window length.
# Each window length is written to its own CSV; existing outputs are skipped,
# so the cell can be re-run to fill in only what is missing.
for subfolder_name in os.listdir(parent_dir):
    subfolder_path = os.path.join(parent_dir, subfolder_name)

    if not os.path.isdir(subfolder_path):
        continue

    # Find the zip file in the subfolder (first match reported by os.listdir)
    zip_file = next(
        (file_name for file_name in os.listdir(subfolder_path) if file_name.endswith('.zip')),
        None,
    )
    if not zip_file:
        continue

    zip_file_path = os.path.join(subfolder_path, zip_file)

    for window_size in [30, 60, 120]:
        output_csv_path = os.path.join(subfolder_path, f'{subfolder_name}_FlirtFeatures_{window_size}s.csv')

        # Check if the file already exists
        if os.path.exists(output_csv_path):
            print(f'Skipping {output_csv_path}, already exists.')
            continue
        print(subfolder_name, window_size)

        # The try block is per window length, so a failure for one length
        # does not prevent the remaining lengths from being computed.
        try:
            # Compute features using flirt.with_.empatica (EDA only)
            features = flirt.with_.empatica(zip_file_path,
                                            window_length=window_size,
                                            window_step_size=1,
                                            hrv_features=False,
                                            eda_features=True,
                                            acc_features=False)

            # Ensure all data is numeric
            features = ensure_numeric(features)

            # Write the DataFrame to a CSV file (the index is kept as the first column)
            features.to_csv(output_csv_path, index=True)
        except Exception as e:
            # Report and move on: one bad recording must not stop the batch
            print(f"Error processing {zip_file_path} (window {window_size}s): {e}")
            continue

        print(f'Features saved to {output_csv_path}')


03 30


EDA features: 100%|██████████| 5250/5250 [00:45<00:00, 116.18it/s]


Unable to remove memmapped file
Features saved to C:\Users\BootMR\Documents\data_export\03\03_FlirtFeatures_30s.csv
03 60


EDA features: 100%|██████████| 5250/5250 [00:23<00:00, 221.30it/s]


Unable to remove memmapped file
Features saved to C:\Users\BootMR\Documents\data_export\03\03_FlirtFeatures_60s.csv
03 120


EDA features: 100%|██████████| 5250/5250 [00:36<00:00, 142.68it/s]


Unable to remove memmapped file
Features saved to C:\Users\BootMR\Documents\data_export\03\03_FlirtFeatures_120s.csv
04 30


EDA features: 100%|██████████| 5043/5043 [00:17<00:00, 283.09it/s]


Unable to remove memmapped file
Features saved to C:\Users\BootMR\Documents\data_export\04\04_FlirtFeatures_30s.csv
04 60


EDA features: 100%|██████████| 5043/5043 [00:19<00:00, 254.38it/s]


Unable to remove memmapped file
Features saved to C:\Users\BootMR\Documents\data_export\04\04_FlirtFeatures_60s.csv
04 120


EDA features: 100%|██████████| 5043/5043 [00:33<00:00, 149.86it/s]


Unable to remove memmapped file
Features saved to C:\Users\BootMR\Documents\data_export\04\04_FlirtFeatures_120s.csv


## merge with buttons

In [None]:

'''

load buttons file
store timestamp in variable
for 30s, find row with timestamp that's 15s before button timestamp
for 60s1, find row with timestamp that's 30s before button timestamp
for 60s2, find row with timestamp that's 20s before button timestamp
for 120s, find row with timestamp that's 60s before button timestamp

add rating to eda values of that row

run this cell 4 times, manually changing the window suffix (in the input and output file names) and the matching time offset each time

'''

## merge features from both packages

# Root directory that is walked by the merge loop below
parent_dir = r'C:\Users\BootMR\Documents\data_processed\CHECKIFYOUREALLYNEEDTHISFOLDER'

# Folder names (p_ids) to skip, as zero-padded two-digit strings;
# range(1) currently yields only "00"
#skip_p_ids = []  # Add any p_ids you want to skip
skip_p_ids = [format(i, "02") for i in range(1)]

### load file function deleted

# Functions to clean timestamps
def remove_milliseconds(df, column_name):
    """Truncate the timestamps in ``df[column_name]`` to whole seconds, in place.

    The column is parsed as datetimes, rendered as
    'YYYY-MM-DD HH:MM:SS' text (which discards the sub-second part) and
    parsed back to datetimes. Nothing is returned.
    """
    parsed = pd.to_datetime(df[column_name])
    second_resolution_text = parsed.dt.strftime('%Y-%m-%d %H:%M:%S')
    df[column_name] = pd.to_datetime(second_resolution_text)

def remove_timezone(df, column_name):
    """Make the timestamps in ``df[column_name]`` timezone-naive, in place.

    The column is parsed as datetimes and its timezone information is
    removed with ``tz_localize(None)``. Nothing is returned.
    """
    parsed = pd.to_datetime(df[column_name])
    df[column_name] = parsed.dt.tz_localize(None)

# Iterate through each subfolder in the root folder and, for every button
# press, attach the FLIRT feature row whose timestamp is nearest to the start
# of that press's window. One merged CSV is written per folder.
for subdir, _, files in os.walk(parent_dir):
    if subdir == parent_dir:
        continue

    p_id = os.path.basename(subdir)

    if p_id in skip_p_ids:
        print(f"Skipping folder as instructed: {subdir} (p_id {p_id})")
        continue

    # NOTE(review): this gate requires a file named exactly 'buttons.csv',
    # while the file loaded below is matched on 'buttons_gps.csv' -- confirm
    # that both are expected to be present.
    if 'buttons.csv' not in files:
        continue

    print(f"Processing folder: {p_id}")

    # Load FLIRT features and buttons data. The features are bound to
    # `flirt_features` (not `flirt`) so the imported `flirt` module is not
    # overwritten and the feature-extraction cell stays re-runnable.
    flirt_features = load_file_into_dataframe(subdir, 'FlirtFeatures_120s', '.csv', ',')
    buttons = load_file_into_dataframe(subdir, 'buttons_gps.csv', '.csv', ',')

    if flirt_features is None or flirt_features.empty:
        print(f"FLIRT features file is missing or empty in folder {subdir}.")
        continue

    if buttons is None or buttons.empty:
        print(f"Buttons file is missing or empty in folder {subdir}.")
        continue

    # The first column of the features file holds the window timestamps
    # (it is the index written by the feature-extraction step).
    flirt_features.rename(columns={flirt_features.columns[0]: 'timestamp'}, inplace=True)
    flirt_features['timestamp'] = pd.to_datetime(flirt_features['timestamp']).dt.tz_localize(None)
    # NOTE(review): fixed +2 h shift, presumably to align the feature
    # timestamps with the clock used for the button timestamps -- confirm;
    # a constant offset does not follow daylight-saving changes.
    flirt_features['timestamp'] += pd.Timedelta(hours=2)

    buttons['timestamp_button'] = pd.to_datetime(buttons['timestamp_button'])

    # Check and create window_start_time if missing (60 s before the press)
    if 'window_start_time' not in buttons.columns:
        buttons['window_start_time'] = buttons['timestamp_button'] - pd.Timedelta(seconds=60)
    else:
        buttons['window_start_time'] = pd.to_datetime(buttons['window_start_time'])

    # Check and create window_end_time if missing (60 s after the press)
    if 'window_end_time' not in buttons.columns:
        buttons['window_end_time'] = buttons['timestamp_button'] + pd.Timedelta(seconds=60)
    else:
        buttons['window_end_time'] = pd.to_datetime(buttons['window_end_time'])

    merged_rows = []  # List to store the merged rows

    for _, button_row in buttons.iterrows():
        # Distance of every feature row to this press's window start; kept
        # as a temporary Series so no scratch column is added to the frame.
        # No maximum distance is enforced: the nearest row is always used.
        time_diff = (flirt_features['timestamp'] - button_row['window_start_time']).abs()
        closest_row = flirt_features.loc[time_diff.idxmin()]

        # Button fields first, followed by the feature fields of the nearest row
        merged_rows.append(pd.concat([button_row, closest_row]))

    # Convert the list of merged rows into a DataFrame
    merged_features = pd.DataFrame(merged_rows)

    # Save the merged DataFrame to a new CSV file in the same folder
    output_file_path = os.path.join(subdir, f"{p_id}_ratingsFlirtfeatures_120s.csv")
    merged_features.to_csv(output_file_path, index=None)
    print(f"Saved merged features to: {output_file_path}")

Processing folder: 02
Loaded file: C:\Users\BootMR\Documents\data_processed\CHECKIFYOUREALLYNEEDTHISFOLDER\02\02_FlirtFeatures_120s.csv
Loaded file: C:\Users\BootMR\Documents\data_processed\CHECKIFYOUREALLYNEEDTHISFOLDER\02\02_buttons_gps.csv
Saved merged features to: C:\Users\BootMR\Documents\data_processed\CHECKIFYOUREALLYNEEDTHISFOLDER\02\02_ratingsFlirtfeatures_120s.csv
Processing folder: 03
Loaded file: C:\Users\BootMR\Documents\data_processed\CHECKIFYOUREALLYNEEDTHISFOLDER\03\03_FlirtFeatures_120s.csv
Loaded file: C:\Users\BootMR\Documents\data_processed\CHECKIFYOUREALLYNEEDTHISFOLDER\03\03_buttons_gps.csv
Saved merged features to: C:\Users\BootMR\Documents\data_processed\CHECKIFYOUREALLYNEEDTHISFOLDER\03\03_ratingsFlirtfeatures_120s.csv
Processing folder: 04
Loaded file: C:\Users\BootMR\Documents\data_processed\CHECKIFYOUREALLYNEEDTHISFOLDER\04\04_FlirtFeatures_120s.csv
Loaded file: C:\Users\BootMR\Documents\data_processed\CHECKIFYOUREALLYNEEDTHISFOLDER\04\04_buttons_gps.csv
Sav

## merge into one

In [3]:
####### merge non-baselinecorrected

# Window-length suffixes; one merged output file is produced per suffix
suffixes = ['30s', '60s1', '60s2', '120s']

# Per-suffix list of the DataFrames collected from the individual folders
merged_data = {}
for suffix in suffixes:
    merged_data[suffix] = []

# Collect the per-folder ratings/feature files, grouped by suffix
for p_id in os.listdir(parent_dir):
    folder_path = os.path.join(parent_dir, p_id)

    if not os.path.isdir(folder_path):
        continue

    print(f"Processing folder: {folder_path}")

    for suffix in suffixes:
        # Expected file name for this folder and suffix
        filename = f"{p_id}_ratingsFlirtfeatures_{suffix}.csv"
        file_path = os.path.join(folder_path, filename)

        if not os.path.exists(file_path):
            print(f"File {file_path} does not exist.")
            continue

        df = pd.read_csv(file_path)
        df.insert(0, 'p_id', p_id)  # Folder name becomes the first column
        merged_data[suffix].append(df)
        print(f"Added {file_path} to merge group '{suffix}'")

# Stack each group into one DataFrame and write it next to the folders
for suffix, dfs in merged_data.items():
    if not dfs:
        print(f"No files to merge for suffix '{suffix}'")
        continue

    merged_df = pd.concat(dfs, ignore_index=True)

    output_file_path = os.path.join(parent_dir, f"merged_Flirtfeatures_{suffix}.csv")
    merged_df.to_csv(output_file_path, index=False)
    print(f"Merged file saved: {output_file_path}")


Processing folder: C:\Users\BootMR\Documents\data_export\00-code_export
File C:\Users\BootMR\Documents\data_export\00-code_export\00-code_export_ratingsFlirtfeatures_30s.csv does not exist.
File C:\Users\BootMR\Documents\data_export\00-code_export\00-code_export_ratingsFlirtfeatures_60s1.csv does not exist.
File C:\Users\BootMR\Documents\data_export\00-code_export\00-code_export_ratingsFlirtfeatures_60s2.csv does not exist.
File C:\Users\BootMR\Documents\data_export\00-code_export\00-code_export_ratingsFlirtfeatures_120s.csv does not exist.
Processing folder: C:\Users\BootMR\Documents\data_export\03
File C:\Users\BootMR\Documents\data_export\03\03_ratingsFlirtfeatures_30s.csv does not exist.
File C:\Users\BootMR\Documents\data_export\03\03_ratingsFlirtfeatures_60s1.csv does not exist.
File C:\Users\BootMR\Documents\data_export\03\03_ratingsFlirtfeatures_60s2.csv does not exist.
File C:\Users\BootMR\Documents\data_export\03\03_ratingsFlirtfeatures_120s.csv does not exist.
Processing fol