## Imports

In [None]:
import neurokit2 as nk
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt 
import xgboost as xgb
import shap
import scipy.stats as stats
from datetime import datetime, timedelta
import re
from scipy.stats import linregress, ttest_rel, wilcoxon, zscore, skew, kurtosis
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import warnings
import requests
import time

# Specify the path to the desired directory
parent_dir = r'<<< PLACE HERE DIRECTORY WITH DATASET >>>'

# Change the current working directory to the specified directory
os.chdir(parent_dir)

#all_ratingsflirtneurokit = pd.read_csv("all_ratingsflirtneurokit.csv")
#all_ratings = pd.read_csv("all_merged_cleaned_wspeed.csv")


#responses = pd.read_excel("responses-sociodem.xlsx")
mastertimesheet = pd.read_excel("mastertimesheet-4.xlsx")
time_cols = ['startt1', 'stopt1', 'startt2', 'stopt2', 'startt3', 'stopt3']
mastertimesheet[time_cols] = mastertimesheet[time_cols].apply(pd.to_datetime)

sociodem = pd.read_excel("responses-full-cleaned.xlsx")

# Verify that the working directory has been changed
print("Current working directory:", os.getcwd())

# Function to load file into a DataFrame
def load_file_into_dataframe(folder_path, var, filetype, sep=','):
    var_files = [f for f in os.listdir(folder_path) if f.endswith(filetype) and var in f]
    
    if var_files:
        file_path = os.path.join(folder_path, var_files[0])
        try:
            df = pd.read_csv(file_path, sep=sep)
            #print(f"Loaded file: {file_path}")

            # Check if 'p_id' column exists
            if 'p_id' in df.columns:
                # Add leading zeros to numbers between 0 and 10 in the 'p_id' column
                df['p_id'] = df['p_id'].apply(lambda x: f'{int(x):02d}' if isinstance(x, (int, float)) and 0 <= int(x) < 10 else x)
                #print("Added leading zeros to 'p_id' column for numbers between 0 and 10.")

            return df
        except pd.errors.EmptyDataError:
            print(f"The file {file_path} is empty.")
            return None
        except Exception as e:
            print(f"Error occurred while reading the file {file_path}: {e}")
            return None
    else:
        #print(f"No file with '{var}' in its name found in folder {folder_path}.")
        return None
        

  from .autonotebook import tqdm as notebook_tqdm


Current working directory: C:\Users\BootMR\Documents\data_export


In [2]:

# Suppress specific runtime warnings
warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*mean of empty slice.*")
warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*invalid value encountered in divide.*")
warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*Degrees of freedom <= 0 for slice.*")
warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*divide by zero encountered in divide.*")
warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*invalid value encountered in multiply.*")
warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*invalid value encountered in scalar divide.*")
# Suppress specific warnings
warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*Precision loss occurred.*")



## clean columns

In [10]:
# worked well 17-3

skip_p_ids = [f"{i:02}" for i in range(1)]


# Iterate through each subfolder in the root folder
for p_id in os.listdir(parent_dir):
    folder_path = os.path.join(parent_dir, p_id)
    
    if p_id in skip_p_ids:
        print(f"Skipping folder as instructed: {folder_path} (p_id {p_id})")
        continue
    
    if os.path.isdir(folder_path):
        
        print(f"Processing folder: {folder_path}")
        
        # Initialize an empty DataFrame to store the results for the current p_id
        button_features = pd.DataFrame()
        
        # Load ECG and buttons data
         
        ratings_features = load_file_into_dataframe(folder_path, 'ratingsFeatures_baselcorr_17-3', '.csv', ',')

        if ratings_features is None or ratings_features.empty:
            print(f"ratings_features missing or empty in folder {folder_path}.")
            continue
        
        #if ratings_wHRV is None or ratings_wHRV.empty:
        #    print(f"ratings_wHRV missing or empty in folder {folder_path}.")
        #    continue
        
        if ratings_features is not None:

            ####### remove and reorder columns

            # 1. Delete specified columns
            columns_to_delete = ['hr_mean.1', 'hr_stdev.1', 'hrv_mean.1', 'hrv_stdev.1', 'time_diff']
            ratings_features.drop(columns=columns_to_delete, inplace=True, errors='ignore')

            # 2. Add prefix "H10_" to specific columns
            columns_to_prefix = ["hr_stdev", "hr_mean", "hrv_mean", "hrv_stdev"]
            ratings_features.rename(columns={col: f"H10_{col}" for col in columns_to_prefix if col in ratings_features.columns}, inplace=True)

            # 3. Add suffix "_H10" to all columns that start with "HRV_"
            ratings_features.rename(columns={col: f"H10_{col}" for col in ratings_features.columns if col.startswith("HRV_")}, inplace=True)

            # 4. Add suffix "_E4" to all columns that start with "hrv_", "eda_", or "acc_"
            ratings_features.rename(columns={col: f"E4_{col}" for col in ratings_features.columns if col.startswith(("num_", "hrv_", "eda_", "acc_"))}, inplace=True)

            # 5. Rename "time_difference" to "timediff_buttonloc"
            ratings_features.rename(columns={"time_difference": "timediff_buttonloc"}, inplace=True)

            # 6. Move "timediff_flirtnk" to be between "timediff_buttonloc" and "window_start_time"
            columns = list(ratings_features.columns)
            if "timediff_flirtnk" in columns and "timediff_buttonloc" in columns and "window_start_time" in columns:
                columns.remove("timediff_flirtnk")
                timediff_buttonloc_index = columns.index("timediff_buttonloc")
                columns.insert(timediff_buttonloc_index + 1, "timediff_flirtnk")
                ratings_features = ratings_features[columns]

            # 7. Move column timestamp_flirt
            columns = list(ratings_features.columns)
            if "timestamp_location" in columns and "latitude" in columns and "timestamp_flirt" in columns:
                columns.remove("timestamp_flirt")
                timediff_buttonloc_index = columns.index("timestamp_location")
                columns.insert(timediff_buttonloc_index + 1, "timestamp_flirt")
                ratings_features = ratings_features[columns]

            # 8. Rename and reorder first set of descriptive columns

            # Rename columns first
            rename_mapping = {
                "timediff_buttonloc": "timestamp_timediff_buttonloc",
                "timediff_flirtnk": "timestamp_timediff_flirtnk",
                "window_start_time": "timestamp_window_start_time",
                "window_end_time": "timestamp_window_end_time"
            }
            ratings_features = ratings_features.rename(columns=rename_mapping)

            # Define the desired order (after renaming)
            desired_order = [
                "rating", 
                "latitude", "longitude", 
                "timestamp_button", "timestamp_location", "timestamp_flirt", 
                "timestamp_timediff_buttonloc", "timediff_flirtnk", 
                "timestamp_window_start_time", "timestamp_window_end_time"
            ]

            # Ensure all desired columns exist before reordering
            existing_columns = list(ratings_features.columns)
            ordered_columns = [col for col in desired_order if col in existing_columns] + [
                col for col in existing_columns if col not in desired_order
            ]

            # Apply reordering
            ratings_features = ratings_features[ordered_columns]

            
            # Save the corrected DataFrame to a new CSV file in the same folder
            output_file_path = os.path.join(folder_path, f"{p_id}_ratingsFeatures_baselcorr_17-3_C1.csv")
            ratings_features.to_csv(output_file_path, index=None) 
            print(f"Saved merged features to: {output_file_path}")



Processing folder: C:\Users\BootMR\Documents\data_export\00-code_export
ratings_features missing or empty in folder C:\Users\BootMR\Documents\data_export\00-code_export.
Processing folder: C:\Users\BootMR\Documents\data_export\03
Saved merged features to: C:\Users\BootMR\Documents\data_export\03\03_ratingsFeatures_baselcorr_17-3_C1.csv
Processing folder: C:\Users\BootMR\Documents\data_export\04
ratings_features missing or empty in folder C:\Users\BootMR\Documents\data_export\04.


## add warning data

In [11]:
### worked well 5th Feb. script to add warning data to baseline corrected features
######## executed well already at 5th feb; be careful about executing it again as it will give errors
# still worked well at 17-3

# List of p_ids to skip
skip_p_ids = [f"{i:02}" for i in range(1)]

for p_id in os.listdir(parent_dir):
    folder_path = os.path.join(parent_dir, p_id)
    
    if p_id in skip_p_ids:
        print(f"Skipping folder as instructed: {folder_path} (p_id {p_id})")
        continue
    
    if os.path.isdir(folder_path):
        print(f"Processing folder: {folder_path}")
        
        # Load data
        ratings_features = load_file_into_dataframe(folder_path, 'ratingsFeatures_baselcorr_17-3_C1', '.csv', ',')
        warning_data = load_file_into_dataframe(folder_path, 'warning_data_merged.csv', '.csv', ',')

        if ratings_features is None or ratings_features.empty:
            print(f"ratings_features missing or empty in folder {folder_path}.")
            continue
        
        if warning_data is None or warning_data.empty:
            print(f"warning_data missing or empty in folder {folder_path}.")
            continue
        
        if ratings_features is not None and warning_data is not None:
            ratings_features['timestamp_button'] = pd.to_datetime(ratings_features['timestamp_button'], utc=True).dt.tz_localize(None)
            warning_data['timestamp'] = pd.to_datetime(warning_data['timestamp'], utc=True).dt.tz_localize(None)

            for index, row in ratings_features.iterrows():
                window_start_time = row['timestamp_window_start_time']
                window_end_time = row['timestamp_window_end_time']

                # Filter for timestamps within the time window
                warnings_in_window = warning_data[
                    (warning_data['timestamp'] >= window_start_time) &
                    (warning_data['timestamp'] <= window_end_time) &
                    (warning_data['warning_type'] == "SlowDown")  # Only select "SlowDown" warnings
                ]

                slowdown_count = len(warnings_in_window)
                audio_warning = int(warnings_in_window['audio_warning'].any())  
                tactile_warning = int(warnings_in_window['tactile_warning'].any())  

                # Compute "warningsonoff" column
                warningsonoff = 1 if (audio_warning or tactile_warning) else 0

                # Compute "warning_type_text" column
                if audio_warning and tactile_warning:
                    warning_type_text = "audio, tactile"
                elif audio_warning:
                    warning_type_text = "audio"
                elif tactile_warning:
                    warning_type_text = "tactile"
                else:
                    warning_type_text = "none"

                # Store unique p_id and round_id values
                p_id_values = warnings_in_window['p_id'].unique()
                round_id_values = warnings_in_window['round_id'].unique()

                p_id_str = ",".join(map(str, p_id_values)) if p_id_values.size > 0 else ""
                round_id_str = ",".join(map(str, round_id_values)) if round_id_values.size > 0 else ""

                # Update ratings_features
                ratings_features.at[index, "warnings_slowdown_count"] = slowdown_count
                ratings_features.at[index, "warnings_audio_warning"] = audio_warning
                ratings_features.at[index, "warnings_tactile_warning"] = tactile_warning
                ratings_features.at[index, "warnings_onoff"] = warningsonoff
                ratings_features.at[index, "warnings_type_text"] = warning_type_text

            # Ensure correct column order
            desired_columns = list(ratings_features.columns)  

            # Find indices of key positions
            idx_start = desired_columns.index("timestamp_timediff_flirtnk") + 1
            idx_end = desired_columns.index("H10_hr_mean")

            # List of new columns to insert
            new_columns = ["warnings_slowdown_count", "warnings_audio_warning", "warnings_tactile_warning", "warnings_onoff", "warnings_type_text"]

            # Remove duplicates if they already exist
            desired_columns = [col for col in desired_columns if col not in new_columns]

            # Reinsert the warning columns at the correct position
            reordered_columns = (
                desired_columns[:idx_start] + new_columns + desired_columns[idx_start:idx_end] + desired_columns[idx_end:]
            )

            # Apply the new column order
            ratings_features = ratings_features[reordered_columns]

            # Save the corrected DataFrame to a new CSV file
            output_file_path = os.path.join(folder_path, f"{p_id}_ratingsFeatures_baselcorr_17-3_C1W.csv")
            ratings_features.to_csv(output_file_path, index=None) 
            print(f"Saved merged features to: {output_file_path}")


Processing folder: C:\Users\BootMR\Documents\data_export\00-code_export
ratings_features missing or empty in folder C:\Users\BootMR\Documents\data_export\00-code_export.
Processing folder: C:\Users\BootMR\Documents\data_export\03
Saved merged features to: C:\Users\BootMR\Documents\data_export\03\03_ratingsFeatures_baselcorr_17-3_C1W.csv
Processing folder: C:\Users\BootMR\Documents\data_export\04
ratings_features missing or empty in folder C:\Users\BootMR\Documents\data_export\04.


## add p_id and interval_id

In [13]:
######## executed well already at 5th feb; be careful about executing it again as it will give errors

# List of p_ids to skip
#skip_p_ids = []  # Add any p_ids you want to skip
skip_p_ids = [f"{i:02}" for i in range(1)]

# Define a function to determine the interval_id for each timestamp
def get_interval_id(timestamp, mastertimesheet_row):
    if mastertimesheet_row['startt1'] <= timestamp <= mastertimesheet_row['stopt1']:
        return 1
    elif mastertimesheet_row['startt2'] <= timestamp <= mastertimesheet_row['stopt2']:
        return 2
    elif mastertimesheet_row['startt3'] <= timestamp <= mastertimesheet_row['stopt3']:
        return 3
    else:
        return np.nan  # If the timestamp doesn't fall in any interval, return NaN

# Iterate through each subfolder in the root folder
for p_id in os.listdir(parent_dir):
    folder_path = os.path.join(parent_dir, p_id)
    
    if p_id in skip_p_ids:
        print(f"Skipping folder as instructed: {folder_path} (p_id {p_id})")
        continue
    
    if os.path.isdir(folder_path):
        print(f"Processing folder: {folder_path}")
        
        # Load data
        ratings_features = load_file_into_dataframe(folder_path, 'ratingsFeatures_baselcorr_17-3_C1W', '.csv', ',')
        
        # Ensure ratings_features is valid before proceeding
        if ratings_features is None or ratings_features.empty:
            print(f"⚠️ ratings_features missing or empty in folder {folder_path}. Skipping...")
            continue

        # Convert timestamp column
        ratings_features['timestamp_button'] = pd.to_datetime(ratings_features['timestamp_button'], utc=True).dt.tz_localize(None)

        # Initialize new columns
        ratings_features['p_id'] = p_id  # Assign current p_id
        ratings_features['interval_id'] = np.nan  # Initialize interval_id column
        
        # Compute rating_totalperpid (total ratings per p_id)
        total_ratings_per_p_id = len(ratings_features)
        ratings_features['rating_totalperpid'] = total_ratings_per_p_id  # Added only once here

        # Ensure p_id, interval_id, and rating columns are placed correctly
        column_order = ['p_id', 'interval_id', 'rating'] + [col for col in ratings_features.columns if col not in ['p_id', 'interval_id', 'rating']] 
        column_order.insert(column_order.index('rating') + 1, 'rating_totalperpid')  # Insert 'rating_totalperpid' after 'rating'
        
        
        # Iterate through each row of ratings_features to determine interval_id
        for i, rating_row in ratings_features.iterrows():
            timestamp = rating_row['timestamp_button']
            
            # Iterate through mastertimesheet to find the corresponding interval_id
            for j, master_row in mastertimesheet.iterrows():
                interval_id = get_interval_id(timestamp, master_row)
                if not pd.isna(interval_id):  # If an interval was found, assign it and break the loop
                    ratings_features.at[i, 'interval_id'] = interval_id
                    break

        # Apply the new column order
        ratings_features = ratings_features[column_order]

        # Save the corrected DataFrame to a new CSV file in the same folder
        output_file_path = os.path.join(folder_path, f"{p_id}_ratingsFeatures_baselcorr_17-3_C1WP.csv")
        ratings_features.to_csv(output_file_path, index=None) 
        print(f"Saved merged features to: {output_file_path}")


Processing folder: C:\Users\BootMR\Documents\data_export\00-code_export
⚠️ ratings_features missing or empty in folder C:\Users\BootMR\Documents\data_export\00-code_export. Skipping...
Processing folder: C:\Users\BootMR\Documents\data_export\03
Saved merged features to: C:\Users\BootMR\Documents\data_export\03\03_ratingsFeatures_baselcorr_17-3_C1WP.csv
Processing folder: C:\Users\BootMR\Documents\data_export\04
⚠️ ratings_features missing or empty in folder C:\Users\BootMR\Documents\data_export\04. Skipping...


## add cadence data

In [15]:
# List of p_ids to skip
skip_p_ids = [f"{i:02}" for i in range(1)]

# Iterate through each subfolder in the root folder
for p_id in os.listdir(parent_dir):
    folder_path = os.path.join(parent_dir, p_id)
    
    if p_id in skip_p_ids:
        print(f"Skipping folder as instructed: {folder_path} (p_id {p_id})")
        continue
    
    if os.path.isdir(folder_path):
        
        print(f"Processing folder: {folder_path}")
                
        ratings_features = load_file_into_dataframe(folder_path, 'ratingsFeatures_baselcorr_17-3_C1WP', 'csv', ',')
        cadence = load_file_into_dataframe(folder_path, '_cadence_merged', 'csv', ',')

        if ratings_features is None or ratings_features.empty:
            print(f"ratings_features missing or empty in folder {folder_path}.")
            continue

        if cadence is None or cadence.empty:
            print(f"cadence missing or empty in folder {folder_path}. Leaving cadence columns empty.")
            # If cadence file is missing, leave cadence columns empty
            cadence_columns = [
                "cadence_avg", "cadence_min", "cadence_max", "cadence_std", 
                "cadence_avgacc", "cadence_slope", "cadence_cv", 
                "cadence_skewness", "cadence_kurtosis", "cadence_pvr", 
                "cadence_autocorrelation"
            ]
            for col in cadence_columns:
                ratings_features[col] = np.nan  # Set cadence columns to NaN (empty)
        else:
            cadence['timestamp'] = pd.to_datetime(cadence['timestamp'])
            results = []
            
            # Loop through each rating in the selected DataFrame
            for index, row in ratings_features.iterrows():
                window_start_time = row['timestamp_window_start_time']
                window_end_time = row['timestamp_window_end_time']

                cadence_in_window = cadence[(cadence['timestamp'] >= window_start_time) &
                                            (cadence['timestamp'] <= window_end_time)]

                avg_cadence = round(cadence_in_window['cadence_tpmn'].mean(), 1)
                min_cadence = round(cadence_in_window['cadence_tpmn'].min(), 1)
                max_cadence = round(cadence_in_window['cadence_tpmn'].max(), 1)
                std_cadence = round(cadence_in_window['cadence_tpmn'].std(), 1)

                cadence_diff = cadence_in_window['cadence_tpmn'].diff()
                avg_acceleration = round(cadence_diff.mean(), 1)

                cadence_in_window_clean = cadence_in_window.dropna(subset=['cadence_tpmn'])
                if not cadence_in_window_clean.empty:
                    time = np.arange(len(cadence_in_window_clean)).reshape(-1, 1)
                    from sklearn.linear_model import LinearRegression
                    model = LinearRegression().fit(time, cadence_in_window_clean['cadence_tpmn'])
                    slope = round(model.coef_[0], 3)
                else:
                    slope = np.nan

                cv_cadence = round(cadence_in_window['cadence_tpmn'].std() / cadence_in_window['cadence_tpmn'].mean() * 100, 1)

                from scipy.stats import skew, kurtosis
                cadence_skewness = round(skew(cadence_in_window['cadence_tpmn']), 2)
                cadence_kurtosis = round(kurtosis(cadence_in_window['cadence_tpmn']), 2)

                peaks = cadence_in_window['cadence_tpmn'][cadence_in_window['cadence_tpmn'] == cadence_in_window['cadence_tpmn'].rolling(3).max()]
                valleys = cadence_in_window['cadence_tpmn'][cadence_in_window['cadence_tpmn'] == cadence_in_window['cadence_tpmn'].rolling(3).min()]
                peak_valley_ratio = round(len(peaks) / len(valleys) if len(valleys) > 0 else 0, 2)

                autocorr = round(np.corrcoef(cadence_in_window['cadence_tpmn'], cadence_in_window['cadence_tpmn'].shift(1))[0, 1], 2)

                # Append the results to ratings_features
                ratings_features.at[index, "cadence_avg"] = avg_cadence
                ratings_features.at[index, "cadence_min"] = min_cadence
                ratings_features.at[index, "cadence_max"] = max_cadence
                ratings_features.at[index, "cadence_std"] = std_cadence
                ratings_features.at[index, "cadence_avgacc"] = avg_acceleration
                ratings_features.at[index, "cadence_slope"] = slope
                ratings_features.at[index, "cadence_cv"] = cv_cadence
                ratings_features.at[index, "cadence_skewness"] = cadence_skewness
                ratings_features.at[index, "cadence_kurtosis"] = cadence_kurtosis
                ratings_features.at[index, "cadence_pvr"] = peak_valley_ratio
                ratings_features.at[index, "cadence_autocorrelation"] = autocorr

        # Save the corrected DataFrame to a new CSV file in the same folder
        output_file_path = os.path.join(folder_path, f"{p_id}_ratingsFeatures_baselcorr_17-3_C1WPC.csv")
        ratings_features.to_csv(output_file_path, index=None)
        print(f"Saved merged features to: {output_file_path}")


Processing folder: C:\Users\BootMR\Documents\data_export\00-code_export
ratings_features missing or empty in folder C:\Users\BootMR\Documents\data_export\00-code_export.
Processing folder: C:\Users\BootMR\Documents\data_export\03
cadence missing or empty in folder C:\Users\BootMR\Documents\data_export\03. Leaving cadence columns empty.
Saved merged features to: C:\Users\BootMR\Documents\data_export\03\03_ratingsFeatures_baselcorr_17-3_C1WPC.csv
Processing folder: C:\Users\BootMR\Documents\data_export\04
ratings_features missing or empty in folder C:\Users\BootMR\Documents\data_export\04.


## add speed

In [17]:
#### TO BE DONE

#took script from add cadence, let's re use it


# List of p_ids to skip
#skip_p_ids = []  # Add any p_ids you want to skip
skip_p_ids = [f"{i:02}" for i in range(1)]

# Iterate through each subfolder in the root folder
for p_id in os.listdir(parent_dir):
    folder_path = os.path.join(parent_dir, p_id)
    
    if p_id in skip_p_ids:
        print(f"Skipping folder as instructed: {folder_path} (p_id {p_id})")
        continue
    
    if os.path.isdir(folder_path):
        
        print(f"Processing folder: {folder_path}")
                
        ratings_features = load_file_into_dataframe(folder_path, 'ratingsFeatures_baselcorr_17-3_C1WPC', 'csv', ',')
        location_data = load_file_into_dataframe(folder_path, '_location_data_merged', 'csv', ',')

        if ratings_features is None or ratings_features.empty:
            print(f"ratings_features missing or empty in folder {folder_path}.")
            continue

        if location_data is None or location_data.empty:
            print(f"cadence missing or empty in folder {folder_path}.")
            continue 

        results = []
        
        if ratings_features is not None and location_data is not None:

            # Loop through each rating in the selected DataFrame
            for index, row in ratings_features.iterrows():
                # Store window_start_time and window_end_time
                window_start_time = row['timestamp_window_start_time']
                window_end_time = row['timestamp_window_end_time']

                # Filter warnings based on the time window
                location_in_window = location_data[(location_data['timestamp'] >= window_start_time) &
                                                        (location_data['timestamp'] <= window_end_time)]

                if location_in_window.empty:
                        continue
                
                # Velocity Features
                avg_velocity = round(location_in_window['velocity'].mean(), 2)
                min_velocity = round(location_in_window['velocity'].min(), 2)
                max_velocity = round(location_in_window['velocity'].max(), 2)
                std_velocity = round(location_in_window['velocity'].std(), 2)
                
                velocity_diff = location_in_window['velocity'].diff()
                avg_velocity_change = round(velocity_diff.mean(), 2)
                velocity_cv = round(std_velocity / avg_velocity * 100 if avg_velocity else 0, 2)
                
                velocity_skewness = round(skew(location_in_window['velocity'].dropna()), 2)
                velocity_kurtosis = round(kurtosis(location_in_window['velocity'].dropna()), 2)
                
                # Linear Regression on Velocity (Trend Analysis)
                velocity_clean = location_in_window.dropna(subset=['velocity'])
                if not velocity_clean.empty:
                    time = np.arange(len(velocity_clean)).reshape(-1, 1)
                    model = LinearRegression().fit(time, velocity_clean['velocity'])
                    velocity_slope = round(model.coef_[0], 3)
                else:
                    velocity_slope = np.nan

                ratings_features.at[index, "velocity_avg"] = avg_velocity
                ratings_features.at[index, "velocity_min"] = min_velocity
                ratings_features.at[index, "velocity_max"] = max_velocity
                ratings_features.at[index, "velocity_std"] = std_velocity
                ratings_features.at[index, "velocity_avg_change"] = avg_velocity_change
                ratings_features.at[index, "velocity_cv"] = velocity_cv
                ratings_features.at[index, "velocity_skewness"] = velocity_skewness
                ratings_features.at[index, "velocity_kurtosis"] = velocity_kurtosis
                ratings_features.at[index, "velocity_slope"] = velocity_slope

            # Save the corrected DataFrame to a new CSV file in the same folder
            output_file_path = os.path.join(folder_path, f"{p_id}_ratingsFeatures_baselcorr_17-3_C1WPCS.csv")
            ratings_features.to_csv(output_file_path, index=None) 
            print(f"Saved merged features to: {output_file_path}")

Processing folder: C:\Users\BootMR\Documents\data_export\00-code_export
ratings_features missing or empty in folder C:\Users\BootMR\Documents\data_export\00-code_export.
Processing folder: C:\Users\BootMR\Documents\data_export\03
Saved merged features to: C:\Users\BootMR\Documents\data_export\03\03_ratingsFeatures_baselcorr_17-3_C1WPCS.csv
Processing folder: C:\Users\BootMR\Documents\data_export\04
ratings_features missing or empty in folder C:\Users\BootMR\Documents\data_export\04.


## add weather? 

In [19]:
### short script to use weather data from initial files, without pulling new requests via API 

# Variable to track the overall status
all_match = True

# Iterate through each subfolder
for subfolder in os.listdir(parent_dir):
    subfolder_path = os.path.join(parent_dir, subfolder)

    # Ensure the path is a directory and subfolder is a numeric name
    if os.path.isdir(subfolder_path) and subfolder.isdigit():
        # Construct file paths
        file1 = os.path.join(subfolder_path, f"{subfolder}_ratingsFeatures_baselcorr_17-3_C1WPCS.csv")
        file2 = os.path.join(subfolder_path, f"{subfolder}_ratingsFeatures_baselcorr_extended-WEATHER.csv")

        # Check if both files exist in the subfolder
        if os.path.exists(file1) and os.path.exists(file2):
            # Load the files
            try:
                df1 = pd.read_csv(file1)
                df2 = pd.read_csv(file2)
                
                # Check row counts
                rows_file1 = len(df1)
                rows_file2 = len(df2)
                
                if rows_file1 != rows_file2:
                    print(f"Row count mismatch in subfolder {subfolder}:")
                    print(f"  {file1} has {rows_file1} rows")
                    print(f"  {file2} has {rows_file2} rows")
                    all_match = False
                else:
                    # Merge the two dataframes on index (assuming they correspond row-wise)
                    weather_columns = ['temperature', 'wind_speed', 'wind_direction', 'humidity', 'cloudiness', 'rain_3h']
                    
                    # Add a suffix 'weather_' to the column names
                    df2_weather = df2[weather_columns].add_prefix('weather_')
                                        
                    # Concatenate weather columns to the first dataframe
                    df_merged = pd.concat([df1, df2_weather], axis=1)

                    # Save the merged dataframe to a new file
                    output_file_path = os.path.join(subfolder_path, f"{subfolder}_ratingsFeatures_baselcorr_17-3_C1WPCSW.csv")
                    df_merged.to_csv(output_file_path, index=False)
                    print(f"Saved merged file with weather data to: {output_file_path}")
            
            except Exception as e:
                print(f"Error processing files in subfolder {subfolder}: {e}")
        else:
            test = 0
            #print(f"Skipping subfolder {subfolder} as one or both files are missing.")

# Final conclusion
if all_match:
    print("Done and All folders with both files contain the same number of rows.")
else:
    print("Some folders with both files do not contain the same number of rows.")


Done and All folders with both files contain the same number of rows.


In [23]:
# complete script to pull weather data via api for each rating


# OpenWeatherMap API key
API_KEY = '28269a413589e6c976a00aefd4bdf6b5'
BASE_URL = 'https://history.openweathermap.org/data/2.5/history/city'

def get_weather_data(lat, lon, timestamp):
    # Convert timestamp to Unix time (seconds)
    start = int(timestamp.timestamp())
    end = start + 3600  # 1 hour later (to ensure data is captured)

    url = f"{BASE_URL}?lat={lat}&lon={lon}&type=hour&start={start}&end={end}&appid={API_KEY}&units=metric"

    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()

        if 'list' in data and len(data['list']) > 0:
            # Take the closest weather data (assuming sorted by time)
            weather = data['list'][0]
            temp = weather['main']['temp']
            wind_speed = weather['wind']['speed']
            wind_direction = weather['wind']['deg']
            humidity = weather['main']['humidity']
            cloudiness = weather['clouds']['all']
            rain = weather.get('rain', {}).get('3h', 0)  # Default to 0 if no rain data

            return temp, wind_speed, wind_direction, humidity, cloudiness, rain
        else:
            return None, None, None, None, None, None

    except Exception as e:
        print(f"Error fetching weather data for {lat}, {lon} at {timestamp}: {e}")
        return None, None, None, None, None, None


# List of p_ids to skip
#skip_p_ids = []  # Add any p_ids you want to skip
skip_p_ids = [f"{i:02}" for i in range(1)]

# Iterate through each subfolder in the root folder
for p_id in os.listdir(parent_dir):
    folder_path = os.path.join(parent_dir, p_id)
    
    if p_id in skip_p_ids:
        print(f"Skipping folder as instructed: {folder_path} (p_id {p_id})")
        continue
    
    if os.path.isdir(folder_path):
        
        print(f"Processing folder: {folder_path}")
                
        ratings_features = load_file_into_dataframe(folder_path, 'ratingsFeatures_baselcorr_17-3_C1WPCS', 'csv', ',')

        if ratings_features is None or ratings_features.empty:
            print(f"ratings_features missing or empty in folder {folder_path}.")
            continue

        ratings_features['timestamp_button'] = pd.to_datetime(ratings_features['timestamp_button'], errors='coerce')

        results = []
        
        if ratings_features is not None:

            # Add empty columns for weather data
            ratings_features['temperature'] = None
            ratings_features['wind_speed'] = None
            ratings_features['wind_direction'] = None
            
            ratings_features['humidity'] = None
            ratings_features['cloudiness'] = None
            ratings_features['rain_3h'] = None

            # Loop through each rating in the selected DataFrame
            for index, row in ratings_features.iterrows():

                temp, wind_speed, wind_direction, humidity, cloudiness, rain = get_weather_data(row['latitude'], row['longitude'], row['timestamp_button'])

                # To avoid exceeding API rate limits
                time.sleep(1)
                print(row['latitude'], row['longitude'], row['timestamp_button'])
                print(index, temp, wind_speed, wind_direction, humidity, cloudiness, rain)

                ratings_features.at[index, 'weather_temperature'] = temp
                ratings_features.at[index, 'weather_wind_speed'] = wind_speed
                ratings_features.at[index, 'weather_wind_direction'] = wind_direction
                ratings_features.at[index, 'weather_humidity'] = humidity
                ratings_features.at[index, 'weather_cloudiness'] = cloudiness
                ratings_features.at[index, 'weather_rain_3h'] = rain


            # Save the corrected DataFrame to a new CSV file in the same folder
            output_file_path = os.path.join(folder_path, f"{p_id}_ratingsFeatures_baselcorr_17-3_C1WPCSW.csv")
            ratings_features.to_csv(output_file_path, index=None)
            print(f"Saved merged features to: {output_file_path}")

Processing folder: C:\Users\BootMR\Documents\data_export\00-code_export
ratings_features missing or empty in folder C:\Users\BootMR\Documents\data_export\00-code_export.
Processing folder: C:\Users\BootMR\Documents\data_export\03
52.23802608333334 6.8573067666666665 2024-05-22 11:43:04.127494
0 17.75 3.58 273 76 100 0
52.231734633333325 6.866081649999999 2024-05-22 11:47:22.196925
1 17.75 3.58 273 76 100 0
52.23099665 6.859504600000001 2024-05-22 11:50:47.987902
2 17.75 3.58 273 76 100 0
52.23502611666666 6.858946116666668 2024-05-22 12:06:18.554372
3 17.26 4.02 254 77 100 0
52.2322988 6.864600983333334 2024-05-22 12:07:35.941486
4 17.26 4.02 254 77 100 0
52.23219774999999 6.864867783333334 2024-05-22 12:07:38.644613
5 17.26 4.02 254 77 100 0
52.23620081666666 6.858800366666666 2024-05-22 12:27:01.459779
6 17.26 4.02 254 77 100 0
52.23226471666667 6.864698216666667 2024-05-22 12:28:42.473445
7 17.26 4.02 254 77 100 0
Saved merged features to: C:\Users\BootMR\Documents\data_export\03\03

## add sociodemographics

In [25]:

# Assuming 'P_ID' in sociodem and 'p_id' in ratings_features are the linking columns

# Create a dictionary to map the old column names to more meaningful lower case names
column_name_mapping = {
    'P_ID': 'P_ID',
    'Nationality': 'sociodem_nationality',
    'gender': 'sociodem_gender',
    'age': 'sociodem_age',
    'weight': 'sociodem_weight',
    'length': 'sociodem_length',
    'coffee': 'sociodem_coffee',
    'medication': 'sociodem_medication',
    'income net': 'sociodem_income',
    'education': 'sociodem_education',
    'Hoeveel jaar heeft u tenminste eenmaal per jaar in Nederland gefietst?': 'sociodem_cycling_experience',
    'Bij hoeveel fietsongevallen met andere voertuigen (bijv. fiets, auto) bent u betrokken geweest in de afgelopen 3 jaar': 'sociodem_bike_accidents',
    'E-bike': 'sociodem_ebike',
    'safety_predisposition': 'sociodem_safety_predisposition',
    'nervousness_predisposition': 'sociodem_nervousness_predisposition',
    'digitalDisposition': 'sociodem_digital_disposition',
    'arousalDisposition': 'sociodem_arousal_disposition',
    'pleasantnessDisposition': 'sociodem_pleasantness_disposition',
    'flowDisposition': 'sociodem_flow_disposition', 
    'context_positivity_1': 'context_positivity_1',
    'warning_value_1': 'warning_value_1', 
    'context_positivity_1': 'context_positivity_1',
    'warning_value_2': 'warning_value_2', 
    'context_positivity_2': 'context_positivity_2',
    'warning_value_3': 'warning_value_3', 
    'context_positivity_3': 'context_positivity_3'
}

# Define mapping dictionaries
column_numerical_mapping = {
    "sociodem_income": {'0-1000': 0, '1001-1500': 1, '1501-2500': 2, '2501-3000': 3, 
                '3001-3500': 4, '3501-4000': 5, '4001-4500': 6},

    "sociodem_education": {'Middle/high school': 0, 'Vocational education': 1, 'Academic education': 2},

    "sociodem_cycling_experience": {'1-2': 0, '5-10': 1, '10+': 2},

    "sociodem_weight": {'<60': 60, '61-70': 65.5, '71-80': 75.5, '81-90': 85.5, '101-110': 105.5},

    "sociodem_length": {'161-170': 165.5, '171-180': 175.5, '181-190': 185.5, '191-200': 195.5},

    "sociodem_pleasantness_disposition": {'Rarely': 1, 'Sometimes': 2, 'Often': 3, 'Very often': 4},

    "sociodem_safety_predisposition": {"Strongly agree": 3, "Agree": 2, "Neutral": 1},

    "sociodem_age": {'<25': 15, '25-35': 30, "36-45": 40, '46-55': 50, '56-65': 60, '66-75': 70, '75<': 75}, 

    "sociodem_nervousness_predisposition": {"Strongly disagree": 3, "Disagree": 2, "Neutral": 1},

    "sociodem_arousal_disposition": {"Never": 1, "Rarely": 2, "Sometimes": 3, "Often": 4, "Very often": 5},

    "sociodem_flow_disposition": {"Rarely": 1, "Sometimes": 2, "Often": 3, "Very often": 4},


}

# Select and rename the columns based on the mapping
sociodem_subset = sociodem[list(column_name_mapping.keys())]

# Rename columns using the column_mapping
sociodem_subset.rename(columns=column_name_mapping, inplace=True)



# List of p_ids to skip
#skip_p_ids = []  # Add any p_ids you want to skip
skip_p_ids = [f"{i:02}" for i in range(1)]

# Iterate through each subfolder in the root folder
for p_id in os.listdir(parent_dir):
    folder_path = os.path.join(parent_dir, p_id)
    
    if p_id in skip_p_ids:
        print(f"Skipping folder as instructed: {folder_path} (p_id {p_id})")
        continue
    
    if os.path.isdir(folder_path):
        
        print(f"Processing folder: {folder_path}")
                
        ratings_features = load_file_into_dataframe(folder_path, 'ratingsFeatures_baselcorr_17-3_C1WPCSW', 'csv', ',')

        if ratings_features is None or ratings_features.empty:
            print(f"ratings_features missing or empty in folder {folder_path}.")
            continue

        ratings_features['timestamp_button'] = pd.to_datetime(ratings_features['timestamp_button'], errors='coerce')
        ratings_features['p_id'] = ratings_features['p_id'].astype(str).str.lstrip('0').astype('int64')


        results = []
        
        if ratings_features is not None:

            # Merge based on p_id
            ratings_features = ratings_features.merge(
                sociodem_subset,
                left_on='p_id',
                right_on='P_ID',
                how='left'
            )

            # Optionally, drop the duplicate 'P_ID' column if not needed
            ratings_features.drop(columns=['P_ID'], inplace=True)

            # Add context_influence and warning_value based on interval_id
            for index, row in ratings_features.iterrows():
                if row['interval_id'] == 1:
                    ratings_features.at[index, 'sociodem_context_influence'] = row['context_positivity_1']
                    ratings_features.at[index, 'sociodem_warning_value'] = row['warning_value_1']
                elif row['interval_id'] == 2:
                    ratings_features.at[index, 'sociodem_context_influence'] = row['context_positivity_2']
                    ratings_features.at[index, 'sociodem_warning_value'] = row['warning_value_2']
                elif row['interval_id'] == 3:
                    ratings_features.at[index, 'sociodem_context_influence'] = row['context_positivity_3']
                    ratings_features.at[index, 'sociodem_warning_value'] = row['warning_value_3']

            ratings_features = ratings_features.drop({'context_positivity_1', 'warning_value_1', 'context_positivity_2', 'warning_value_2', 'context_positivity_3', 'warning_value_3'}, axis=1)


            # Apply mappings and create new columns with "_mapped" suffix
            for col, mapping in column_numerical_mapping.items():
                ratings_features[col + "_mapped"] = ratings_features[col].map(mapping)

            # Calculate BMI using the mapped weight and length
            ratings_features["sociodem_BMI"] = ratings_features["sociodem_weight_mapped"] / ((ratings_features["sociodem_length_mapped"] / 100) ** 2)


            # Save the corrected DataFrame to a new CSV file in the same folder
            output_file_path = os.path.join(folder_path, f"{p_id}_ratingsFeatures_baselcorr_17-3_C1WPCSWS.csv")
            ratings_features.to_csv(output_file_path, index=None)
            print(f"Saved merged features to: {output_file_path}")

Processing folder: C:\Users\BootMR\Documents\data_export\00-code_export
ratings_features missing or empty in folder C:\Users\BootMR\Documents\data_export\00-code_export.
Processing folder: C:\Users\BootMR\Documents\data_export\03
Saved merged features to: C:\Users\BootMR\Documents\data_export\03\03_ratingsFeatures_baselcorr_17-3_C1WPCSWS.csv
Processing folder: C:\Users\BootMR\Documents\data_export\04
ratings_features missing or empty in folder C:\Users\BootMR\Documents\data_export\04.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sociodem_subset.rename(columns=column_name_mapping, inplace=True)


## merge into one

In [26]:
# as prep for adding FB data

## merge all individual files into 1 large



# Initialize a list to store DataFrames
dataframes = []

# Iterate over all subfolders
for subdir, _, files in os.walk(parent_dir):
    # Skip the root directory itself
    if subdir == parent_dir:
        continue
    
    # Extract the subfolder name
    subfolder_name = os.path.basename(subdir)
    
    # Check for files containing "FlirtNkFeatures" in the current subfolder
    for file in files:
        if "ratingsFeatures_baselcorr_17-3_C1WPCSWS" in file:
            file_path = os.path.join(subdir, file)
            try:
                # Read the file into a DataFrame
                df = pd.read_csv(file_path)
                # Append the DataFrame to the list
                dataframes.append(df)
                print(f"Loaded file: {file_path}, Rows: {len(df)}")
            except Exception as e:
                print(f"Error reading file {file_path}: {e}")

# Concatenate all DataFrames
if dataframes:
    merged_df = pd.concat(dataframes, ignore_index=True)
    # Define the root directory and the output file path
    output_file_path = os.path.join(parent_dir, "ratingsFeatures_baselcorr_17-3_C1WPCSWS_merged.csv")
    # Save the merged DataFrame to a CSV file
    merged_df.to_csv(output_file_path, index=False)
    print(f"Saved merged DataFrame to: {output_file_path}, Total Rows: {len(merged_df)}")
else:
    print("No files with 'FlirtNkFeatures' found in any subfolder.")


Loaded file: C:\Users\BootMR\Documents\data_export\03\03_ratingsFeatures_baselcorr_17-3_C1WPCSWS.csv, Rows: 8
Saved merged DataFrame to: C:\Users\BootMR\Documents\data_export\ratingsFeatures_baselcorr_17-3_C1WPCSWS_merged.csv, Total Rows: 8


## add fietsersbond data

In [6]:
### done in arcgis

all_ratingsfeatures = pd.read_csv("ratingsFeatures_baselcorr_17-3_C1WPCSWSFB_merged.csv", sep=";")


In [7]:
## clean after adding FB data

#all_ratingsfeatures = all_ratingsfeatures.drop({'Join_Count', 'TARGET_FID'}, axis=1)

# List of columns to add the prefix 'FB_'
columns_to_prefix = [
    'id', 'van_id', 'naar_id', 'lengte', 'toegang', 'straat', 'has_straat', 'wegnummer', 
    'plaats', 'has_plaats', 'provincie', 'navigatie', 'ongelijkv', 'wegNiveau', 'wegtype', 
    'wegdeksrt', 'wegkwal', 'hinder', 'verlichtin', 'omgeving', 'water', 'schoonheid', 
    'beschrvng', 'extreistij', 'maxsnelhei', 'hoofdroute', 'knooppunt', 'lf', 'routes', 
    'has_routes', 'BiBK', 'wegbeheer', 'strooien', 'snelfiets', 'breedtekls', 'gem_st_fwd', 
    'gem_st_bwd', 'max_st_fwd', 'max_st_bwd', 'toegang_sp', 'lanes_fwd', 'lanes_bwd', 'is_af'
]

# Add the prefix 'FB_' to the selected columns in all_ratingsfeatures
all_ratingsfeatures.rename(columns={col: f"FB_{col}" for col in columns_to_prefix}, inplace=True)



# Define mapping dictionaries
column_FB_numerical_mapping = {
    "FB_wegNiveau": {"belangrijke hoofdweg": 1, "langs hoofdweg": 2, "overige weg": 3},
    "FB_wegtype": {"normale weg": 1, "fietspad (langs weg)": 2, "ventweg": 3, 
                   "weg met fiets(suggestie)strook": 4, "fietsstraat": 5, "solitair fietspad": 6},
    "FB_wegdeksrt": {"tegels": 1, "klinkers": 2, "halfverhard": 3, "asfalt/beton": 4},
    "FB_wegkwal": {"goed": 3, "redelijk": 2},
    "FB_hinder": {"veel": 1, "redelijk": 2, "weinig": 3, "zeer weinig": 4},
    "FB_omgeving": {"bebouwd (weinig of geen groen)": 1, "bebouwd (veel groen)": 2, "bos": 3},
    "FB_schoonheid": {"mooi": 2, "neutraal": 1},
    "FB_maxsnelhei": {"30": 1, "50": 2}
}

# Iterate over each column mapping
for column, mapping in column_FB_numerical_mapping.items():
    # Check if the column exists in the dataframe
    if column in all_ratingsfeatures.columns:
        # Apply the mapping and create new column with _mapped suffix
        mapped_column_name = f"{column}_mapped"
        all_ratingsfeatures[mapped_column_name] = all_ratingsfeatures[column].map(mapping)
    



In [8]:
output_file_path = os.path.join(parent_dir, "ratingsFeatures_baselcorr_17-3_C1WPCSWSFBC_merged.csv")
# Save the merged DataFrame to a CSV file
all_ratingsfeatures.to_csv(output_file_path, index=False)

## renaming


In [None]:
## the following provides input for what should be corrected

rename_mapping = {
    #old:new
    "sociodem_income": "sociodem_income_original",
    "sociodem_education": "sociodem_education_original",
    "sociodem_cycling_experience": "timestamp_sociodem_cycling_experience_mappedwindow_original",
    "sociodem_pleasantness_disposition": "sociodem_pleasantness_disposition_original",

    "sociodem_income_mapped": "sociodem_income",
    "sociodem_education_mapped": "sociodem_education",
    "sociodem_cycling_experience_mapped": "sociodem_cycling_experience",
    "sociodem_pleasantness_disposition_mapped": "sociodem_pleasantness_disposition",

    "sociodem_warning_value": "warnings_warning_value",

    "FB_wegdeksrt_mapped": "context_surface_type",
    "FB_wegkwal_mapped": "context_road_quality",
    "FB_schoonheid_mapped": "context_scenic_beauty",
    "FB_hinder_mapped": "context_hindrance",
    "FB_omgeving_mapped": "context_surroundings",
    "FB_wegtype_mapped": "context_road_type"
}
all_ratings_BEWSFB = all_ratings_BEWSFB.rename(columns=rename_mapping)


'''
#then construct a new list for feature_selection and R

add: 

all_ratings_BEWSFB["context_perceivedinfluence"]
all_ratings_BEWSFB["cycling_perceivedintensity"] 
all_ratings_BEWSFB["sociodem_fitness"] 
all_ratings_BEWSFB["sociodem_mood"] 

into: 

p_id + rating_totalperpid + H10_hr_mean + H10_hrv_mean + H10_HRV_RMSSD + H10_HRV_MeanNN + H10_HRV_HF + H10_HRV_SD1 + H10_HRV_SD2 + E4_eda_phasic_mean + E4_eda_phasic_peaks + E4_eda_phasic_max + E4_eda_phasic_n_above_mean + E4_eda_tonic_mean + E4_eda_tonic_peaks + E4_eda_tonic_n_above_mean + cadence_avg + velocity_avg + velocity_avg_change + warnings_slowdown_count + warnings_tactile_warning + warnings_audio_warning + warnings_warning_value + sociodem_income + sociodem_education + sociodem_income + sociodem_education + sociodem_cycling_experience + sociodem_BMI + sociodem_fitness + sociodem_mood + sociodem_pleasantness_disposition + context_surface_type + context_road_quality + context_scenic_beauty + context_hindrance + context_road_type + weather_rain_3h + weather_wind_speed + context_perceivedinfluence + cycling_perceivedintensity

'''

# Optional: save to new file
all_ratings_BEWSFB.to_csv("ratingsFeatures_baselcorr_17-3_C1WPCSWSFBCNSR_merged.csv", index=False)
