In [1]:
import os
import numpy as np
import pandas as pd
import neurokit2 as nk

## Segment and Save the Physiological Data for Each Participant

This notebook segments each participant's physiological data into its 3 separate sections: baseline, driving, and experiment. 

<div style="text-align:center;">
    <img src='./photos/Segment and Save Physio Data.png' style="width:700px;">
</div>

# EXP2

### Environment Variables

In [2]:
# Folder locations for experiment 2: raw inputs and the processed output
exp2_driving_data_folder = "./AdVitam/Exp2/Raw/Driving"
exp2_physio_data_folder = "./AdVitam/Exp2/Raw/Physio/Txt"
exp2_processed_physio_folder_path = "./Physiological Preprocessed/Exp2"

# Participants that seem to have issues with their physiological data
exp2_drivers_to_exclude = [
    "NST77",
    "NST11",
    "ST22",
    "NST87",
    "ST14",
    "ST12",
    "NST73",
    "ST10",
]

# Obstacle identifiers mapped to the obstacle name.
# "Obs5" maps to a list of two entries instead of a single name
# (presumably the two false alarms -- TODO confirm).
exp2_obstacles = {
    "Obs1": "Deer",
    "Obs2": "Cone",
    "Obs3": "Frog",
    "Obs4": "Can",
    "Obs5": ["FA1", "FA2"],
}

### Comparing Driving and Physiological Data

In [3]:
# Participant files available in the Driving folder
driving_files = [name for name in os.listdir(exp2_driving_data_folder) if name.endswith(".txt")]

# Participant files available in the Physio folder ("-markers" companions are ignored)
physio_files = [
    name
    for name in os.listdir(exp2_physio_data_folder)
    if name.endswith(".txt") and not name.endswith("-markers.txt")
]

# A participant cannot be used when their file exists in only one of the two folders
unmatched_files = set(physio_files).symmetric_difference(driving_files)

# Merge with the manually excluded exp2 drivers; the set removes duplicates
# and sorted() returns the final list in alphabetical order
drivers_to_exclude = sorted(
    {name.replace(".txt", "") for name in unmatched_files} | set(exp2_drivers_to_exclude)
)

print(f"Drivers to exclude: {drivers_to_exclude}")

Drivers to exclude: ['NST11', 'NST73', 'NST77', 'NST87', 'NST91', 'ST10', 'ST12', 'ST14', 'ST22', 'ST60', 'ST84']


## Physiological Data, Markers & Timestamps

### Data Description
- See [Data Structure](#advitam-data-structure) for an illustrative overview of the AdVitam Dataset

**Physiological Data:**
| Feature | Description            | Notes  |
| ------- | ---------------------- | ------ |
| min     | Time Elapsed           | mins |
| ECG     | Electrocardiogram      | 1000Hz |
| EDA     | Electrodermal Activity | 1000Hz |
| RESP    | Respiratory            | 1000Hz |

**Markers:**

Contains the timestamps for each period of the experiment.

- Training1 = Baseline phase
- Training2 = Practice phase in the driving simulator
- Driving = Main driving session in conditionally automated driving.

Be careful: the marker timestamps are in seconds, whereas the time column of the raw physiological data is in minutes.

**Driving Timestamps:**

Time elapsed (in seconds) between the start of the main driving session and each of the following events:

- TrigObsX: the appearance of the obstacle
- DetObsX: the time when the driver pressed the button to report having understood the situation
- RepObsX: the time when the driver actually took over control

X corresponds to one of the obstacles or the false alarm.



### Physiological Data

- Store the physio data and markers for each participant in a dictionary

In [4]:
# Raw tables keyed by file stem:
#   "<driver>"          -> physiological signals
#   "<driver>-markers"  -> marker timestamps for that driver
phsyiological_data_dictionary = {}

# loop through files
for filename in os.listdir(exp2_physio_data_folder):
    # Only the .txt exports can be parsed; anything else living in the folder
    # (e.g. hidden OS files) is skipped instead of being handed to pandas.
    if not filename.endswith(".txt"):
        continue

    key = filename[: -len(".txt")]
    is_markers_file = key.endswith("-markers")
    driver = key[: -len("-markers")] if is_markers_file else key

    # exclude participants (both their signal file and their markers file)
    if driver in drivers_to_exclude:
        continue

    file_path = os.path.join(exp2_physio_data_folder, filename)

    if is_markers_file:
        # markers: the column names are on the third line of the file (header=2)
        phsyiological_data_dictionary[key] = pd.read_csv(file_path, header=2, sep="\t")
    else:
        # physiological data: the column names are on line index 9, the line right
        # after them is skipped, and only the first four columns are kept
        phsyiological_data_dictionary[key] = pd.read_csv(
            file_path,
            sep="\t",
            header=9,
            skiprows=[10],
            usecols=[0, 1, 2, 3],
        )

- Return a random participant from a dictionary

In [5]:
# Preview one randomly chosen entry of the dictionary
# (this can be a signal table or a markers table)
available_keys = list(phsyiological_data_dictionary)
random_key = np.random.choice(available_keys)
phsyiological_data_dictionary[random_key].head()

Unnamed: 0,Marker Index:,Time(sec.):,Label:
0,Event 1:,436.49,"Training 1 Start, 15:47:19"
1,Event 2:,736.985,"Training 1 End, 15:52:19"
2,Event 3:,849.31,"Training 2 Start, 15:54:11"
3,Event 4:,1149.735,"Training 2 End, 15:59:12"
4,Event 5:,1487.915,"Driving Start, 16:04:50"


---
# Importing Data + Preprocessing

### Importing and Processing Physiological Data

- Segment the data into _Baseline_, _Driving_, and _Experiment_ phases

In [6]:
# Columns that are down-sampled with a mean (and linearly interpolated) during
# resampling; every other column uses max + forward fill.
columns_to_average = (
    # ECG-derived columns
    [
        "ECG_Rate",
        "ECG_Quality",
        "ECG_Phase_Completion_Atrial",
        "ECG_Phase_Completion_Ventricular",
    ]
    # respiration-derived columns
    + [
        "RSP_Amplitude",
        "RSP_Rate",
        "RSP_RVT",
        "RSP_Phase_Completion",
        "RSP_Symmetry_PeakTrough",
        "RSP_Symmetry_RiseDecay",
    ]
    # electrodermal activity
    + ["EDA_Tonic"]
    # RSA columns
    + ["RSA_P2T", "RSA_Gates"]
)

In [7]:
# Segment every driver's recording into a baseline part and a main-driving part,
# run the NeuroKit processing pipeline on it and save both parts as CSV files.

# Index labels of the rows in the markers table that delimit the segments.
# The notebook describes "Training1" as the baseline phase and "Driving" as the
# main session; row 5 is presumably "Driving End" -- TODO confirm on a markers file.
baseline_start_row, baseline_end_row = 0, 1
driving_start_row, driving_end_row = 4, 5


def _marker_time_at_row(markers, row):
    """Return the marker time (stored in seconds) at index label ``row`` as a Timedelta."""
    return pd.to_timedelta(markers["Time(sec.):"][row], unit="s")


# storing the marker keys to be removed
marker_keys = []

# The output folder is the same for every driver, so it is created once up front
os.makedirs(exp2_processed_physio_folder_path, exist_ok=True)

# loop through each driver (snapshot of the keys: the dictionary is updated in the loop)
for driver in list(phsyiological_data_dictionary.keys()):
    if driver.endswith("-markers"):
        continue

    # get driver data
    raw_data = phsyiological_data_dictionary[driver]
    markers = phsyiological_data_dictionary[driver + "-markers"]

    baseline_start = _marker_time_at_row(markers, baseline_start_row)
    baseline_end = _marker_time_at_row(markers, baseline_end_row)
    driving_start = _marker_time_at_row(markers, driving_start_row)
    driving_end = _marker_time_at_row(markers, driving_end_row)

    # The raw time column is in minutes. Work on a renamed copy so the table that is
    # still stored in the dictionary is not modified in place.
    driver_data = raw_data.rename(columns={"min": "Time"})
    driver_data["Time"] = pd.to_timedelta(driver_data["Time"], unit="m")

    # trim to the experiment (first baseline marker up to the end of the driving session)
    in_experiment = (driver_data["Time"] >= baseline_start) & (driver_data["Time"] <= driving_end)
    driver_data = driver_data[in_experiment].set_index("Time")

    # Resample the data to 1000 Hz (1 ms bins).
    # The raw channels used below (CH1-CH3) are not listed in columns_to_average, so
    # in practice every column goes through the max + forward-fill branch.
    resampled_data = pd.DataFrame()
    for column in driver_data.columns:
        bins = driver_data[column].resample("1ms")
        if column in columns_to_average:
            resampled_data[column] = bins.mean().interpolate()
        else:
            # .ffill() replaces interpolate(method="ffill"), deprecated since pandas 2.1
            resampled_data[column] = bins.max().ffill()

    # Back to a positional index so the frame lines up row by row with the NeuroKit output
    driver_data = resampled_data.reset_index()

    # Preprocessing the data with NeuroKit
    signals, _ = nk.bio_process(
        eda=driver_data["CH1"],
        ecg=driver_data["CH2"],
        rsp=driver_data["CH3"],
        sampling_rate=1000,
    )

    # Add the preprocessed data to the driver data and index it by time again
    driver_data = pd.concat([driver_data, signals], axis=1).set_index("Time")

    # Baseline Data
    in_baseline = (driver_data.index >= baseline_start) & (driver_data.index <= baseline_end)
    driver_baseline_data = driver_data[in_baseline].copy()

    # Driving Data
    in_driving = (driver_data.index >= driving_start) & (driver_data.index <= driving_end)
    driver_experiment_data = driver_data[in_driving].copy()

    # reset the index to start from 0 at the beginning of the experiment
    driver_experiment_data.index = driver_experiment_data.index - driver_experiment_data.index[0]

    # replacing the dictionary value with an empty placeholder now that the data is segmented
    phsyiological_data_dictionary[driver] = {}

    # Save the data
    driver_baseline_data.to_csv(
        os.path.join(exp2_processed_physio_folder_path, driver + "_baseline.csv")
    )
    driver_experiment_data.to_csv(
        os.path.join(exp2_processed_physio_folder_path, driver + "_driving.csv")
    )

    # storing the marker keys to be removed
    marker_keys.append(driver + "-markers")

# Delete marker data
for marker_key in marker_keys:
    del phsyiological_data_dictionary[marker_key]

---

# EXP3

## Environment Variables

In [2]:
# Paths
# Raw driving files of experiment 3
exp3_driving_data_folder = "./AdVitam/Exp3/Raw/Driving"
# Raw physiological text exports (signal files and their "-markers" companions)
exp3_physio_data_folder = "./AdVitam/Exp3/Raw/Physio/Txt"
# Destination folder for the segmented CSV files
exp3_processed_physio_folder_path = "./Physiological Preprocessed/Exp3"

### Comparing Driving and Physiological Data

In [3]:
# Drivers excluded by hand for experiment 3.
# NOTE(review): an earlier revision seeded the exclusion list with "LNA48" (it was left
# commented out) -- confirm whether that driver should be excluded as well.
exp3_drivers_to_exclude = ["NLNA52", "NLNA44", "NLA64", "NLNA33"]

# Read every "_Baseline" file in the Driving folder; the "_Baseline" tag is dropped
# so the names are comparable with the Physio file names
driving_files = [
    file.replace("_Baseline", "")
    for file in os.listdir(exp3_driving_data_folder)
    if file.endswith("_Baseline.txt")
]

# Read every file in the Physio folder that is not a -markers file
physio_files = [
    file
    for file in os.listdir(exp3_physio_data_folder)
    if file.endswith(".txt") and not file.endswith("-markers.txt")
]

# Check for missing files: a driver is unusable when their file exists in only one folder
unmatched_files = set(physio_files) ^ set(driving_files)

# Merge the manual exclusions BEFORE removing duplicates and sorting, so the final
# list is sorted and duplicate-free (same order of operations as the Exp2 cell)
drivers_to_exclude = sorted(
    {file.replace(".txt", "") for file in unmatched_files} | set(exp3_drivers_to_exclude)
)

print(f"Drivers to exclude: {drivers_to_exclude}")

Drivers to exclude: ['LA77', 'LNA44', 'LNA64', 'NLA90', 'NLA95']


## Physiological Data, Markers & Timestamps

In [4]:
# Raw tables keyed by file stem:
#   "<driver>"          -> physiological signals
#   "<driver>-markers"  -> marker timestamps for that driver
phsyiological_data_dictionary = {}

# loop through files
for filename in os.listdir(exp3_physio_data_folder):
    # Only the .txt exports can be parsed; anything else living in the folder
    # (e.g. hidden OS files) is skipped instead of being handed to pandas.
    if not filename.endswith(".txt"):
        continue

    key = filename[: -len(".txt")]
    is_markers_file = key.endswith("-markers")
    driver = key[: -len("-markers")] if is_markers_file else key

    # exclude participants (both their signal file and their markers file)
    if driver in drivers_to_exclude:
        continue

    file_path = os.path.join(exp3_physio_data_folder, filename)

    if is_markers_file:
        # markers: the column names are on line index 5 of the file (header=5)
        phsyiological_data_dictionary[key] = pd.read_csv(file_path, header=5, sep="\t")
    else:
        # physiological data: the column names are on line index 9, the line right
        # after them is skipped, and only the first four columns are kept
        phsyiological_data_dictionary[key] = pd.read_csv(
            file_path,
            sep="\t",
            header=9,
            skiprows=[10],
            usecols=[0, 1, 2, 3],
        )

- Return a random participant from a dictionary

In [5]:
# Preview one randomly chosen entry of the dictionary of raw physiological data
# and markers (the entry can be a signal table or a markers table)
random_key = np.random.choice(list(phsyiological_data_dictionary.keys()))
phsyiological_data_dictionary[random_key].head()

Unnamed: 0,Marker Index:,Time(sec.):,Label:
0,Event 1:,14.985,"Baseline Start, 17:40:55"
1,Event 2:,315.61,"Baseline End, 17:45:55"
2,Event 3:,404.945,"Training Start, 17:47:25"
3,Event 4:,708.64,"Training End, 17:52:28"
4,Event 5:,1037.33,"Block1 Start, 17:57:57"


---
# Importing Data + Preprocessing

In [6]:
# Columns that are down-sampled with a mean (and linearly interpolated) during
# resampling; every other column uses max + forward fill.
columns_to_average = (
    # ECG-derived columns
    [
        "ECG_Rate",
        "ECG_Quality",
        "ECG_Phase_Completion_Atrial",
        "ECG_Phase_Completion_Ventricular",
    ]
    # respiration-derived columns
    + [
        "RSP_Amplitude",
        "RSP_Rate",
        "RSP_RVT",
        "RSP_Phase_Completion",
        "RSP_Symmetry_PeakTrough",
        "RSP_Symmetry_RiseDecay",
    ]
    # electrodermal activity
    + ["EDA_Tonic"]
    # RSA columns
    + ["RSA_P2T", "RSA_Gates"]
)

In [7]:
# Segment every driver's recording into a baseline part and two driving parts
# (Block1 Start -> Block2 End, Block3 Start -> Block5 End), run the NeuroKit
# processing pipeline on it and save the three parts as CSV files.


def _marker_time_for_label(markers, label):
    """Return, as a Timedelta, the time of the first marker whose label contains ``label``.

    Marker times are stored in seconds. A ValueError naming the missing label is
    raised when no marker matches.
    """
    # regex=False: the labels are matched as plain text
    matches = markers.loc[
        markers["Label:"].str.contains(label, regex=False, na=False), "Time(sec.):"
    ]
    if matches.empty:
        raise ValueError(f"No marker whose label contains {label!r} was found")
    return pd.to_timedelta(matches.values[0], unit="s")


# storing the marker keys to be removed
marker_keys = []

# The output folder is the same for every driver, so it is created once up front
os.makedirs(exp3_processed_physio_folder_path, exist_ok=True)

# loop through each driver (snapshot of the keys: the dictionary is updated in the loop)
for driver in list(phsyiological_data_dictionary.keys()):
    if driver.endswith("-markers"):
        continue

    baseline_path = f"{exp3_processed_physio_folder_path}/{driver}_baseline.csv"
    part_1_path = f"{exp3_processed_physio_folder_path}/{driver}_driving_part1.csv"
    part_2_path = f"{exp3_processed_physio_folder_path}/{driver}_driving_part2.csv"

    # Skip drivers that are already in the preprocessed folder. All three files must
    # exist: the baseline file is written first, so checking it alone would treat a
    # run that was interrupted before the driving parts were saved as complete.
    # Skipped drivers keep their raw tables and markers in the dictionary.
    if all(os.path.exists(path) for path in (baseline_path, part_1_path, part_2_path)):
        continue

    # get driver data
    raw_data = phsyiological_data_dictionary[driver]
    markers = phsyiological_data_dictionary[driver + "-markers"]

    # The raw time column is in minutes. Work on a renamed copy so the table that is
    # still stored in the dictionary is not modified in place.
    driver_data = raw_data.rename(columns={"min": "Time"})
    driver_data["Time"] = pd.to_timedelta(driver_data["Time"], unit="m")

    # trim to the experiment (first marker up to the last marker)
    experiment_start = pd.to_timedelta(markers["Time(sec.):"][0], unit="s")
    experiment_end = pd.to_timedelta(markers["Time(sec.):"].iloc[-1], unit="s")
    in_experiment = (driver_data["Time"] >= experiment_start) & (
        driver_data["Time"] <= experiment_end
    )
    driver_data = driver_data[in_experiment].set_index("Time")

    # Resample the data to 1000 Hz (1 ms bins).
    # The raw channels used below (CH1-CH3) are not listed in columns_to_average, so
    # in practice every column goes through the max + forward-fill branch.
    resampled_data = pd.DataFrame()
    for column in driver_data.columns:
        bins = driver_data[column].resample("1ms")
        if column in columns_to_average:
            resampled_data[column] = bins.mean().interpolate()
        else:
            # .ffill() replaces interpolate(method="ffill"), deprecated since pandas 2.1
            resampled_data[column] = bins.max().ffill()

    # Back to a positional index so the frame lines up row by row with the NeuroKit output
    driver_data = resampled_data.reset_index()

    # Preprocessing the data with NeuroKit
    signals, _ = nk.bio_process(
        eda=driver_data["CH1"],
        ecg=driver_data["CH2"],
        rsp=driver_data["CH3"],
        sampling_rate=1000,
    )

    # Add the preprocessed data to the driver data and index it by time again
    driver_data = pd.concat([driver_data, signals], axis=1).set_index("Time")

    # Baseline Data
    in_baseline = (driver_data.index >= _marker_time_for_label(markers, "Baseline Start")) & (
        driver_data.index <= _marker_time_for_label(markers, "Baseline End")
    )
    driver_baseline_data = driver_data[in_baseline].copy()

    # Block Data: first driving part, Block1 Start up to Block2 End
    in_part_1 = (driver_data.index >= _marker_time_for_label(markers, "Block1 Start")) & (
        driver_data.index <= _marker_time_for_label(markers, "Block2 End")
    )
    driver_part_1_data = driver_data[in_part_1].copy()

    # Block Data: second driving part, Block3 Start up to Block5 End
    in_part_2 = (driver_data.index >= _marker_time_for_label(markers, "Block3 Start")) & (
        driver_data.index <= _marker_time_for_label(markers, "Block5 End")
    )
    driver_part_2_data = driver_data[in_part_2].copy()

    # reset the index of each driving part so that it starts from 0
    driver_part_1_data.index = driver_part_1_data.index - driver_part_1_data.index[0]
    driver_part_2_data.index = driver_part_2_data.index - driver_part_2_data.index[0]

    # replacing the dictionary value with an empty placeholder now that the data is segmented
    phsyiological_data_dictionary[driver] = {}

    # Save the data
    driver_baseline_data.to_csv(baseline_path)
    driver_part_1_data.to_csv(part_1_path)
    driver_part_2_data.to_csv(part_2_path)

    # storing the marker keys to be removed
    marker_keys.append(driver + "-markers")

# Delete marker data
for marker_key in marker_keys:
    del phsyiological_data_dictionary[marker_key]