# Preamble


Python Libraries


In [1]:
# import os
# import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing

Custom Functions


In [2]:
from useful_functions.dd_dictionary import create_dd_dictionary
from useful_functions.pd_dictionary import create_pd_dictionary
from useful_functions.takeover_dataframe import create_takeover_timestamps
from useful_functions.check_for_missing_data import check_for_missing_data

<br></br>

# Importing Data + PreProcessing

---

### Raw data folders

In [3]:
# Raw AdVitam (Experiment 2) input folders, given relative to the notebook's
# working directory. Both are passed to the loading helpers in later cells.
driving_data_folder = "../AdVitam/Exp2/Raw/Driving"
physio_data_folder = "../AdVitam/Exp2/Raw/Physio/Txt"

### Participants to Exclude

| Participant | Reason |
| --- | --- |
| NST77 | Driving file contains obstacles = "TriggeredObs2TriggeredObs3" and "TriggeredObs3TriggeredObs4" |
| NST91, ST84, ST60 | Does not contain a physio file |

In [4]:
# Cross-check the two raw folders. The displayed result lists the files that
# are reported as missing (per the table above: participants without a physio file).
check_for_missing_data(driving_data_folder, physio_data_folder)

['NST91.txt', 'ST84.txt', 'ST60.txt']

In [5]:
# Participants left out of every dataset loaded in this notebook.
participants_to_exclude = [
    "NST77",  # driving file contains concatenated obstacle labels
    "NST91",  # no physio file
    "ST84",  # no physio file
    "ST60",  # no physio file
]

<br></br>

## Driving Data

---

### Data Description

| Feature | Description | Notes |
| --- | --- | --- |
| Time | Time elapsed since the software was launched (in seconds) |  |
| EngineSpeed | Engine speed (in rpm) | Removed |
| GearPosActual | Current gear | Removed |
| GearPosTarget | Next planned gear | Removed |
| AcceleratorPedalPos | Position of gas pedal. | Recording problem, Removed |
| DeceleratorPedalPos | Position of brake pedal. | Recording problem, Removed |
| SteeringWheelAngle | Steering wheel angle (in degrees) |  |
| VehicleSpeed | Vehicle speed (in mph) |  |
| Position X | Vehicle position along the x-axis in the simulated driving environment |  |
| Position Y | Vehicle position along the y-axis in the simulated driving environment |  |
| Position Z | Vehicle position along the z-axis in the simulated driving environment |  |
| Autonomous Mode (T/F) | Autonomous pilot status. | True = autonomous pilot activated, False = autonomous pilot deactivated (driver in control of the car) |
| Obstacles | Events that occurred during the driving simulation. | See Below |

- Obstacles: Events that occurred during the driving simulation.
  - TriggeredObsX = Time at which each takeover request was triggered by the experimenter.
  - Obs1 = deer, Obs2 = traffic cone, Obs3 = frog, Obs4 = traffic cone, Obs5 = false alarm (x2).
  - Detected = Time at which the driver pressed the steering wheel button to notify he/she understood the situation. The driver is in control of the car when the value of the column "Autonomous Mode (T/F)" is False.

### Driving Data Dictionary

In [6]:
# Load the raw driving recordings, leaving out the excluded participants.
# Later cells index the result by participant id (e.g. "NST01") and treat each
# value as a DataFrame with at least the "Time" and "Obstacles" columns.
driving_data_dictionary = create_dd_dictionary(
    driving_data_folder, participants_to_exclude
)

### Processing driving data

Steps Taken
1. Label encode the `Obstacles` column
2. Convert the `Time` column to pandas timedelta and drop duplicated timestamps
3. Resample driver data to 10ms

In [7]:
# NOTE: this cell replaces every value of `driving_data_dictionary` with its
# preprocessed version. Re-run the loading cell above before re-running it.

# Fit ONE label encoder on the obstacle labels of *all* participants.
# Fitting on a single reference participant would make `enc.transform` raise a
# ValueError for any other participant whose recording contains a label the
# reference participant never produced. LabelEncoder sorts its classes, so the
# resulting codes are unchanged whenever the reference participant already
# contained every label.
all_obstacle_labels = pd.concat(
    [
        participant_data["Obstacles"].drop_duplicates()
        for participant_data in driving_data_dictionary.values()
    ],
    ignore_index=True,
)
enc = preprocessing.LabelEncoder()
enc.fit(all_obstacle_labels)

# Preprocessing the driving data
for driver in driving_data_dictionary.keys():
    driver_data = driving_data_dictionary[driver]

    # Label-encode the obstacles and convert the elapsed seconds to timedeltas.
    # `assign` builds a new frame, so the loaded frame is not mutated in place.
    driver_data = driver_data.assign(
        Obstacles=enc.transform(driver_data["Obstacles"]),
        Time=pd.to_timedelta(driver_data["Time"], unit="s"),
    )

    # Resample onto a regular 10 ms grid. Duplicated timestamps are dropped
    # first, before "Time" becomes the index used by `resample`.
    driver_data = (
        driver_data.drop_duplicates(subset="Time")
        .set_index("Time")
        .resample("10ms")
        .ffill()
        .reset_index()
    )

    # replacing the dictionary value (existing key, so iteration stays valid)
    driving_data_dictionary[driver] = driver_data

### Creating driving data takeover timestamps

In [8]:
# Build the takeover timestamp table (one row per participant, see output below)
# from the preprocessed driving data. The fitted encoder is passed along,
# presumably so the helper can map encoded obstacles back to labels — confirm in helper.
driving_timestamps = create_takeover_timestamps(driving_data_dictionary, enc)
driving_timestamps.head()

Unnamed: 0,index,TriggeredObs1,TakeoverObs1,ReleaseObs1,TOTObs1,TriggeredObs2,TakeoverObs2,ReleaseObs2,TOTObs2,TriggeredObs3,...,ReleaseObs3,TOTObs3,TriggeredObs4,TakeoverObs4,ReleaseObs4,TOTObs4,TriggeredObs5,TakeoverObs5,ReleaseObs5,TOTObs5
0,NST01,0 days 00:05:11.974200,0 days 00:05:18.804200,0 days 00:05:28.764200,0 days 00:00:06.830000,0 days 00:09:11.494200,0 days 00:09:13.964200,0 days 00:09:23.654200,0 days 00:00:02.470000,0 days 00:10:50.094200,...,0 days 00:10:54.554200,0 days 00:00:04.080000,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
1,ST02,0 days 00:08:03.979300,0 days 00:08:08.999300,0 days 00:08:17.339300,0 days 00:00:05.020000,0 days 00:06:03.149300,0 days 00:06:06.569300,0 days 00:06:09.769300,0 days 00:00:03.420000,0 days 00:14:38.599300,...,0 days 00:14:44.779300,0 days 00:00:04.560000,0 days 00:17:24.939300,0 days 00:17:29.289300,0 days 00:17:33.199300,0 days 00:00:04.350000,NaT,NaT,NaT,NaT
2,NST03,0 days 00:16:04.013200,0 days 00:16:08.633200,0 days 00:16:41.013200,0 days 00:00:04.620000,0 days 00:12:48.623200,0 days 00:12:51.843200,0 days 00:13:24.443200,0 days 00:00:03.220000,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
3,ST04,0 days 00:19:23.934300,0 days 00:19:36.624300,0 days 00:19:54.174300,0 days 00:00:12.690000,0 days 00:13:29.504300,0 days 00:13:32.174300,0 days 00:13:39.614300,0 days 00:00:02.670000,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
4,NST05,0 days 00:10:02.164780,0 days 00:10:04.474780,0 days 00:10:06.294780,0 days 00:00:02.310000,0 days 00:15:16.364780,0 days 00:15:31.474780,0 days 00:15:36.064780,0 days 00:00:15.110000,0 days 00:17:52.614780,...,0 days 00:18:00.284780,0 days 00:00:03.340000,0 days 00:07:21.274780,0 days 00:07:24.604780,0 days 00:07:27.654780,0 days 00:00:03.330000,NaT,NaT,NaT,NaT


<br></br>

## Physiological Signals & Markers

---

### Data Description

**Signals:**

| Feature | Description | Notes |
| --- | --- | --- |
| min | Time elapsed (in minutes) |  |
| ECG | Electrocardiogram | 1000Hz |
| EDA | Electrodermal Activity | 1000Hz |
| RESP | Respiration | 1000Hz |

<br></br>

**Markers:**

Contains the timestamps for each period of the experiment.

- Training1 = Baseline phase
- Training2 = Practice phase in the driving simulator
- Driving = Main driving session in conditionally automated driving.

Note: the marker timestamps are expressed in seconds, whereas the time column (`min`) of the raw signals is expressed in minutes.

<br></br>

**Timestamps:**

Time elapsed (in seconds) between the start of the main driving session and each of the following events, where X is one of the obstacles or a false alarm:
- TrigObsX: the time when the obstacle appeared (takeover request triggered)
- DetObsX: the time when the driver pressed the button to report having understood the situation
- RepObsX: the time when the driver actually took over control

### Physio data dictionary

In [9]:
# Load the raw physiological recordings, leaving out the excluded participants.
# As used by the processing cell further down, the dictionary holds two entries
# per participant: "<id>" (signals) and "<id>-markers" (experiment phase markers).
# NOTE(review): the variable name misspells "physiological"; it is kept because
# later cells reference this exact name.
phsyiological_data_dictionary = create_pd_dictionary(
    physio_data_folder, participants_to_exclude
)

### Physio timestamps

In [10]:
# Obstacle timestamps shipped with the preprocessed AdVitam data
# (one row per participant; processed in the next cell).
physio_timestamps = pd.read_csv(
    filepath_or_buffer=(
        "../AdVitam/Exp2/Preprocessed/Physio and Driving/timestamps_obstacles.csv"
    )
)

### Processing the Physio Timestamps
Steps: 
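1. Rename columns to match the driving timestamps (`Trig` → `Triggered`, `Rep` → `Takeover`) and drop the `Det` columns
2. Remove preselected participants
3. Reformat subject id to match
4. Transform timestamps into timedelta objects
5. Compute the takeover time (TOT) per obstacle and replace obstacle names with numbers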

In [11]:
# NOTE: this cell overwrites `physio_timestamps`. Re-run the loading cell above
# before re-running it, otherwise the renames below would be applied twice.

# 1. Harmonise the column names with the driving timestamps:
#    "Trig..." -> "Triggered...", "Rep..." -> "Takeover...", "Det..." is dropped.
#    Everything is collected first and applied in a single pass.
detected_columns = [col for col in physio_timestamps.columns if "Det" in col]
column_renames = {}
for col in physio_timestamps.columns:
    if col in detected_columns:
        continue
    if "Trig" in col:
        column_renames[col] = col.replace("Trig", "Triggered")
    elif "Rep" in col:
        column_renames[col] = col.replace("Rep", "Takeover")

physio_timestamps = physio_timestamps.drop(columns=detected_columns).rename(
    columns=column_renames
)

# 2. Remove the participants that are not in the driving data.
#    `.copy()` makes the result an independent frame, so the column assignments
#    below do not trigger pandas' SettingWithCopyWarning.
physio_timestamps = physio_timestamps[
    ~physio_timestamps["subject_id"].isin(participants_to_exclude)
].copy()

# 3. Zero-pad the numeric part of the subject ids to match the format of the
#    driving data (e.g. "NST1" -> "NST01").
physio_timestamps["subject_id"] = physio_timestamps["subject_id"].apply(
    lambda x: x.split("T")[0] + "T" + x.split("T")[1].zfill(2)
)

# 4. Apply the exclusion again on the padded ids: the exclusion list uses the
#    padded format, so an id such as "NST05" can only match after step 3.
physio_timestamps = physio_timestamps[
    ~physio_timestamps["subject_id"].isin(participants_to_exclude)
].copy()

# 5. Convert every timestamp column (seconds) to timedelta. The vectorised
#    call maps missing values to NaT and always yields a timedelta dtype,
#    even for columns that contain no value at all.
non_time_columns = ("subject_id", "label_st")
for timestamp in physio_timestamps.columns:
    if timestamp not in non_time_columns:
        physio_timestamps[timestamp] = pd.to_timedelta(
            physio_timestamps[timestamp], unit="s"
        )

# 6. Add the takeover time (takeover - trigger) for every obstacle.
trigger = "TriggeredObs"
respond = "TakeoverObs"
obstacles = ["Deer", "Cone", "Frog", "Can", "FA1", "FA2"]

for obstacle in obstacles:
    physio_timestamps["TOT" + "Obs" + obstacle] = (
        physio_timestamps[respond + obstacle] - physio_timestamps[trigger + obstacle]
    )

# 7. Replace the obstacle names by their 1-based position in `obstacles`
#    ("...ObsDeer" -> "...Obs1", ..., "...ObsFA2" -> "...Obs6").
#    Only the first matching obstacle name is applied to a column.
obstacle_renames = {}
for col in physio_timestamps.columns:
    for i, obstacle in enumerate(obstacles):
        if obstacle in col:
            obstacle_renames[col] = col.replace(obstacle, str(i + 1))
            break

physio_timestamps = physio_timestamps.rename(columns=obstacle_renames)

physio_timestamps.head()

Unnamed: 0,subject_id,label_st,TriggeredObs1,TakeoverObs1,TriggeredObs2,TakeoverObs2,TriggeredObs3,TakeoverObs3,TriggeredObs4,TakeoverObs4,TriggeredObs5,TakeoverObs5,TriggeredObs6,TakeoverObs6,TOTObs1,TOTObs2,TOTObs3,TOTObs4,TOTObs5,TOTObs6
0,NST01,0,0 days 00:02:56.705100,0 days 00:03:03.523800,0 days 00:06:56.214000,0 days 00:06:58.690200,0 days 00:08:34.815700,NaT,0 days 00:13:06.640800,NaT,0 days 00:16:23.624000,NaT,0 days 00:18:02.245000,NaT,0 days 00:00:06.818700,0 days 00:00:02.476200,NaT,NaT,NaT,NaT
1,ST02,1,0 days 00:03:50.756500,0 days 00:03:55.778000,0 days 00:01:49.933400,0 days 00:01:53.351600,0 days 00:10:25.382700,0 days 00:10:29.941600,0 days 00:13:11.720300,0 days 00:13:16.067300,0 days 00:05:57.714400,NaT,0 days 00:07:48.515800,NaT,0 days 00:00:05.021500,0 days 00:00:03.418200,0 days 00:00:04.558900,0 days 00:00:04.347000,NaT,NaT
2,NST03,0,0 days 00:13:35.204000,0 days 00:13:39.824400,0 days 00:10:19.808800,0 days 00:10:23.032200,0 days 00:04:19.471200,NaT,0 days 00:17:07.400900,NaT,0 days 00:06:18.340900,NaT,0 days 00:18:35.174900,NaT,0 days 00:00:04.620400,0 days 00:00:03.223400,NaT,NaT,NaT,NaT
3,ST04,1,0 days 00:17:20.361900,0 days 00:17:33.047900,0 days 00:11:25.928100,NaT,0 days 00:04:47.372400,NaT,0 days 00:01:59.926600,NaT,0 days 00:06:50.988200,NaT,0 days 00:14:46.936900,NaT,0 days 00:00:12.686000,NaT,NaT,NaT,NaT,NaT
4,NST05,0,0 days 00:07:08.961300,0 days 00:07:11.272600,0 days 00:12:23.166400,0 days 00:12:38.273600,0 days 00:14:59.418600,0 days 00:15:02.753600,0 days 00:04:28.071600,0 days 00:04:31.402700,0 days 00:02:23.631400,NaT,0 days 00:10:29.173600,NaT,0 days 00:00:02.311300,0 days 00:00:15.107200,0 days 00:00:03.335000,0 days 00:00:03.331100,NaT,NaT


### Processing the Physiological data

Steps:
1. Resampled data to 10ms (100Hz)
2. Trimmed the data down to each experimental phase (Baseline, Training, Driving)

In [12]:
# Keys of the "<id>-markers" entries; they are removed once every participant
# has been split into phases.
marker_keys = []

for driver in phsyiological_data_dictionary.keys():
    # Marker tables are not processed on their own, only next to their signals.
    if driver.endswith("-markers"):
        continue

    driver_data = phsyiological_data_dictionary[driver]
    markers = phsyiological_data_dictionary[driver + "-markers"]
    marker_times = markers["Time(sec.):"]

    # The signal time column is in minutes -> timedelta
    driver_data["min"] = pd.to_timedelta(driver_data["min"], unit="min")

    # Resample onto a 10 ms grid and expose the time as a "Time" column
    # driver_data = driver_data.drop_duplicates(subset="min")
    driver_data = (
        driver_data.set_index("min")
        .resample("10ms")
        .ffill()
        .reset_index()
        .rename(columns={"min": "Time"})
    )

    # Phase boundaries: markers are in seconds, taken pairwise (start, end)
    baseline_start = pd.to_timedelta(marker_times[0], unit="s")
    baseline_end = pd.to_timedelta(marker_times[1], unit="s")
    training_start = pd.to_timedelta(marker_times[2], unit="s")
    training_end = pd.to_timedelta(marker_times[3], unit="s")
    driving_start = pd.to_timedelta(marker_times[4], unit="s")
    driving_end = pd.to_timedelta(marker_times[5], unit="s")

    # Slice each phase; both boundaries are inclusive
    in_baseline = driver_data["Time"].between(baseline_start, baseline_end)
    in_training = driver_data["Time"].between(training_start, training_end)
    in_driving = driver_data["Time"].between(driving_start, driving_end)

    driver_baseline_data = driver_data[in_baseline].copy()
    driver_training_data = driver_data[in_training].copy()
    driver_driving_data = driver_data[in_driving].copy()

    # Disabled step, kept for reference: tagging the driving phase with the
    # obstacle that appears at each physio timestamp.
    # ----------------------------------
    # # Adding an 'Obstacles' column
    # driver_driving_data["Obstacles"] = "Nothing"

    # # Match the timestamps with the obstacles
    # driver_physio_timestamps = physio_timestamps[
    #     physio_timestamps["subject_id"] == driver
    # ]

    # obstacles = driver_physio_timestamps.columns
    # obstacles = obstacles[2:]
    # obstacles = obstacles[:-1]

    # Add an obstacle column to the driving data
    # driver_driving_data["Obstacles"] = "Nothing"
    # for obstacle in obstacles:
    #     # Time when the obstacle appears
    #     obstacle_appears = (
    #         driving_start
    #         + pd.to_timedelta(driver_physio_timestamps[obstacle], unit="s").to_list()[0]
    #     )

    #     # Add this marker to the Obstacles column
    #     if not pd.isna(obstacle_appears):
    #         mask = driver_driving_data["Time"] >= obstacle_appears
    #         first_index = mask.idxmax()
    #         driver_driving_data.at[first_index, "Obstacles"] = obstacle
    # ----------------------------------

    # The participant's entry becomes a dictionary of per-phase frames
    phsyiological_data_dictionary[driver] = {
        "baseline": driver_baseline_data,
        "training": driver_training_data,
        "driving": driver_driving_data,
    }

    # Remember the marker entry so it can be removed after the loop
    marker_keys.append(driver + "-markers")

# Delete marker data (done after the loop so the dictionary keeps its size
# while it is being iterated)
for marker_key in marker_keys:
    del phsyiological_data_dictionary[marker_key]

<br></br>

## Driver Demographic Data


---

### Data Description

| Feature | Description | Note |
| --- | --- | --- |
| code | Code of participant Secondary Task (ST) vs No ST (NST) + unique id (1,2,...) | In the form (ST/NST)# |
| date | Day of data collection |  Removed |
| time | Hour of data collection | Removed |
| condition | Experimental condition for mental workload | Removed (contained in participant code) |
| sex | Participant sex | |
| age | Age of participants in years | |
| mothertongue | Participants first language | |
| education | Highest education degree | |
| driving_license | Year the driving license was obtained | |
| km_year | Number of kilometers covered per year in average | |
| accidents | Number of accidents during the last 3 years | |
| nasa_tlx_N | Answer to the NASA TLX for question N | Removed |
| danger_O | Subjective ranking of the danger of obstacle O | Removed |
| realism_O | Subjective ranking of the realism of obstacle O | Removed |
| sart_N_O | Subjective answer to the sart for question N related to obstacle O | Removed |
| demand_O | Demands on attentional resources (complexity, variability, and instability of the situation) | Removed |
| supply_O | Supply of attentional resources (division of attention, arousal, concentration, and spare mental capacity) | Removed |
| understanding_O | Understanding of the situation (information quantity, information quality and familiarity). |  Removed |


### Driver Demographic Data

In [13]:
# Load only the demographic columns of the questionnaire database; the
# remaining questionnaire columns are left out (see the description table above).
driver_demographic_data = pd.read_csv(
    filepath_or_buffer="../AdVitam/Exp2/Preprocessed/Questionnaires/Exp2_Database.csv",
    usecols=[
        "code",
        "sex",
        "age",
        "mothertongue",
        "education",
        "driving_license",
        "km_year",
        "accidents",
    ],
)

### Processing driver demographic data

Steps:
1. Remove preselected participants
2. Reformat code to match data
3. Converting driving license from year obtained to number of years held (relative to 2018)
4. Normalize km/year (TODO: not implemented yet)

In [14]:
# NOTE: this cell overwrites `driver_demographic_data`. Re-run the loading cell
# above before re-running it: applying the license conversion a second time
# would turn the number of years back into a calendar year.

# Reference year for the license conversion.
# Presumably the year of data collection — TODO confirm.
license_reference_year = 2018

# Remove the participants that are not in the driving data.
# `.copy()` makes the result an independent frame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
driver_demographic_data = driver_demographic_data[
    ~driver_demographic_data["code"].isin(participants_to_exclude)
].copy()

# Reformat code: zero-pad the numeric part (e.g. "NST1" -> "NST01")
driver_demographic_data["code"] = driver_demographic_data["code"].apply(
    lambda x: x.split("T")[0] + "T" + x.split("T")[1].zfill(2)
)

# Apply the exclusion again on the padded codes: the exclusion list uses the
# padded format, so a code such as "NST05" can only match after reformatting.
driver_demographic_data = driver_demographic_data[
    ~driver_demographic_data["code"].isin(participants_to_exclude)
].copy()

# Convert from year obtained to number of years held
driver_demographic_data["driving_license"] = (
    license_reference_year - driver_demographic_data["driving_license"]
)

# TODO: normalize the age and km_year?

driver_demographic_data.head()

Unnamed: 0,code,sex,age,mothertongue,education,driving_license,km_year,accidents
0,NST01,1,19,1,1,1,200,1
1,ST02,1,19,1,1,1,5000,0
2,NST03,1,19,1,1,1,1000,0
3,ST04,1,21,3,2,2,1500,0
4,NST05,1,22,1,1,1,1500,0


<br></br>

# Defining Takeover Quality Quantitatively

- Takeover Time (TOT)
- Sudden Vehicle Deviation
- Response Budget


### Takeover Time

In [15]:
# Worked example on a single participant (NST01) and its first obstacle.

# Full recordings: driving data and the driving phase of the physio data
nst01_driving_data = driving_data_dictionary["NST01"]
nst01_physio_data = phsyiological_data_dictionary["NST01"]["driving"]

# Timestamp rows of that participant in both timestamp tables
nst01_driving_timestamp = driving_timestamps.loc[
    driving_timestamps["index"] == "NST01"
]
nst01_physio_timestamp = physio_timestamps.loc[
    physio_timestamps["subject_id"] == "NST01"
]

# Trigger time of obstacle 1, as recorded in each table
driving_obstacle_trigger = nst01_driving_timestamp["TriggeredObs1"].iat[0]
physio_obstacle_trigger = nst01_physio_timestamp["TriggeredObs1"].iat[0]

In [16]:
# First timestamp of NST01's driving-phase physio data
nst01_physio_data["Time"].min()

Timedelta('0 days 00:16:13.690000')

In [17]:
# Trigger time of obstacle 1 from the physio timestamp table. It is an offset:
# the trimming cell below adds it to the first timestamp of the driving-phase data.
physio_obstacle_trigger

Timedelta('0 days 00:02:56.705100')

In [18]:
# Trim both recordings to the 10 s window that precedes the takeover request.
#
# The window is always cut from the full recordings stored in the dictionaries,
# not from `nst01_physio_data` / `nst01_driving_data` themselves. Cutting from
# the already-trimmed physio frame would move its `Time.min()` and therefore
# shift the window every time the cell is re-run.
pre_takeover_window = pd.to_timedelta("10s")

nst01_driving_full = driving_data_dictionary["NST01"]
nst01_physio_full = phsyiological_data_dictionary["NST01"]["driving"]

# Driving data: the trigger is already on the recording's own time axis
nst01_driving_data = nst01_driving_full[
    (nst01_driving_full["Time"] >= driving_obstacle_trigger - pre_takeover_window)
    & (nst01_driving_full["Time"] < driving_obstacle_trigger)
]

# Physio data: the trigger is an offset, so it is added to the first timestamp
# of the driving-phase data to place it on that recording's time axis
physio_trigger_time = nst01_physio_full["Time"].min() + physio_obstacle_trigger

nst01_physio_data = nst01_physio_full[
    (nst01_physio_full["Time"] >= physio_trigger_time - pre_takeover_window)
    & (nst01_physio_full["Time"] < physio_trigger_time)
]

In [19]:
# Driving samples of NST01 in the 10 s before obstacle 1 was triggered
# (pandas truncates the rendered table automatically).
nst01_driving_data

Unnamed: 0,Time,SteeringWheelAngle,VehicleSpeed,Position X,Position Y,Position Z,Autonomous Mode (T/F),Obstacles
19993,0 days 00:05:01.974200,0.0,53.3486,-1932.477,1287.972,-1218.088,True,1
19994,0 days 00:05:01.984200,0.0,53.3486,-1932.477,1287.972,-1218.088,True,1
19995,0 days 00:05:01.994200,0.0,53.3517,-1932.191,1287.968,-1218.011,True,1
19996,0 days 00:05:02.004200,0.0,53.3517,-1932.191,1287.968,-1218.011,True,1
19997,0 days 00:05:02.014200,0.0,53.3547,-1931.905,1287.965,-1217.935,True,1
...,...,...,...,...,...,...,...,...
20988,0 days 00:05:11.924200,-3.0,53.1703,-1789.990,1286.278,-1179.032,True,1
20989,0 days 00:05:11.934200,-3.0,53.1703,-1789.990,1286.278,-1179.032,True,1
20990,0 days 00:05:11.944200,-3.0,53.1725,-1789.702,1286.278,-1178.967,True,1
20991,0 days 00:05:11.954200,-3.0,53.1725,-1789.702,1286.278,-1178.967,True,1


In [20]:
# Physio samples of NST01 in the 10 s before obstacle 1 was triggered.
# NOTE(review): the signal columns appear as CH1-CH3 in the output; their mapping
# to ECG / EDA / RESP is not established in this notebook — TODO confirm.
nst01_physio_data

Unnamed: 0,Time,CH1,CH2,CH3
114040,0 days 00:19:00.400000,23.243450,-0.445913,-1.017482
114041,0 days 00:19:00.410000,23.249300,-0.410004,-1.004943
114042,0 days 00:19:00.420000,23.253469,-0.381540,-0.997033
114043,0 days 00:19:00.430000,23.253392,-0.416590,-0.989736
114044,0 days 00:19:00.440000,23.251583,-0.428213,-0.986633
...,...,...,...,...
115035,0 days 00:19:10.350000,26.846450,-0.040054,-1.768774
115036,0 days 00:19:10.360000,26.850775,-0.046844,-1.749624
115037,0 days 00:19:10.370000,26.851150,-0.079447,-1.740417
115038,0 days 00:19:10.380000,26.859025,0.047277,-1.734009
