# Settings

## Constants

In [1]:
import os

# Root folder of the dataset; the cells below scan one sub-folder per driver from here.
PATH_DATA = './TD2D'

# File names of the per-task sensor recordings that the feature-extraction cell accepts.
sensor_data = [
    'PPG.csv',
    'ECG.csv',
    'EDA.csv',
    'HR.csv',
    'diameters.csv',
    'fixations.csv',
    'gazePositions.csv',
]

## Utility Functions

In [2]:
import pandas as pd
import numpy as np

# Dataset Overview

## Preprocessing

In [3]:
# Build `label`: one row per (driver, secondary task), read from each task folder's
# takeoverScenarioInformations.csv.
LABEL_COLUMNS = ['driver', 'secondaryTask', 'takeoverResult', 'reactionTime', 'NASA_TLX']
# Entries under PATH_DATA whose name contains one of these substrings are not driver folders.
SKIPPED_ENTRY_MARKERS = ('idea', 'xlsx', 'DS_Store')

# Per-task frames are collected here and concatenated once, instead of growing
# DataFrames with pd.concat inside the loops.
label_frames = []

with os.scandir(PATH_DATA) as root:
    for pid in root:  # D1~D50
        # Drop the first character of the folder name ("D1" -> "1") to get the driver id.
        driver = pid.name[1:]

        if any(marker in driver for marker in SKIPPED_ENTRY_MARKERS):
            continue
        if not pid.is_dir():
            continue

        with os.scandir(pid.path) as tasks:
            for task in tasks:  # e.g., audiobook listening
                if not task.is_dir():
                    continue

                secondary_task = task.name
                # CSV file with the scenario information to reference for this case
                scenario_information = pd.read_csv(
                    os.path.join(task.path, 'takeoverScenarioInformations.csv'), sep=','
                )

                # label: the three outcome columns, prefixed with driver id and task name
                label_task = scenario_information[['takeoverResult', 'reactionTime', 'NASA-TLX']].copy()
                label_task.insert(0, 'driver', driver)
                label_task.insert(1, 'secondaryTask', secondary_task)
                label_task.columns = LABEL_COLUMNS
                label_frames.append(label_task)

if label_frames:
    label = pd.concat(label_frames, axis=0, ignore_index=True)
else:
    # Nothing found under PATH_DATA: keep the expected schema rather than failing on
    # a column-count mismatch.
    label = pd.DataFrame(columns=LABEL_COLUMNS)


## Descriptive Statistics

In [4]:

# Define secondary task categories
def categorize_task(task):
    """Map a secondary-task name to the category heading used in the summary tables.

    'baseline' maps to '<Without secondary task>'; the known auditory and visual task
    names map to '<Auditory tasks>' and '<Visual tasks>'; anything else maps to '<Unknown>'.
    """
    if task == 'baseline':
        return '<Without secondary task>'

    auditory_tasks = ('0back', '1back', '2back', 'audiobook listening', 'auditory gaming', 'auditory texting')
    visual_tasks = ('ebook reading', 'texting', 'gaming')

    for category, members in (('<Auditory tasks>', auditory_tasks), ('<Visual tasks>', visual_tasks)):
        if task in members:
            return category
    return '<Unknown>'

# Tag every row with the display category of its secondary task
label['Category'] = label['secondaryTask'].apply(categorize_task)

# Calculate success ratio: percentage of rows per task whose takeoverResult is 'Success'.
# A grouped boolean mean gives the same numbers as groupby(...).apply(lambda ...) but is
# vectorised and avoids the DataFrameGroupBy.apply grouping-column deprecation warning.
success_ratios = (
    label['takeoverResult'].eq('Success')
    .groupby(label['secondaryTask'])
    .mean()
    .mul(100)
    .rename('successRatio')
)

# Calculate means and standard deviations for reaction time and workload
stats = label.groupby('secondaryTask').agg(
    reactionTime_Mean=('reactionTime', 'mean'),
    reactionTime_SD=('reactionTime', 'std'),
    workload_Mean=('NASA_TLX', 'mean'),
    workload_SD=('NASA_TLX', 'std')
)

# Merge success ratios
stats = stats.join(success_ratios)

# Add category information to the grouped statistics (one category per task name)
category_by_task = (
    label[['secondaryTask', 'Category']]
    .drop_duplicates()
    .set_index('secondaryTask')['Category']
)
stats = stats.join(category_by_task)

# Reorder columns
stats = stats[['Category', 'successRatio', 'reactionTime_Mean', 'reactionTime_SD', 'workload_Mean', 'workload_SD']]

# Set custom order for secondary tasks
task_order = [
    'baseline', '0back', '1back', '2back', 'audiobook listening',
    'auditory texting', 'auditory gaming', 'ebook reading', 'texting', 'gaming'
]
# An ordered categorical index makes sort_index() follow task_order instead of the alphabet
stats.index = pd.CategoricalIndex(stats.index, categories=task_order, ordered=True)
stats = stats.sort_index()

# Define a function to format the output like the provided table
def format_stats_table_with_headers(label):
    """Render per-task summary statistics as an aligned plain-text table.

    Parameters
    ----------
    label : pandas.DataFrame
        One row per secondary task; the index value is printed as the task name. Must
        contain the columns 'Category', 'successRatio', 'reactionTime_Mean',
        'reactionTime_SD', 'workload_Mean' and 'workload_SD'.

    Returns
    -------
    str
        Two header lines followed by one line per row. A line holding the category is
        emitted whenever the category differs from that of the previous row. Every line
        ends with a newline.
    """
    lines = [
        "Secondary Task                  Success Ratio (%)   Reaction Time (ms)        Perceived Workload",
        "                                                    Mean       SD             Mean     SD",
    ]
    current_category = None
    for index, row in label.iterrows():
        if row['Category'] != current_category:
            current_category = row['Category']
            lines.append(f"{current_category}")
        # The gaps between columns are folded into the field widths (16 + 6 = 22,
        # 5 + 6 = 11, 8 + 6 = 14, 4 + 6 = 10). Values of up to six characters render
        # exactly as before, while wider ones such as a 1016.90 mean use up padding
        # instead of shifting every following column to the right.
        lines.append(
            f"{index:<30} {row['successRatio']:>5.1f}"
            f"{row['reactionTime_Mean']:>22.2f}{row['reactionTime_SD']:>11.2f}"
            f"{row['workload_Mean']:>14.2f}{row['workload_SD']:>10.2f}"
        )
    return "\n".join(lines) + "\n"

# Render the per-task summary as aligned text, keep it in a variable, and display it
formatted_stats_table = format_stats_table_with_headers(label=stats)
print(formatted_stats_table)

Secondary Task                  Success Ratio (%)   Reaction Time (ms)        Perceived Workload
                                                    Mean       SD             Mean     SD
<Without secondary task>
baseline                        86.0                880.98     201.80         29.60     22.99
<Auditory tasks>
0back                           82.0                883.06     235.94         33.37     21.18
1back                           68.0                955.50     253.31         51.73     20.09
2back                           72.0                978.62     209.32         66.91     16.44
audiobook listening             76.0                905.26     213.36         38.28     18.95
auditory texting                68.0                1016.90     264.46         45.81     20.92
auditory gaming                 74.0                980.86     281.65         41.99     21.83
<Visual tasks>
ebook reading                   36.0                1294.30     483.59         71.25     15.06
te

In [5]:
# Driver demographics; the 'driverNumber' and 'age' columns are used below
driver_info_df = pd.read_csv(PATH_DATA +'/driverInformation.csv')

# add 'age_group' column: (19, 29] -> '20s', ..., (59, 69] -> '60s'.
# Ages outside (19, 69] fall in no bin and become NaN.
driver_info_df['age_group'] = pd.cut(driver_info_df['age'],
                                     bins=[19, 29, 39, 49, 59, 69],
                                     labels=['20s', '30s', '40s', '50s', '60s'])

# Create a DataFrame that includes age group information for drivers,
# with 'driverNumber' as an integer so it matches the merge key below
age_group_label = driver_info_df[['driverNumber', 'age_group']].astype({'driverNumber': int})

# convert data type of 'driver' so it is comparable with 'driverNumber'
label['driver'] = label['driver'].astype(int)

# add age_group column to 'label'.
# Any 'age_group' left by an earlier run of this cell is dropped first; otherwise a
# re-run would produce 'age_group_x' / 'age_group_y' and break the groupby in the next cell.
label = (
    label
    .drop(columns=['age_group'], errors='ignore')
    .merge(age_group_label, how='left', left_on='driver', right_on='driverNumber')
    .drop(columns=['driverNumber'])
)

In [6]:
# Grouped statistics will be stored for each age group (age group -> formatted table text)
grouped_stats = {}

# Set custom order for secondary tasks (identical for every age group, so defined once)
task_order = [
    'baseline', '0back', '1back', '2back', 'audiobook listening',
    'auditory texting', 'auditory gaming', 'ebook reading', 'texting', 'gaming'
]

# Loop through each age group and create sub_label for each group.
# 'age_group' is categorical (it comes from pd.cut), so observed=True restricts the
# iteration to age groups that actually occur in `label`. It also makes the choice
# explicit, which avoids the pandas FutureWarning about the changing default and any
# empty groups for unused bins.
for age_group, group_data in label.groupby('age_group', observed=True):
    # sub_label create
    sub_label = group_data.copy()

    # Calculate success ratio: percentage of rows per task whose takeoverResult is 'Success'.
    # A grouped boolean mean replaces groupby(...).apply(lambda ...) with the same values.
    success_ratios = (
        sub_label['takeoverResult'].eq('Success')
        .groupby(sub_label['secondaryTask'])
        .mean()
        .mul(100)
        .rename('successRatio')
    )

    # Calculate means and standard deviations for reaction time and workload
    stats = sub_label.groupby('secondaryTask').agg(
        reactionTime_Mean=('reactionTime', 'mean'),
        reactionTime_SD=('reactionTime', 'std'),
        workload_Mean=('NASA_TLX', 'mean'),
        workload_SD=('NASA_TLX', 'std')
    )

    # Merge success ratios
    stats = stats.join(success_ratios)

    # Add category information to the grouped statistics (one category per task name)
    category_by_task = (
        sub_label[['secondaryTask', 'Category']]
        .drop_duplicates()
        .set_index('secondaryTask')['Category']
    )
    stats = stats.join(category_by_task)

    # Reorder columns
    stats = stats[['Category', 'successRatio', 'reactionTime_Mean', 'reactionTime_SD', 'workload_Mean', 'workload_SD']]

    # An ordered categorical index makes sort_index() follow task_order
    stats.index = pd.CategoricalIndex(stats.index, categories=task_order, ordered=True)
    stats = stats.sort_index()

    # Save the formatted stats table for the current age group
    grouped_stats[age_group] = format_stats_table_with_headers(stats)

# Print the formatted statistics table for each age group
for age_group, table in grouped_stats.items():
    print(f"\n=== Age Group: {age_group} ===\n")
    print(table)


=== Age Group: 20s ===

Secondary Task                  Success Ratio (%)   Reaction Time (ms)        Perceived Workload
                                                    Mean       SD             Mean     SD
<Without secondary task>
baseline                        90.0                858.20     196.71         21.03     19.79
<Auditory tasks>
0back                          100.0                838.20     235.71         33.73     17.10
1back                           80.0                865.40     178.93         55.73     20.26
2back                           90.0                1036.40     269.77         69.07     15.23
audiobook listening             70.0                882.80     170.69         43.63     14.51
auditory texting                90.0                1003.20     155.20         49.37     20.32
auditory gaming                 80.0                926.50     213.42         53.73     17.77
<Visual tasks>
ebook reading                   40.0                1399.30     469.20 

# Preprocessing and Feature Extraction

In [7]:
def _extract(data: pd.DataFrame): # return features as a dataframe
    features = pd.DataFrame([[data.min(), data.max(), data.mean(), data.skew(), data.kurtosis()]])
    return features
FEATURE_NUMBER = 5 # min, max, mean, skewness, kurtosis
VALUE_NUMBER = 13 # sum of values
DATA_DURATION = 10000 # 10 s

# Sensor file -> positional columns that are summarised with _extract().
# Iterating this mapping (rather than the directory listing) fixes the feature layout:
# os.scandir yields entries in arbitrary order, so appending features in directory order
# could place different signals at the same column position for different tasks/drivers.
SENSOR_VALUE_COLUMNS = {
    'fixations.csv': (1, 2, 3, 4, 5),
    'gazePositions.csv': (1, 2, 3),
    'diameters.csv': (1,),
    'HR.csv': (1,),
    'PPG.csv': (1,),
    'ECG.csv': (1,),
    'EDA.csv': (1,),
}
assert sum(len(cols) for cols in SENSOR_VALUE_COLUMNS.values()) == VALUE_NUMBER

# driver id column + VALUE_NUMBER signals x FEATURE_NUMBER statistics
EXPECTED_FEATURE_COLUMNS = 1 + VALUE_NUMBER * FEATURE_NUMBER
SCENARIO_LABEL_COLUMNS = ['takeoverResult', 'reactionTime', 'NASA-TLX']

# Per-driver frames that passed the completeness check; concatenated once at the end.
# NOTE: `label` is rebound at the end of this cell, replacing the frame of the same name
# built in the dataset-overview cells.
feature_frames = []
label_frames = []

with os.scandir(PATH_DATA) as root:
    for pid in root:  # D1~D50
        # Drop the first character of the folder name ("D1" -> "1") to get the driver id.
        driver = pid.name[1:]

        if any(marker in driver for marker in ('idea', 'xlsx', 'DS_Store')):
            continue
        if not pid.is_dir():
            continue

        driver_feature_rows = []
        driver_label_rows = []

        with os.scandir(pid.path) as tasks:
            for task in tasks:  # 01_task_name
                if not task.is_dir():
                    continue

                # takeoverScenarioInformations csv file of this task
                takeoverScenarioMeasurements = pd.read_csv(
                    os.path.join(task.path, 'takeoverScenarioInformations.csv'), sep=','
                )
                critical_event_occurrence_time = float(
                    takeoverScenarioMeasurements.criticalEventOccurrenceTime.values[0]
                )
                window_start = critical_event_occurrence_time - DATA_DURATION

                # Label: driver id followed by the three outcome columns
                label_task = pd.concat(
                    [pd.DataFrame([driver], columns=['driver']),
                     takeoverScenarioMeasurements[SCENARIO_LABEL_COLUMNS]],
                    axis=1, ignore_index=True,
                )

                # Feature row: driver id followed by the 5 statistics of every signal column
                task_parts = [pd.DataFrame([driver], columns=['driver'])]
                for file_name, value_columns in SENSOR_VALUE_COLUMNS.items():
                    if file_name not in sensor_data:
                        continue
                    csv_path = os.path.join(task.path, file_name)
                    if not os.path.isfile(csv_path):
                        continue
                    file = pd.read_csv(csv_path, sep=',')

                    # selecting 10 seconds before critical event occurrence
                    outside_window = (
                        (file['timestamp'] < window_start)
                        | (file['timestamp'] > critical_event_occurrence_time)
                    )
                    file = file[~outside_window]

                    # No samples in the window: contribute nothing, which leaves the row
                    # short and gets the driver rejected by the check below.
                    if file.empty:
                        continue

                    # get feature
                    task_parts.extend(_extract(file.iloc[:, col]) for col in value_columns)

                driver_feature_rows.append(pd.concat(task_parts, axis=1, ignore_index=True))
                driver_label_rows.append(label_task)

        if not driver_feature_rows:
            continue

        feature_driver = pd.concat(driver_feature_rows, axis=0, ignore_index=True)
        label_driver = pd.concat(driver_label_rows, axis=0, ignore_index=True)

        # if remove this if condition, data will contain nan value and make 50 participants features.
        if (not feature_driver.isna().values.any()) and (len(feature_driver.columns) == EXPECTED_FEATURE_COLUMNS):
            feature_frames.append(feature_driver)
            label_frames.append(label_driver)

feature = pd.concat(feature_frames, axis=0, ignore_index=True) if feature_frames else pd.DataFrame()
label = (
    pd.concat(label_frames, axis=0, ignore_index=True)
    if label_frames
    else pd.DataFrame(columns=range(1 + len(SCENARIO_LABEL_COLUMNS)))
)
label.columns = ['driver', 'takeoverResult', 'reactionTime', 'NASA-TLX']

# Model Building and Evaluation

In [8]:
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE, RFECV
from sklearn.neural_network import MLPClassifier

# Seed passed to the estimators below; later cells reuse it for SMOTE and the MLP.
RANDOM_STATE = 42
# Not referenced by any estimator in the visible cells.
MAX_DEPTH = None

# Estimator templates. Only RF, XGB and LGBM are used by the evaluation loop in the
# visible cells; the dummy and SVM estimators are defined but not referenced there.
# Baseline that always predicts the most frequent class seen during fit
ESTIMATOR_DUMMY_CLF = DummyClassifier(strategy='most_frequent')
ESTIMATOR_RF_CLF = RandomForestClassifier(random_state=RANDOM_STATE)
ESTIMATOR_XGB_CLF = XGBClassifier(random_state=RANDOM_STATE)
# verbose=-1 and importance_type='gain' are passed through to LightGBM
ESTIMATOR_LGBM_CLF = LGBMClassifier(random_state=RANDOM_STATE, verbose=-1, importance_type='gain')
ESTIMATOR_SVM_CLF = SVC(kernel='linear', random_state=RANDOM_STATE)

In [9]:
# Feature matrix: every column of `feature` except the first one (the driver id)
X = feature.iloc[:, 1:].to_numpy()

# Binary target from the second column of `label`: 1 where it equals "Success", else 0
takeover_result = label.iloc[:, 1].to_numpy()
y_tr = np.where(takeover_result == "Success", 1, 0)

from sklearn.model_selection import LeaveOneGroupOut

# One group per driver id, so every cross-validation fold holds out one driver's rows
groups = feature.iloc[:, 0].to_numpy()
logo = LeaveOneGroupOut()

In [10]:
# Oversampler applied to each training fold, and the number of features RFE keeps
sm = SMOTE(random_state=RANDOM_STATE)
N_FEATURES = 20

# Per-fold metric accumulators, one group of four lists per model:
# accuracy, F1 with class 0 as positive, F1 with class 1 as positive, macro F1
RF_accuracies, RF_f1_scores_0, RF_f1_scores_1, RF_f1_scores_macro = [], [], [], []
XGB_accuracies, XGB_f1_scores_0, XGB_f1_scores_1, XGB_f1_scores_macro = [], [], [], []
LGBM_accuracies, LGBM_f1_scores_0, LGBM_f1_scores_1, LGBM_f1_scores_macro = [], [], [], []
MLP_accuracies, MLP_f1_scores_0, MLP_f1_scores_1, MLP_f1_scores_macro = [], [], [], []

In [None]:
# classifier: leave-one-group-out evaluation (groups are driver ids) of RF, XGB, LGBM and MLP

# Model name -> (accuracies, F1 class 0, F1 class 1, macro F1) accumulator lists
fold_metrics = {
    'RF': (RF_accuracies, RF_f1_scores_0, RF_f1_scores_1, RF_f1_scores_macro),
    'XGB': (XGB_accuracies, XGB_f1_scores_0, XGB_f1_scores_1, XGB_f1_scores_macro),
    'LGBM': (LGBM_accuracies, LGBM_f1_scores_0, LGBM_f1_scores_1, LGBM_f1_scores_macro),
    'MLP': (MLP_accuracies, MLP_f1_scores_0, MLP_f1_scores_1, MLP_f1_scores_macro),
}

# Start from empty lists so that re-running this cell does not append a second set of
# folds to results left over from a previous run.
for metric_lists in fold_metrics.values():
    for metric_list in metric_lists:
        metric_list.clear()

# NOTE(review): MinMaxScaler is imported but the features are used unscaled here;
# confirm whether scaling was intended (e.g. for the MLP).
for train_idx, test_idx in logo.split(X, y_tr, groups):
    X_train, X_test = X[train_idx], X[test_idx]
    y_clf_train, y_clf_test = y_tr[train_idx], y_tr[test_idx]

    # oversample the training fold only; the held-out fold is left untouched
    X_train, y_clf_train = sm.fit_resample(X_train, y_clf_train)

    # Recursive Feature Elimination and model building
    models = {
        'RF': ESTIMATOR_RF_CLF,
        'XGB': ESTIMATOR_XGB_CLF,
        'LGBM': ESTIMATOR_LGBM_CLF,
        'MLP': MLPClassifier(hidden_layer_sizes=(100, 30), max_iter=1000, random_state=RANDOM_STATE)#, learning_rate_init = 0.0005)
    }

    for model_name, model in models.items():
        if model_name == 'MLP':
            # Directly fit the MLP model without RFE
            fitted = model.fit(X_train, y_clf_train)
        else:
            # Use RFE for other models
            fitted = RFE(model, n_features_to_select=N_FEATURES, step=5).fit(X_train, y_clf_train)

        y_pred = fitted.predict(X_test)
        # Probability of class 1; not used by the metrics below
        y_prob = fitted.predict_proba(X_test)[:, 1]
        # Same value as fitted.score(X_test, y_clf_test), without predicting a second time
        accuracy = accuracy_score(y_clf_test, y_pred)

        f1_score_0 = f1_score(y_clf_test, y_pred, average='binary', pos_label=0)
        f1_score_1 = f1_score(y_clf_test, y_pred, average='binary', pos_label=1)
        f1_score_macro = f1_score(y_clf_test, y_pred, average='macro')

        accuracies, f1_scores_0, f1_scores_1, f1_scores_macro = fold_metrics[model_name]
        accuracies.append(accuracy)
        f1_scores_0.append(f1_score_0)
        f1_scores_1.append(f1_score_1)
        f1_scores_macro.append(f1_score_macro)

In [15]:
import numpy as np

# Calculate averages and standard deviations for all models
def calculate_stats(accuracies, f1_scores_0, f1_scores_1, f1_scores_macro):
    """Summarise per-fold metrics as mean and standard deviation.

    Each argument is a sequence of per-fold values. The returned dict holds, for every
    sequence, its ``np.mean`` under an ``avg_*`` key and its ``np.std`` (NumPy's default,
    i.e. population standard deviation) under a ``std_*`` key.
    """
    summary = {}
    metric_series = (
        ('avg_accuracy', 'std_accuracy', accuracies),
        ('avg_f1_score_0', 'std_f1_0', f1_scores_0),
        ('avg_f1_score_1', 'std_f1_1', f1_scores_1),
        ('avg_f1_score_macro', 'std_f1_macro', f1_scores_macro),
    )
    for avg_key, std_key, values in metric_series:
        summary[avg_key] = np.mean(values)
        summary[std_key] = np.std(values)
    return summary

# Mean/SD summary of the per-fold metrics, one dict per model
RF_stats = calculate_stats(
    accuracies=RF_accuracies,
    f1_scores_0=RF_f1_scores_0,
    f1_scores_1=RF_f1_scores_1,
    f1_scores_macro=RF_f1_scores_macro,
)
XGB_stats = calculate_stats(
    accuracies=XGB_accuracies,
    f1_scores_0=XGB_f1_scores_0,
    f1_scores_1=XGB_f1_scores_1,
    f1_scores_macro=XGB_f1_scores_macro,
)
LGBM_stats = calculate_stats(
    accuracies=LGBM_accuracies,
    f1_scores_0=LGBM_f1_scores_0,
    f1_scores_1=LGBM_f1_scores_1,
    f1_scores_macro=LGBM_f1_scores_macro,
)
MLP_stats = calculate_stats(
    accuracies=MLP_accuracies,
    f1_scores_0=MLP_f1_scores_0,
    f1_scores_1=MLP_f1_scores_1,
    f1_scores_macro=MLP_f1_scores_macro,
)

# Display name -> summary dict, in the order the rows appear in the results table
results = dict(zip(
    ('Random Forest', 'XGBoost', 'LightGBM', 'MLP'),
    (RF_stats, XGB_stats, LGBM_stats, MLP_stats),
))

# Function to format the results into a table
def format_results_table(results):
    """Format per-model summary statistics as a pipe-separated text table.

    ``results`` maps a model name to a dict with the keys ``avg_f1_score_0``/``std_f1_0``,
    ``avg_f1_score_1``/``std_f1_1``, ``avg_f1_score_macro``/``std_f1_macro`` and
    ``avg_accuracy``/``std_accuracy``. Returns two header lines followed by one row per
    model, each value shown as ``mean (sd)`` with three decimals; the last row has no
    trailing newline.
    """
    def mean_sd(model_stats, avg_key, std_key):
        # "0.441 (0.293)"-style cell
        return f"{model_stats[avg_key]:.3f} ({model_stats[std_key]:.3f})"

    header_lines = [
        "Model              | F1 (Fail) (SD)| F1 (Success) (SD) | Avg. F1 (SD)  | Avg. Accuracy (SD)",
        "-------------------|---------------|-------------------|---------------|--------------",
    ]

    body_lines = []
    for model_name, model_stats in results.items():
        cells = [
            f"{model_name:<18}",
            mean_sd(model_stats, 'avg_f1_score_0', 'std_f1_0'),
            # four extra spaces pad this cell to the width of its header
            mean_sd(model_stats, 'avg_f1_score_1', 'std_f1_1') + "    ",
            mean_sd(model_stats, 'avg_f1_score_macro', 'std_f1_macro'),
            mean_sd(model_stats, 'avg_accuracy', 'std_accuracy'),
        ]
        body_lines.append(" | ".join(cells))

    return "\n".join(header_lines) + "\n" + "\n".join(body_lines)

# Render the per-model summary table and display it
results_table = format_results_table(results)
print(results_table)


Model              | F1 (Fail) (SD)| F1 (Success) (SD) | Avg. F1 (SD)  | Avg. Accuracy (SD)
-------------------|---------------|-------------------|---------------|--------------
Random Forest      | 0.441 (0.293) | 0.717 (0.174)     | 0.592 (0.176) | 0.677 (0.158)
XGBoost            | 0.377 (0.271) | 0.674 (0.198)     | 0.526 (0.150) | 0.631 (0.142)
LightGBM           | 0.361 (0.329) | 0.699 (0.241)     | 0.543 (0.200) | 0.669 (0.174)
MLP                | 0.298 (0.303) | 0.647 (0.258)     | 0.472 (0.156) | 0.626 (0.192)
