# Pairwise Transformation

In [2]:
import os
import warnings
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.metrics import accuracy_score
from scipy.stats import pearsonr, kendalltau
from sklearn.model_selection import GroupKFold
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from statsmodels.miscmodels.ordinal_model import OrderedModel
from statsmodels.tools.sm_exceptions import HessianInversionWarning

warnings.simplefilter(action='ignore', category=FutureWarning)


In [3]:
# Hard-coded, machine-specific project root. After the chdir below, every
# relative path used in this notebook resolves against this folder.
project_directory = r'C:\Users\marco\OneDrive\Desktop\Final Year Project'
os.chdir(project_directory)
# Absolute path of the working directory as reported after the chdir above.
# NOTE(review): later cells rebind `base_dir` to a relative folder name, so
# its value depends on which cells have been executed.
base_dir = os.getcwd() 

- Evaluation Arousal

In [5]:
# Folder layout for the arousal experiment. All three are relative paths, so
# they resolve against the current working directory.
# NOTE: this rebinds the module-level `base_dir` to a relative folder name.
base_dir = 'RECOLA Ranking Algorithms'
# Where the per-fold pairwise training data (.pkl files) is cached.
arousal_dir = os.path.join(base_dir, 'Random Forest/Arousal')
# Where the per-participant evaluation CSV is written.
evaluation_results_dir = os.path.join(base_dir, 'Evaluation/RandomForest')


def pairwise_transformation(X, Y, participant_ids, fold_number):
    """Build (or reuse) the pairwise-difference training set for one CV fold.

    Within each participant, every ordered pair of distinct samples ``(i, j)``
    whose targets differ contributes one row ``X[i] - X[j]``, labelled ``1``
    when ``Y[i] > Y[j]`` and ``0`` when ``Y[i] < Y[j]``. Pairs with equal
    targets are skipped. Pairs are never formed across participants.

    The result is cached under ``arousal_dir`` as
    ``transformed_data_fold_<fold_number>.pkl`` and
    ``labels_fold_<fold_number>.pkl``. If both files already exist they are
    reused without being recomputed.

    NOTE: the cache is keyed by fold number only. Files written by an earlier
    version of this function, or for a different dataset or split, are reused
    as-is; delete them to force regeneration.

    Parameters
    ----------
    X : numpy.ndarray
        Feature rows, indexable by a boolean mask.
    Y : numpy.ndarray
        Target values aligned with ``X``.
    participant_ids : numpy.ndarray
        Participant identifier for each row of ``X``.
    fold_number : int
        Fold identifier used in the cache file names.

    Returns
    -------
    tuple of str
        ``(transformed_data_path, labels_path)``; the caller loads the files.
    """
    transformed_data_path = os.path.join(arousal_dir, f'transformed_data_fold_{fold_number}.pkl')
    labels_path = os.path.join(arousal_dir, f'labels_fold_{fold_number}.pkl')

    # Cache hit: the caller loads the files itself, so loading them here as
    # well would only double the I/O.
    if os.path.exists(transformed_data_path) and os.path.exists(labels_path):
        print(f"Files for fold {fold_number} already exist. Loading from disk.")
        return transformed_data_path, labels_path

    transformed_data = []
    labels = []

    for participant in np.unique(participant_ids):
        participant_mask = participant_ids == participant
        X_participant = X[participant_mask]
        Y_participant = Y[participant_mask]
        n_samples = len(X_participant)

        for i in range(n_samples):
            for j in range(n_samples):
                if j == i:
                    continue
                yi, yj = Y_participant[i], Y_participant[j]
                if yi > yj:
                    label = 1
                elif yi < yj:
                    label = 0
                else:
                    # Ties (and NaN comparisons) carry no ranking information.
                    continue
                # Always emit x_i - x_j. Both (i, j) and (j, i) are visited,
                # so flipping the difference for the "lower" case would emit
                # the same vector twice with opposite labels.
                transformed_data.append(X_participant[i] - X_participant[j])
                labels.append(label)

    # Make sure the cache folder exists before writing into it.
    os.makedirs(arousal_dir, exist_ok=True)
    dump(transformed_data, transformed_data_path)
    dump(labels, labels_path)
    print(f"Pairwise transformation for fold {fold_number} completed and data saved.")

    return transformed_data_path, labels_path

def concordance_correlation_coefficient(y_true, y_pred):
    """Return Lin's concordance correlation coefficient of two series.

    Computed as ``2 * cov / (var_true + var_pred + (mean_true - mean_pred)**2)``
    using population (``ddof=0``) moments. This equals the
    ``2 * r * sd_true * sd_pred`` formulation, but it does not go through
    Pearson's r, so a constant series gives ``0.0`` instead of NaN.

    Parameters
    ----------
    y_true, y_pred : array-like
        Equal-length numeric sequences.

    Returns
    -------
    float
        The CCC, or NaN when both series are constant and identical
        (denominator of zero, coefficient undefined).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mean_true, mean_pred = np.mean(y_true), np.mean(y_pred)
    var_true, var_pred = np.var(y_true), np.var(y_pred)
    covariance = np.mean((y_true - mean_true) * (y_pred - mean_pred))
    denominator = var_true + var_pred + (mean_true - mean_pred) ** 2
    if denominator == 0:
        # Both series are the same constant: agreement is undefined.
        return float('nan')
    return 2 * covariance / denominator

def pearson_correlation_coefficient(y_true, y_pred):
    """Return Pearson's r between ``y_true`` and ``y_pred``; the p-value is discarded."""
    return pearsonr(y_true, y_pred)[0]

def kendalls_tau_coefficient(y_true, y_pred):
    """Return Kendall's tau between ``y_true`` and ``y_pred``; the p-value is discarded."""
    return kendalltau(y_true, y_pred)[0]

def evaluate_individual_performance(clf, X_test, Y_test, group_labels):
    """Score a fitted pairwise classifier separately for each test participant.

    Each participant's feature rows are centred on that participant's own
    feature mean and passed to ``clf.predict_proba``. Column 1 of the result
    (presumably the probability of label 1 -- confirm against
    ``clf.classes_``) is used as a ranking score and compared with the raw
    target values.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba``
    X_test : numpy.ndarray
        Test feature rows.
    Y_test : numpy.ndarray
        Raw target values aligned with ``X_test``.
    group_labels : numpy.ndarray
        Participant identifier for each test row.

    Returns
    -------
    pandas.DataFrame
        One row per participant with columns ``Participant ID``, ``PCC``,
        ``CCC`` and ``KendallTau``.
    """
    rows = []

    for pid in np.unique(group_labels):
        member = group_labels == pid
        feats = X_test[member]
        targets = Y_test[member]

        # Difference of each sample from the participant's mean sample.
        centred = feats - np.mean(feats, axis=0)

        # Ranking score: second column of the predicted class probabilities.
        scores = clf.predict_proba(centred)[:, 1]

        pcc, _ = pearsonr(targets, scores)
        ccc = concordance_correlation_coefficient(targets, scores)
        tau, _ = kendalltau(targets, scores)

        rows.append({
            'Participant ID': pid,
            'PCC': pcc,
            'CCC': ccc,
            'KendallTau': tau,
        })

    return pd.DataFrame(rows)

# Kept so that code referring to this module-level name keeps working.
# process_data_median_arousal accumulates its results locally and does not
# append to this list.
evaluation_results = []

def process_data_median_arousal(file_path):
    """Run the pairwise random-forest ranking experiment for median arousal.

    Reads the CSV at ``file_path``, removes the excluded feature columns and
    the bookkeeping columns, and runs a 10-fold cross-validation with folds
    grouped by ``participant_id``. For each fold, the training rows are
    expanded with ``pairwise_transformation``, a ``RandomForestClassifier`` is
    fitted on the result, and the held-out participants are scored with
    ``evaluate_individual_performance``. The per-participant rows from all
    folds are concatenated and written to
    ``random_forest_evaluation_arousal.csv`` inside ``evaluation_results_dir``.

    Results are collected in a local list, so calling the function more than
    once does not duplicate rows in the saved file.

    Parameters
    ----------
    file_path : str
        Path of the input CSV; it must contain ``participant_id`` and
        ``median_arousal`` columns.
    """
    target_column = 'median_arousal'
    n_splits = 10
    print(f"\nProcessing data for {target_column}...")

    combined_df = pd.read_csv(file_path)
    print("Dataset loaded.")

    excluded_features = [
        'VIDEO_40_LLD_AU1', 'VIDEO_40_LLD_AU2', 'VIDEO_40_LLD_AU4', 'VIDEO_40_LLD_AU5', 'VIDEO_40_LLD_AU6', 'VIDEO_40_LLD_AU7', 'VIDEO_40_LLD_AU9', 'VIDEO_40_LLD_AU11', 'VIDEO_40_LLD_AU12', 'VIDEO_40_LLD_AU15',
        'VIDEO_40_LLD_AU17', 'VIDEO_40_LLD_AU20', 'VIDEO_40_LLD_AU23', 'VIDEO_40_LLD_AU24', 'VIDEO_40_LLD_AU25', 'VIDEO_40_LLD_Yaw', 'VIDEO_40_LLD_Pitch', 'VIDEO_40_LLD_Roll', 'VIDEO_40_LLD_Opt_mean',
        'VIDEO_40_LLD_Opt_std', 'VIDEO_40_LLD_AU1_delta', 'VIDEO_40_LLD_AU2_delta', 'VIDEO_40_LLD_AU4_delta', 'VIDEO_40_LLD_AU5_delta', 'VIDEO_40_LLD_AU6_delta', 'VIDEO_40_LLD_AU7_delta', 'VIDEO_40_LLD_AU9_delta',
        'VIDEO_40_LLD_AU11_delta', 'VIDEO_40_LLD_AU12_delta', 'VIDEO_40_LLD_AU15_delta', 'VIDEO_40_LLD_AU17_delta', 'VIDEO_40_LLD_AU20_delta', 'VIDEO_40_LLD_AU23_delta', 'VIDEO_40_LLD_AU24_delta', 'VIDEO_40_LLD_AU25_delta',
        'VIDEO_40_LLD_Yaw_delta', 'VIDEO_40_LLD_Pitch_delta', 'VIDEO_40_LLD_Roll_delta', 'VIDEO_40_LLD_Opt_mean_delta', 'VIDEO_40_LLD_Opt_std_delta', 'Face_detection_probability', 'ECG_54_LLD_ECG_HR', 'ECG_54_LLD_ECG_HRV',
        'ECG_54_LLD_ECG_zcr', 'ECG_54_LLD_ECG_FFT_1', 'ECG_54_LLD_ECG_FFT_2', 'ECG_54_LLD_ECG_FFT_3', 'ECG_54_LLD_ECG_FFT_4', 'ECG_54_LLD_ECG_FFT_5', 'ECG_54_LLD_ECG_FFT_6', 'ECG_54_LLD_ECG_FFT_7', 'ECG_54_LLD_ECG_FFT_8',
        'ECG_54_LLD_ECG_FFT_9', 'ECG_54_LLD_ECG_FFT_10', 'ECG_54_LLD_ECG_FFT_11', 'ECG_54_LLD_ECG_FFT_12', 'ECG_54_LLD_ECG_FFT_entropy', 'ECG_54_LLD_ECG_FFT_mean_frequency', 'ECG_54_LLD_ECG_FFT_slope', 'ECG_54_LLD_ECG_mean',
        'ECG_54_LLD_ECG_std', 'ECG_54_LLD_ECG_kurtosis', 'ECG_54_LLD_ECG_skewness', 'ECG_54_LLD_ECG_NSImn', 'ECG_54_LLD_ECG_NLDmn', 'ECG_54_LLD_ECG_VLF', 'ECG_54_LLD_ECG_LF', 'ECG_54_LLD_ECG_HF', 'ECG_54_LLD_ECG_LFHF', 'ECG_54_LLD_ECG_zcr_delta',
        'ECG_54_LLD_ECG_FFT_1_delta', 'ECG_54_LLD_ECG_FFT_2_delta', 'ECG_54_LLD_ECG_FFT_3_delta', 'ECG_54_LLD_ECG_FFT_4_delta', 'ECG_54_LLD_ECG_FFT_5_delta', 'ECG_54_LLD_ECG_FFT_6_delta', 'ECG_54_LLD_ECG_FFT_7_delta', 'ECG_54_LLD_ECG_FFT_8_delta',
        'ECG_54_LLD_ECG_FFT_9_delta', 'ECG_54_LLD_ECG_FFT_10_delta', 'ECG_54_LLD_ECG_FFT_11_delta', 'ECG_54_LLD_ECG_FFT_12_delta', 'ECG_54_LLD_ECG_FFT_entropy_delta', 'ECG_54_LLD_ECG_FFT_mean_frequency_delta', 'ECG_54_LLD_ECG_FFT_slope_delta', 'ECG_54_LLD_ECG_mean_delta',
        'ECG_54_LLD_ECG_std_delta', 'ECG_54_LLD_ECG_kurtosis_delta', 'ECG_54_LLD_ECG_skewness_delta', 'ECG_54_LLD_ECG_NSImn_delta', 'ECG_54_LLD_ECG_NLDmn_delta', 'ECG_54_LLD_ECG_VLF_delta', 'ECG_54_LLD_ECG_LF_delta', 'ECG_54_LLD_ECG_HF_delta', 'ECG_54_LLD_ECG_LFHF_delta',
        'EDA_62_LLD_time_code', 'EDA_62_LLD_EDA_slope', 'EDA_62_LLD_EDA_std', 'EDA_62_LLD_SCR_FFT_entropy', 'EDA_62_LLD_SCR_FFT_mean_frequency', 'EDA_62_LLD_EDA_mean', 'EDA_62_LLD_EDA_meanD', 'EDA_62_LLD_EDA_meanDneg', 'EDA_62_LLD_EDA_prop', 'EDA_62_LLD_EDA_Xbound', 'EDA_62_LLD_EDA_kurtosis',
        'EDA_62_LLD_EDA_skewness', 'EDA_62_LLD_EDA_NSImn', 'EDA_62_LLD_EDA_NLDmn', 'EDA_62_LLD_SCL_mean', 'EDA_62_LLD_SCL_meanD', 'EDA_62_LLD_SCL_meanDneg', 'EDA_62_LLD_SCL_prop', 'EDA_62_LLD_SCL_Xbound', 'EDA_62_LLD_SCL_kurtosis', 'EDA_62_LLD_SCL_skewness', 'EDA_62_LLD_SCL_NSImn',
        'EDA_62_LLD_SCL_NLDmn', 'EDA_62_LLD_SCR_mean', 'EDA_62_LLD_SCR_meanD', 'EDA_62_LLD_SCR_meanDneg', 'EDA_62_LLD_SCR_prop', 'EDA_62_LLD_SCR_Xbound', 'EDA_62_LLD_SCR_kurtosis', 'EDA_62_LLD_SCR_skewness', 'EDA_62_LLD_SCR_NSImn', 'EDA_62_LLD_SCR_NLDmn', 'EDA_62_LLD_EDA_slope_delta',
        'EDA_62_LLD_EDA_std_delta', 'EDA_62_LLD_SCR_FFT_entropy_delta', 'EDA_62_LLD_SCR_FFT_mean_frequency_delta', 'EDA_62_LLD_EDA_mean_delta', 'EDA_62_LLD_EDA_meanD_delta', 'EDA_62_LLD_EDA_meanDneg_delta', 'EDA_62_LLD_EDA_prop_delta', 'EDA_62_LLD_EDA_Xbound_delta', 'EDA_62_LLD_EDA_kurtosis_delta',
        'EDA_62_LLD_EDA_skewness_delta', 'EDA_62_LLD_EDA_NSImn_delta', 'EDA_62_LLD_EDA_NLDmn_delta', 'EDA_62_LLD_SCL_mean_delta', 'EDA_62_LLD_SCL_meanD_delta', 'EDA_62_LLD_SCL_meanDneg_delta', 'EDA_62_LLD_SCL_prop_delta', 'EDA_62_LLD_SCL_Xbound_delta', 'EDA_62_LLD_SCL_kurtosis_delta', 'EDA_62_LLD_SCL_skewness_delta',
        'EDA_62_LLD_SCL_NSImn_delta', 'EDA_62_LLD_SCL_NLDmn_delta', 'EDA_62_LLD_SCR_mean_delta', 'EDA_62_LLD_SCR_meanD_delta', 'EDA_62_LLD_SCR_meanDneg_delta', 'EDA_62_LLD_SCR_prop_delta', 'EDA_62_LLD_SCR_Xbound_delta', 'EDA_62_LLD_SCR_kurtosis_delta', 'EDA_62_LLD_SCR_skewness_delta', 'EDA_62_LLD_SCR_NSImn_delta', 'EDA_62_LLD_SCR_NLDmn_delta'

    ]

    # Build the exclusion set once; membership tests are then O(1) per column
    # instead of re-concatenating two lists for every column.
    non_feature_columns = set(excluded_features)
    non_feature_columns.update(['participant_id', 'median_arousal', 'median_valence', 'time_window'])
    features = [col for col in combined_df.columns if col not in non_feature_columns]

    X = combined_df[features].values
    Y = combined_df[target_column].values
    participant_ids = combined_df['participant_id'].values

    print("Starting cross-validation for arousal...")
    group_kfold = GroupKFold(n_splits=n_splits)

    fold_results = []

    for fold, (train_idx, test_idx) in enumerate(group_kfold.split(X, Y, groups=participant_ids)):
        print("\n=====================================")
        print(f"Fold {fold + 1}/{n_splits}")
        X_train = X[train_idx]
        Y_train = Y[train_idx]
        participants_train = participant_ids[train_idx]
        X_test, Y_test = X[test_idx], Y[test_idx]
        group_labels_test = participant_ids[test_idx]

        # The zero-based fold index is part of the cache file names.
        transformed_data_path, labels_path = pairwise_transformation(X_train, Y_train, participants_train, fold)

        # pairwise_transformation returns paths, not data.
        X_train_transformed = load(transformed_data_path)
        Y_train_transformed = load(labels_path)

        clf = RandomForestClassifier(n_estimators=15, max_depth=10, max_features='sqrt', min_samples_split=4, min_samples_leaf=2, n_jobs=-1)
        print("Starting Random Forest training...")
        clf.fit(X_train_transformed, Y_train_transformed)

        # One result row per held-out participant.
        fold_results.append(evaluate_individual_performance(clf, X_test, Y_test, group_labels_test))

    combined_results_df = pd.concat(fold_results, ignore_index=True)
    os.makedirs(evaluation_results_dir, exist_ok=True)
    combined_results_csv_path = os.path.join(evaluation_results_dir, 'random_forest_evaluation_arousal.csv')
    combined_results_df.to_csv(combined_results_csv_path, index=False)
    print("Evaluation results saved.")

# Locate the interval-level dataset under `base_dir` and run the arousal
# evaluation on it.
input_path = os.path.join(base_dir, 'RECOLA_Intervals_Data','ArousalValenceTimeSeries.csv')
process_data_median_arousal(input_path)


Processing data for median_arousal...
Dataset loaded.
Starting cross-validation for arousal...

Fold 1/10
Files for fold 0 already exist. Loading from disk.
Starting Random Forest training...

Fold 2/10
Files for fold 1 already exist. Loading from disk.
Starting Random Forest training...

Fold 3/10
Files for fold 2 already exist. Loading from disk.
Starting Random Forest training...

Fold 4/10
Files for fold 3 already exist. Loading from disk.
Starting Random Forest training...

Fold 5/10
Files for fold 4 already exist. Loading from disk.
Starting Random Forest training...

Fold 6/10
Files for fold 5 already exist. Loading from disk.
Starting Random Forest training...

Fold 7/10
Files for fold 6 already exist. Loading from disk.
Starting Random Forest training...

Fold 8/10
Files for fold 7 already exist. Loading from disk.
Starting Random Forest training...

Fold 9/10
Pairwise transformation for fold 8 completed and data saved.
Starting Random Forest training...

Fold 10/10
Pairwise 

- Evaluation Valence

In [6]:
# Folder layout for the valence experiment. All three are relative paths, so
# they resolve against the current working directory.
# NOTE: this rebinds the module-level `base_dir` to a relative folder name.
base_dir = 'RECOLA Ranking Algorithms'
# Where the per-fold pairwise training data (.pkl files) is cached.
valence_dir = os.path.join(base_dir, 'Random Forest/Valence')
# Where the per-participant evaluation CSV is written.
evaluation_results_dir = os.path.join(base_dir, 'Evaluation/RandomForest')

def pairwise_transformation(X, Y, participant_ids, fold_number):
    """Build (or reuse) the pairwise-difference training set for one CV fold.

    Within each participant, every ordered pair of distinct samples ``(i, j)``
    whose targets differ contributes one row ``X[i] - X[j]``, labelled ``1``
    when ``Y[i] > Y[j]`` and ``0`` when ``Y[i] < Y[j]``. Pairs with equal
    targets are skipped. Pairs are never formed across participants.

    The result is cached under ``valence_dir`` as
    ``transformed_data_fold_<fold_number>.pkl`` and
    ``labels_fold_<fold_number>.pkl``. If both files already exist they are
    reused without being recomputed.

    NOTE: the cache is keyed by fold number only. Files written by an earlier
    version of this function, or for a different dataset or split, are reused
    as-is; delete them to force regeneration.

    Parameters
    ----------
    X : numpy.ndarray
        Feature rows, indexable by a boolean mask.
    Y : numpy.ndarray
        Target values aligned with ``X``.
    participant_ids : numpy.ndarray
        Participant identifier for each row of ``X``.
    fold_number : int
        Fold identifier used in the cache file names.

    Returns
    -------
    tuple of str
        ``(transformed_data_path, labels_path)``; the caller loads the files.
    """
    transformed_data_path = os.path.join(valence_dir, f'transformed_data_fold_{fold_number}.pkl')
    labels_path = os.path.join(valence_dir, f'labels_fold_{fold_number}.pkl')

    # Cache hit: the caller loads the files itself, so loading them here as
    # well would only double the I/O.
    if os.path.exists(transformed_data_path) and os.path.exists(labels_path):
        print(f"Files for fold {fold_number} already exist. Loading from disk.")
        return transformed_data_path, labels_path

    transformed_data = []
    labels = []

    for participant in np.unique(participant_ids):
        participant_mask = participant_ids == participant
        X_participant = X[participant_mask]
        Y_participant = Y[participant_mask]
        n_samples = len(X_participant)

        for i in range(n_samples):
            for j in range(n_samples):
                if j == i:
                    continue
                yi, yj = Y_participant[i], Y_participant[j]
                if yi > yj:
                    label = 1
                elif yi < yj:
                    label = 0
                else:
                    # Ties (and NaN comparisons) carry no ranking information.
                    continue
                # Always emit x_i - x_j. Both (i, j) and (j, i) are visited,
                # so flipping the difference for the "lower" case would emit
                # the same vector twice with opposite labels.
                transformed_data.append(X_participant[i] - X_participant[j])
                labels.append(label)

    # Make sure the cache folder exists before writing into it.
    os.makedirs(valence_dir, exist_ok=True)
    dump(transformed_data, transformed_data_path)
    dump(labels, labels_path)
    print(f"Pairwise transformation for fold {fold_number} completed and data saved.")

    return transformed_data_path, labels_path

def concordance_correlation_coefficient(y_true, y_pred):
    """Return Lin's concordance correlation coefficient of two series.

    Computed as ``2 * cov / (var_true + var_pred + (mean_true - mean_pred)**2)``
    using population (``ddof=0``) moments. This equals the
    ``2 * r * sd_true * sd_pred`` formulation, but it does not go through
    Pearson's r, so a constant series gives ``0.0`` instead of NaN.

    Parameters
    ----------
    y_true, y_pred : array-like
        Equal-length numeric sequences.

    Returns
    -------
    float
        The CCC, or NaN when both series are constant and identical
        (denominator of zero, coefficient undefined).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mean_true, mean_pred = np.mean(y_true), np.mean(y_pred)
    var_true, var_pred = np.var(y_true), np.var(y_pred)
    covariance = np.mean((y_true - mean_true) * (y_pred - mean_pred))
    denominator = var_true + var_pred + (mean_true - mean_pred) ** 2
    if denominator == 0:
        # Both series are the same constant: agreement is undefined.
        return float('nan')
    return 2 * covariance / denominator

def pearson_correlation_coefficient(y_true, y_pred):
    """Return Pearson's r between ``y_true`` and ``y_pred``; the p-value is discarded."""
    return pearsonr(y_true, y_pred)[0]

def kendalls_tau_coefficient(y_true, y_pred):
    """Return Kendall's tau between ``y_true`` and ``y_pred``; the p-value is discarded."""
    return kendalltau(y_true, y_pred)[0]

def evaluate_individual_performance(clf, X_test, Y_test, group_labels):
    """Score a fitted pairwise classifier separately for each test participant.

    Each participant's feature rows are centred on that participant's own
    feature mean and passed to ``clf.predict_proba``. Column 1 of the result
    (presumably the probability of label 1 -- confirm against
    ``clf.classes_``) is used as a ranking score and compared with the raw
    target values.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba``
    X_test : numpy.ndarray
        Test feature rows.
    Y_test : numpy.ndarray
        Raw target values aligned with ``X_test``.
    group_labels : numpy.ndarray
        Participant identifier for each test row.

    Returns
    -------
    pandas.DataFrame
        One row per participant with columns ``Participant ID``, ``PCC``,
        ``CCC`` and ``KendallTau``.
    """
    rows = []

    for pid in np.unique(group_labels):
        member = group_labels == pid
        feats = X_test[member]
        targets = Y_test[member]

        # Difference of each sample from the participant's mean sample.
        centred = feats - np.mean(feats, axis=0)

        # Ranking score: second column of the predicted class probabilities.
        scores = clf.predict_proba(centred)[:, 1]

        pcc, _ = pearsonr(targets, scores)
        ccc = concordance_correlation_coefficient(targets, scores)
        tau, _ = kendalltau(targets, scores)

        rows.append({
            'Participant ID': pid,
            'PCC': pcc,
            'CCC': ccc,
            'KendallTau': tau,
        })

    return pd.DataFrame(rows)

# Kept so that code referring to this module-level name keeps working.
# process_data_median_valence accumulates its results locally and does not
# append to this list.
evaluation_results = []

def process_data_median_valence(file_path):
    """Run the pairwise random-forest ranking experiment for median valence.

    Reads the CSV at ``file_path``, removes the excluded feature columns and
    the bookkeeping columns, and runs a 10-fold cross-validation with folds
    grouped by ``participant_id``. For each fold, the training rows are
    expanded with ``pairwise_transformation``, a ``RandomForestClassifier`` is
    fitted on the result, and the held-out participants are scored with
    ``evaluate_individual_performance``. The per-participant rows from all
    folds are concatenated and written to
    ``random_forest_evaluation_valence.csv`` inside ``evaluation_results_dir``.

    Results are collected in a local list, so calling the function more than
    once does not duplicate rows in the saved file.

    Parameters
    ----------
    file_path : str
        Path of the input CSV; it must contain ``participant_id`` and
        ``median_valence`` columns.
    """
    target_column = 'median_valence'
    n_splits = 10
    print(f"\nProcessing data for {target_column}...")

    combined_df = pd.read_csv(file_path)
    print("Dataset loaded.")

    excluded_features = [
        'VIDEO_40_LLD_AU1', 'VIDEO_40_LLD_AU2', 'VIDEO_40_LLD_AU4', 'VIDEO_40_LLD_AU5', 'VIDEO_40_LLD_AU6', 'VIDEO_40_LLD_AU7', 'VIDEO_40_LLD_AU9', 'VIDEO_40_LLD_AU11', 'VIDEO_40_LLD_AU12', 'VIDEO_40_LLD_AU15',
        'VIDEO_40_LLD_AU17', 'VIDEO_40_LLD_AU20', 'VIDEO_40_LLD_AU23', 'VIDEO_40_LLD_AU24', 'VIDEO_40_LLD_AU25', 'VIDEO_40_LLD_Yaw', 'VIDEO_40_LLD_Pitch', 'VIDEO_40_LLD_Roll', 'VIDEO_40_LLD_Opt_mean',
        'VIDEO_40_LLD_Opt_std', 'VIDEO_40_LLD_AU1_delta', 'VIDEO_40_LLD_AU2_delta', 'VIDEO_40_LLD_AU4_delta', 'VIDEO_40_LLD_AU5_delta', 'VIDEO_40_LLD_AU6_delta', 'VIDEO_40_LLD_AU7_delta', 'VIDEO_40_LLD_AU9_delta',
        'VIDEO_40_LLD_AU11_delta', 'VIDEO_40_LLD_AU12_delta', 'VIDEO_40_LLD_AU15_delta', 'VIDEO_40_LLD_AU17_delta', 'VIDEO_40_LLD_AU20_delta', 'VIDEO_40_LLD_AU23_delta', 'VIDEO_40_LLD_AU24_delta', 'VIDEO_40_LLD_AU25_delta',
        'VIDEO_40_LLD_Yaw_delta', 'VIDEO_40_LLD_Pitch_delta', 'VIDEO_40_LLD_Roll_delta', 'VIDEO_40_LLD_Opt_mean_delta', 'VIDEO_40_LLD_Opt_std_delta', 'Face_detection_probability', 'ECG_54_LLD_ECG_HR', 'ECG_54_LLD_ECG_HRV',
        'ECG_54_LLD_ECG_zcr', 'ECG_54_LLD_ECG_FFT_1', 'ECG_54_LLD_ECG_FFT_2', 'ECG_54_LLD_ECG_FFT_3', 'ECG_54_LLD_ECG_FFT_4', 'ECG_54_LLD_ECG_FFT_5', 'ECG_54_LLD_ECG_FFT_6', 'ECG_54_LLD_ECG_FFT_7', 'ECG_54_LLD_ECG_FFT_8',
        'ECG_54_LLD_ECG_FFT_9', 'ECG_54_LLD_ECG_FFT_10', 'ECG_54_LLD_ECG_FFT_11', 'ECG_54_LLD_ECG_FFT_12', 'ECG_54_LLD_ECG_FFT_entropy', 'ECG_54_LLD_ECG_FFT_mean_frequency', 'ECG_54_LLD_ECG_FFT_slope', 'ECG_54_LLD_ECG_mean',
        'ECG_54_LLD_ECG_std', 'ECG_54_LLD_ECG_kurtosis', 'ECG_54_LLD_ECG_skewness', 'ECG_54_LLD_ECG_NSImn', 'ECG_54_LLD_ECG_NLDmn', 'ECG_54_LLD_ECG_VLF', 'ECG_54_LLD_ECG_LF', 'ECG_54_LLD_ECG_HF', 'ECG_54_LLD_ECG_LFHF', 'ECG_54_LLD_ECG_zcr_delta',
        'ECG_54_LLD_ECG_FFT_1_delta', 'ECG_54_LLD_ECG_FFT_2_delta', 'ECG_54_LLD_ECG_FFT_3_delta', 'ECG_54_LLD_ECG_FFT_4_delta', 'ECG_54_LLD_ECG_FFT_5_delta', 'ECG_54_LLD_ECG_FFT_6_delta', 'ECG_54_LLD_ECG_FFT_7_delta', 'ECG_54_LLD_ECG_FFT_8_delta',
        'ECG_54_LLD_ECG_FFT_9_delta', 'ECG_54_LLD_ECG_FFT_10_delta', 'ECG_54_LLD_ECG_FFT_11_delta', 'ECG_54_LLD_ECG_FFT_12_delta', 'ECG_54_LLD_ECG_FFT_entropy_delta', 'ECG_54_LLD_ECG_FFT_mean_frequency_delta', 'ECG_54_LLD_ECG_FFT_slope_delta', 'ECG_54_LLD_ECG_mean_delta',
        'ECG_54_LLD_ECG_std_delta', 'ECG_54_LLD_ECG_kurtosis_delta', 'ECG_54_LLD_ECG_skewness_delta', 'ECG_54_LLD_ECG_NSImn_delta', 'ECG_54_LLD_ECG_NLDmn_delta', 'ECG_54_LLD_ECG_VLF_delta', 'ECG_54_LLD_ECG_LF_delta', 'ECG_54_LLD_ECG_HF_delta', 'ECG_54_LLD_ECG_LFHF_delta',
        'EDA_62_LLD_time_code', 'EDA_62_LLD_EDA_slope', 'EDA_62_LLD_EDA_std', 'EDA_62_LLD_SCR_FFT_entropy', 'EDA_62_LLD_SCR_FFT_mean_frequency', 'EDA_62_LLD_EDA_mean', 'EDA_62_LLD_EDA_meanD', 'EDA_62_LLD_EDA_meanDneg', 'EDA_62_LLD_EDA_prop', 'EDA_62_LLD_EDA_Xbound', 'EDA_62_LLD_EDA_kurtosis',
        'EDA_62_LLD_EDA_skewness', 'EDA_62_LLD_EDA_NSImn', 'EDA_62_LLD_EDA_NLDmn', 'EDA_62_LLD_SCL_mean', 'EDA_62_LLD_SCL_meanD', 'EDA_62_LLD_SCL_meanDneg', 'EDA_62_LLD_SCL_prop', 'EDA_62_LLD_SCL_Xbound', 'EDA_62_LLD_SCL_kurtosis', 'EDA_62_LLD_SCL_skewness', 'EDA_62_LLD_SCL_NSImn',
        'EDA_62_LLD_SCL_NLDmn', 'EDA_62_LLD_SCR_mean', 'EDA_62_LLD_SCR_meanD', 'EDA_62_LLD_SCR_meanDneg', 'EDA_62_LLD_SCR_prop', 'EDA_62_LLD_SCR_Xbound', 'EDA_62_LLD_SCR_kurtosis', 'EDA_62_LLD_SCR_skewness', 'EDA_62_LLD_SCR_NSImn', 'EDA_62_LLD_SCR_NLDmn', 'EDA_62_LLD_EDA_slope_delta',
        'EDA_62_LLD_EDA_std_delta', 'EDA_62_LLD_SCR_FFT_entropy_delta', 'EDA_62_LLD_SCR_FFT_mean_frequency_delta', 'EDA_62_LLD_EDA_mean_delta', 'EDA_62_LLD_EDA_meanD_delta', 'EDA_62_LLD_EDA_meanDneg_delta', 'EDA_62_LLD_EDA_prop_delta', 'EDA_62_LLD_EDA_Xbound_delta', 'EDA_62_LLD_EDA_kurtosis_delta',
        'EDA_62_LLD_EDA_skewness_delta', 'EDA_62_LLD_EDA_NSImn_delta', 'EDA_62_LLD_EDA_NLDmn_delta', 'EDA_62_LLD_SCL_mean_delta', 'EDA_62_LLD_SCL_meanD_delta', 'EDA_62_LLD_SCL_meanDneg_delta', 'EDA_62_LLD_SCL_prop_delta', 'EDA_62_LLD_SCL_Xbound_delta', 'EDA_62_LLD_SCL_kurtosis_delta', 'EDA_62_LLD_SCL_skewness_delta',
        'EDA_62_LLD_SCL_NSImn_delta', 'EDA_62_LLD_SCL_NLDmn_delta', 'EDA_62_LLD_SCR_mean_delta', 'EDA_62_LLD_SCR_meanD_delta', 'EDA_62_LLD_SCR_meanDneg_delta', 'EDA_62_LLD_SCR_prop_delta', 'EDA_62_LLD_SCR_Xbound_delta', 'EDA_62_LLD_SCR_kurtosis_delta', 'EDA_62_LLD_SCR_skewness_delta', 'EDA_62_LLD_SCR_NSImn_delta', 'EDA_62_LLD_SCR_NLDmn_delta'

    ]

    # Build the exclusion set once; membership tests are then O(1) per column
    # instead of re-concatenating two lists for every column.
    non_feature_columns = set(excluded_features)
    non_feature_columns.update(['participant_id', 'median_arousal', 'median_valence', 'time_window'])
    features = [col for col in combined_df.columns if col not in non_feature_columns]

    X = combined_df[features].values
    Y = combined_df[target_column].values
    participant_ids = combined_df['participant_id'].values

    # The original message said "median arousal" here (copy-paste slip).
    print("Starting cross-validation for valence...")
    group_kfold = GroupKFold(n_splits=n_splits)

    fold_results = []

    for fold, (train_idx, test_idx) in enumerate(group_kfold.split(X, Y, groups=participant_ids)):
        print("\n=====================================")
        print(f"Fold {fold + 1}/{n_splits}")
        X_train = X[train_idx]
        Y_train = Y[train_idx]
        participants_train = participant_ids[train_idx]
        X_test, Y_test = X[test_idx], Y[test_idx]
        group_labels_test = participant_ids[test_idx]

        # The zero-based fold index is part of the cache file names.
        transformed_data_path, labels_path = pairwise_transformation(X_train, Y_train, participants_train, fold)

        # pairwise_transformation returns paths, not data.
        X_train_transformed = load(transformed_data_path)
        Y_train_transformed = load(labels_path)

        clf = RandomForestClassifier(n_estimators=15, max_depth=10, max_features='sqrt', min_samples_split=4, min_samples_leaf=2, n_jobs=-1)
        print("Starting Random Forest training...")
        clf.fit(X_train_transformed, Y_train_transformed)

        # One result row per held-out participant.
        fold_results.append(evaluate_individual_performance(clf, X_test, Y_test, group_labels_test))

    combined_results_df = pd.concat(fold_results, ignore_index=True)
    os.makedirs(evaluation_results_dir, exist_ok=True)
    combined_results_csv_path = os.path.join(evaluation_results_dir, 'random_forest_evaluation_valence.csv')
    combined_results_df.to_csv(combined_results_csv_path, index=False)
    print("Evaluation results saved.")

# The dataset path is relative to the current working directory; run the
# valence evaluation on it.
input_path = os.path.join('RECOLA Ranking Algorithms', 'RECOLA_Intervals_Data','ArousalValenceTimeSeries.csv')
process_data_median_valence(input_path)


Processing data for median_valence...
Dataset loaded.
Starting cross-validation for median arousal...

Fold 1/10
Pairwise transformation for fold 0 completed and data saved.
Starting Random Forest training...

Fold 2/10
Pairwise transformation for fold 1 completed and data saved.
Starting Random Forest training...

Fold 3/10
Pairwise transformation for fold 2 completed and data saved.
Starting Random Forest training...

Fold 4/10
Pairwise transformation for fold 3 completed and data saved.
Starting Random Forest training...

Fold 5/10
Pairwise transformation for fold 4 completed and data saved.
Starting Random Forest training...

Fold 6/10
Pairwise transformation for fold 5 completed and data saved.
Starting Random Forest training...

Fold 7/10
Pairwise transformation for fold 6 completed and data saved.
Starting Random Forest training...

Fold 8/10
Pairwise transformation for fold 7 completed and data saved.
Starting Random Forest training...

Fold 9/10
Pairwise transformation for fo

# Ordinal Logistic Regression

In [6]:
def label_quartiles(df, feature, label_col):
    """Write per-participant ordinal bin labels for ``feature`` into ``df``.

    For each participant, ten percentiles of that participant's ``feature``
    values (10, 14, 25, 28, 43, 50, 58, 72, 75, 86) are used as bin edges,
    padded with -inf / +inf. Each row gets the zero-based index of the bin
    its value falls into (0-10 for finite values). ``df`` is modified in
    place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``participant_id``, ``feature`` and ``label_col`` columns.
    feature : str
        Column holding the continuous values to discretise.
    label_col : str
        Column that receives the bin labels.
    """
    cut_points = (10, 25, 50, 75, 86, 72, 58, 43, 28, 14)

    for pid in df['participant_id'].unique():
        rows = df[df['participant_id'] == pid]
        values = rows[feature]

        # Thresholds are computed from this participant's own values only.
        thresholds = sorted(np.percentile(values, q) for q in cut_points)
        bins = [-np.inf, *thresholds, np.inf]

        # digitize is 1-based because of the -inf edge; shift to start at 0.
        df.loc[rows.index, label_col] = np.digitize(values, bins) - 1

def concordance_correlation_coefficient(y_true, y_pred):
    """Return Lin's concordance correlation coefficient of two series.

    Computed as ``2 * cov / (var_true + var_pred + (mean_true - mean_pred)**2)``
    using population (``ddof=0``) moments. This equals the
    ``2 * r * sd_true * sd_pred`` formulation, but it does not go through
    Pearson's r, so a constant series gives ``0.0`` instead of NaN.

    Parameters
    ----------
    y_true, y_pred : array-like
        Equal-length numeric sequences.

    Returns
    -------
    float
        The CCC, or NaN when both series are constant and identical
        (denominator of zero, coefficient undefined).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mean_true, mean_pred = np.mean(y_true), np.mean(y_pred)
    var_true, var_pred = np.var(y_true), np.var(y_pred)
    covariance = np.mean((y_true - mean_true) * (y_pred - mean_pred))
    denominator = var_true + var_pred + (mean_true - mean_pred) ** 2
    if denominator == 0:
        # Both series are the same constant: agreement is undefined.
        return float('nan')
    return 2 * covariance / denominator

def pearson_correlation_coefficient(y_true, y_pred):
    """Return Pearson's r between ``y_true`` and ``y_pred``; the p-value is discarded."""
    return pearsonr(y_true, y_pred)[0]

def kendalls_tau_coefficient(y_true, y_pred):
    """Return Kendall's tau between ``y_true`` and ``y_pred``; the p-value is discarded."""
    return kendalltau(y_true, y_pred)[0]

def evaluate_individual_performance(X_test, Y_test, group_labels, linpred_test):
    """Compute PCC, CCC and Kendall's tau per participant for one test fold.

    Parameters
    ----------
    X_test : numpy.ndarray
        Accepted but not used by this function.
    Y_test : numpy.ndarray
        Ground-truth labels of the test rows.
    group_labels : numpy.ndarray
        Participant identifier for each test row.
    linpred_test : numpy.ndarray
        Model predictions aligned with ``Y_test``.

    Returns
    -------
    pandas.DataFrame
        One row per participant with columns ``Participant ID``, ``PCC``,
        ``CCC`` and ``KendallTau``.
    """
    rows = []
    for pid in np.unique(group_labels):
        member = group_labels == pid
        truth = Y_test[member]
        predicted = linpred_test[member]

        pcc, _ = pearsonr(truth, predicted)
        ccc = concordance_correlation_coefficient(truth, predicted)
        tau, _ = kendalltau(truth, predicted)

        rows.append({
            'Participant ID': pid,
            'PCC': pcc,
            'CCC': ccc,
            'KendallTau': tau,
        })
    return pd.DataFrame(rows)

def process_target(df, features, label_col, participant_ids, group_kfold, target_name):
    """Cross-validate an ordinal logit model for one target and save the scores.

    For every split produced by ``group_kfold`` (grouped by
    ``participant_ids``), an ``OrderedModel`` with a logit link is fitted on
    the training rows with BFGS. Its linear predictor on the test rows is then
    scored per participant by ``evaluate_individual_performance``. All fold
    results are concatenated and written to
    ``<target_name>_evaluation_results.csv`` in the ordinal-regression
    evaluation folder.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing ``features``, ``label_col`` and ``participant_id``.
    features : list of str
        Predictor column names.
    label_col : str
        Column holding the ordinal labels.
    participant_ids : array-like
        Group label per row, passed to the splitter.
    group_kfold : splitter exposing ``split(X, y, groups=...)``
    target_name : str
        Used in the output file name and in the log message.
    """
    X = df[features].values
    Y = df[label_col].values
    per_fold_frames = []

    for train_index, test_index in group_kfold.split(X, Y, groups=participant_ids):
        group_labels_test = df['participant_id'].iloc[test_index].values
        X_train, Y_train = X[train_index], Y[train_index]
        X_test, Y_test = X[test_index], Y[test_index]

        # Silence Hessian-inversion warnings while fitting and predicting.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', HessianInversionWarning)
            result = OrderedModel(Y_train, X_train, distr='logit').fit(method='bfgs', disp=False)

            linpred_test = result.predict(X_test, which='linpred')
            per_fold_frames.append(
                evaluate_individual_performance(X_test, Y_test, group_labels_test, linpred_test)
            )

    output_filename = os.path.join(
        'RECOLA Ranking Algorithms/Evaluation/Ordinal Logistic Regression',
        f"{target_name}_evaluation_results.csv",
    )
    pd.concat(per_fold_frames, ignore_index=True).to_csv(output_filename, index=False)
    print(f"Results for {target_name} saved in {output_filename}.")

def process_data(filepath):
    """Load the dataset, derive ordinal labels and evaluate both targets.

    Adds ``arousal_quartile`` and ``valence_quartile`` columns (initialised to
    -1, then filled by ``label_quartiles``). It then runs ``process_target``
    for arousal and for valence using a 10-fold ``GroupKFold`` grouped by
    ``participant_id``.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    """
    df = pd.read_csv(filepath)

    # (continuous source column, derived label column, name used in outputs)
    targets = (
        ('median_arousal', 'arousal_quartile', 'arousal'),
        ('median_valence', 'valence_quartile', 'valence'),
    )

    for _, label_col, _ in targets:
        df[label_col] = -1
    for source_col, label_col, _ in targets:
        label_quartiles(df, source_col, label_col)

    group_kfold = GroupKFold(n_splits=10)
    participant_ids = df['participant_id'].values

    # Everything that is not an identifier, a raw target or a derived label.
    non_feature_columns = {
        'participant_id', 'median_arousal', 'median_valence',
        'time_window', 'arousal_quartile', 'valence_quartile',
    }
    features = [col for col in df.columns if col not in non_feature_columns]

    for _, label_col, target_name in targets:
        process_target(df, features, label_col, participant_ids, group_kfold, target_name)

# Entry point for the ordinal-regression experiment.
# The path is relative to the working directory (the project root after the
# os.chdir at the top of the notebook). It deliberately does not use
# `base_dir`: other cells rebind that name to 'RECOLA Ranking Algorithms',
# which would duplicate the folder component here depending on cell order.
filepath = os.path.join('RECOLA Ranking Algorithms', 'RECOLA_Intervals_Data', 'ArousalValenceTimeSeries_Quartiles.csv')
if os.path.exists(filepath):
    process_data(filepath)
else:
    print(f"File {filepath} not found. Please ensure it exists in the specified path.")




Results for arousal saved in RECOLA Ranking Algorithms/Evaluation/Ordinal Logistic Regression\arousal_evaluation_results.csv.




Results for valence saved in RECOLA Ranking Algorithms/Evaluation/Ordinal Logistic Regression\valence_evaluation_results.csv.




## Regression

In [4]:
def concordance_correlation_coefficient(y_true, y_pred):
    """Return Lin's concordance correlation coefficient of two series.

    Computed as ``2 * cov / (var_true + var_pred + (mean_true - mean_pred)**2)``
    using population (``ddof=0``) moments. This equals the
    ``2 * r * sd_true * sd_pred`` formulation, but it does not go through
    Pearson's r, so a constant series gives ``0.0`` instead of NaN.

    Parameters
    ----------
    y_true, y_pred : array-like
        Equal-length numeric sequences.

    Returns
    -------
    float
        The CCC, or NaN when both series are constant and identical
        (denominator of zero, coefficient undefined).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mean_true, mean_pred = np.mean(y_true), np.mean(y_pred)
    var_true, var_pred = np.var(y_true), np.var(y_pred)
    covariance = np.mean((y_true - mean_true) * (y_pred - mean_pred))
    denominator = var_true + var_pred + (mean_true - mean_pred) ** 2
    if denominator == 0:
        # Both series are the same constant: agreement is undefined.
        return float('nan')
    return 2 * covariance / denominator

def evaluate_individual_performance( Y_test, group_labels, predictions):
    """Compute PCC, CCC and Kendall's tau per participant for one test fold.

    Parameters
    ----------
    Y_test : numpy.ndarray
        Ground-truth targets of the test rows.
    group_labels : numpy.ndarray
        Participant identifier for each test row.
    predictions : numpy.ndarray
        Model predictions aligned with ``Y_test``.

    Returns
    -------
    list of dict
        One dict per participant with keys ``Participant ID``, ``PCC``,
        ``CCC`` and ``KendallTau``.
    """
    rows = []
    for pid in np.unique(group_labels):
        member = group_labels == pid
        truth = Y_test[member]
        predicted = predictions[member]

        pcc, _ = pearsonr(truth, predicted)
        ccc = concordance_correlation_coefficient(truth, predicted)
        tau, _ = kendalltau(truth, predicted)

        rows.append({
            'Participant ID': pid,
            'PCC': pcc,
            'CCC': ccc,
            'KendallTau': tau,
        })
    return rows

def process_target(df, features, continuous_target, participant_ids, group_kfold, model_type):
    """Cross-validate one regression model and collect per-participant scores.

    For every split produced by ``group_kfold`` (grouped by
    ``participant_ids``), a fresh model of the requested type is fitted on the
    training rows. Its predictions on the test rows are scored per participant
    by ``evaluate_individual_performance``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing ``features``, ``continuous_target`` and
        ``participant_id``.
    features : list of str
        Predictor column names.
    continuous_target : str
        Column holding the continuous target.
    participant_ids : array-like
        Group label per row, passed to the splitter.
    group_kfold : splitter exposing ``split(X, y, groups=...)``
    model_type : str
        One of ``'linear'``, ``'random_forest'`` or ``'mlp'``.

    Returns
    -------
    list of dict
        Per-participant metric rows from all folds.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names. Previously an
        unknown name silently produced an empty result list.
    """
    # Factories rather than instances, so that each fold trains a new,
    # unfitted estimator.
    model_factories = {
        'linear': lambda: LinearRegression(),
        'random_forest': lambda: RandomForestRegressor(random_state=0),
        'mlp': lambda: MLPRegressor(hidden_layer_sizes=(64, 32), random_state=1, max_iter=1000),
    }
    if model_type not in model_factories:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {sorted(model_factories)}."
        )
    make_model = model_factories[model_type]

    all_results = []
    X = df[features].values
    Y = df[continuous_target].values

    for train_index, test_index in group_kfold.split(X, Y, groups=participant_ids):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        group_labels_test = df['participant_id'].iloc[test_index].values

        reg = make_model().fit(X_train, Y_train)
        predictions = reg.predict(X_test)
        all_results.extend(evaluate_individual_performance(Y_test, group_labels_test, predictions))

    return all_results


def process_data(filepath):
    """Evaluate the three regression baselines on arousal and valence.

    Reads the CSV at ``filepath`` and, for each target (median arousal, median
    valence) and each model (linear, random forest, MLP), runs
    ``process_target`` with a 10-fold ``GroupKFold`` grouped by
    ``participant_id``. Each set of per-participant results is written to
    ``<model>_regression_evaluation_<target>.csv`` under
    ``RECOLA Ranking Algorithms/Evaluation/Regression``.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    """
    df = pd.read_csv(filepath)
    group_kfold = GroupKFold(n_splits=10)
    participant_ids = df['participant_id'].values
    non_feature_columns = {'participant_id','median_arousal','median_valence','time_window','time in seconds','FM1 _x', 'FM2 _x', 'FM3 _x', 'FF1 _x', 'FF2 _x', 'FF3_x','FM1 _y', 'FM2 _y', 'FM3 _y', 'FF1 _y', 'FF2 _y', 'FF3_y'}
    features = [col for col in df.columns if col not in non_feature_columns]

    # Built with os.path.join: the original backslash-separated f-string
    # relied on invalid escape sequences and only worked on Windows.
    output_dir = os.path.join('RECOLA Ranking Algorithms', 'Evaluation', 'Regression')
    os.makedirs(output_dir, exist_ok=True)

    for target_col, target_name in [('median_arousal', 'arousal'), ('median_valence', 'valence')]:
        for model in ['linear', 'random_forest', 'mlp']:
            results = process_target(df, features, target_col, participant_ids, group_kfold, model)
            output_path = os.path.join(output_dir, f'{model}_regression_evaluation_{target_name}.csv')
            pd.DataFrame(results).to_csv(output_path, index=False)


# Entry point for the regression baselines.
# The path is relative to the working directory (the project root after the
# os.chdir at the top of the notebook). It deliberately does not use
# `base_dir`: other cells rebind that name to 'RECOLA Ranking Algorithms',
# which would duplicate the folder component here depending on cell order.
filepath = os.path.join('RECOLA Ranking Algorithms', 'RECOLA_Intervals_Data', 'ArousalValenceTimeSeries.csv')
if os.path.exists(filepath):
    process_data(filepath)
else:
    print(f"File {filepath} not found. Please ensure it exists in the specified path.")
