# Set Parameters

In [52]:
# Patient identifiers. PAT_NOW names the per-patient output folders and the
# behavior-label sheet; PAT_SHORT_NAME names result folders and saved variables.
PAT_NOW = "S21_160"
PAT_SHORT_NAME = "S_160"

# Spreadsheets: mood reports and behavioral annotations.
MOOD_TRACKING_SHEET_PATH = '/home/klab/NAS/Analysis/AudioFacialEEG/Behavioral Labeling/Mood_Tracking.xlsx'

BEHAVIORAL_LABELS_SHEET_PATH = '/home/klab/NAS/Analysis/AudioFacialEEG/Behavioral Labeling/Behavior_Labeling.xlsx'

# PAT_SHORT_NAME[1:] keeps the underscore, e.g. "S_160" -> "_160".
VIDEO_TIMESTAMPS_SHEET_PATH = f'/home/klab/NAS/Analysis/AudioFacialEEG/Behavioral Labeling/videoDateTimes/VideoDatetimes{PAT_SHORT_NAME[1:]}.xlsx'

# Per-patient pipeline outputs.
OPENFACE_OUTPUT_DIRECTORY = f'/home/klab/NAS/Analysis/outputs_OpenFace/{PAT_NOW}/'
COMBINED_OUTPUT_DIRECTORY = f'/home/klab/NAS/Analysis/outputs_Combined/{PAT_NOW}/'

# Locations this notebook writes to (all end with a trailing slash).
RUNTIME_VAR_PATH = '/home/klab/NAS/Analysis/AudioFacialEEG/Runtime_Vars/'
RESULTS_PATH_BASE = f'/home/klab/NAS/Analysis/AudioFacialEEG/Results/{PAT_SHORT_NAME}/'
FEATURE_VIS_PATH = f'/home/klab/NAS/Analysis/AudioFacialEEG/Feature_Visualization/{PAT_SHORT_NAME}/'
FEATURE_LABEL_PATH = '/home/klab/NAS/Analysis/AudioFacialEEG/Feature_Labels/'
QC_PATH = '/home/klab/NAS/Analysis/AudioFacialEEG/Quality_Control/'

In [53]:
# Selects the AU --> emotion mapping variant (options listed below).
EMO_FEATURE_SETTING = 2

# 0 - Our custom AU --> Emotions mapping, with all emotions
# 1 - Our custom AU --> Emotions mapping, with just OpenDBM's emotions
# 2 - OpenDBM's AU --> Emotions mapping

In [54]:
# Selects which summary-statistic feature set is computed (options listed below).
STATS_FEATURE_SETTING = 3

# 0 - Our new features (including autocorrelation, kurtosis, etc.)
# 1 - Our new features, excluding extras like autocorrelation and kurtosis
# 2 - Just pres_pct
# 3 - Our new features, excluding extras. AUs are NOT thresholded before computing metrics.
#     HSE gets 5 event features. OGAU gets number of events and presence percent.

In [55]:
# Toggles time-series normalization (options listed below).
NORMALIZE_DATA = 0

# 0 - No time series normalization
# 1 - Time series normalization, applied separately to each time window

# Installs & Setup

In [56]:

import pandas as pd
import numpy as np
import os



In [57]:
import warnings
import pandas as pd

# Ignore all warnings: turn off pandas' chained-assignment check and install a
# blanket 'ignore' filter for every Python warning raised from here on.
pd.set_option('mode.chained_assignment', None)
warnings.filterwarnings('ignore')


# Runtime Variables

In [58]:
# SAVE VARIABLES
import pickle


def get_var_name(our_variable):
    """Return the first module-level name bound to ``our_variable``.

    Globals are scanned in dictionary order and compared by identity (``is``),
    so two names bound to the same object resolve to whichever comes first.
    Returns ``None`` when no global refers to the object.
    """
    matching_names = (
        name for name, candidate in globals().items() if candidate is our_variable
    )
    return next(matching_names, None)

# Save a variable to a file using pickle
def save_var(our_variable, RUNTIME_VAR_PATH=RUNTIME_VAR_PATH, forced_name=None):
    """Pickle ``our_variable`` to ``<RUNTIME_VAR_PATH>/<name>.pkl``.

    Args:
        our_variable: Any picklable object.
        RUNTIME_VAR_PATH: Directory that receives the pickle file.
        forced_name: File stem to use. When omitted, the stem is the global
            name found by ``get_var_name``.

    Raises:
        ValueError: If ``forced_name`` is omitted and no global name refers to
            the object (previously this silently wrote 'None.pkl').
    """
    name_now = get_var_name(our_variable) if forced_name is None else forced_name
    if name_now is None:
        raise ValueError(
            "Could not infer a variable name for the object; pass forced_name explicitly."
        )

    # os.path.join works whether or not the directory ends with a separator.
    with open(os.path.join(RUNTIME_VAR_PATH, f'{name_now}.pkl'), 'wb') as file:
        pickle.dump(our_variable, file)

def load_var(variable_name, RUNTIME_VAR_PATH=RUNTIME_VAR_PATH):
    """Load and return the object pickled as ``<RUNTIME_VAR_PATH>/<variable_name>.pkl``.

    Args:
        variable_name: File stem, without the '.pkl' extension.
        RUNTIME_VAR_PATH: Directory containing the pickle file.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code. Only load files that
    # were written by this notebook's save_var.
    # os.path.join works whether or not the directory ends with a separator.
    with open(os.path.join(RUNTIME_VAR_PATH, f'{variable_name}.pkl'), 'rb') as file:
        return pickle.load(file)


# Video Timestamps

In [59]:
# Load the per-video start/end times. The sheet name ends with the part of
# PAT_SHORT_NAME after the last underscore (e.g. "S_160" -> "VideoDatetimes_160").
df_videoTimestamps = pd.read_excel(VIDEO_TIMESTAMPS_SHEET_PATH, sheet_name=f'VideoDatetimes_{PAT_SHORT_NAME.split("_")[-1]}')
# Remove the literal '.m2t' extension. regex=False is explicit because the
# default differs between pandas versions, and as a regex '.' matches any character.
df_videoTimestamps['Filename'] = df_videoTimestamps['Filename'].str.replace('.m2t', '', regex=False)

if PAT_SHORT_NAME == 'S_199':
  # There's no H01 video, so let's drop that filename (row label 211 in this sheet)
  df_videoTimestamps = df_videoTimestamps.drop(211)

In [60]:
# Check for any missing videos!

def print_difference(list1, list2):
    """Print, one per line and in order, every item of ``list1`` missing from ``list2``."""
    absent_items = [entry for entry in list1 if entry not in list2]
    for entry in absent_items:
        print(entry)

# Names expected from the timestamp sheet versus names present on disk. Each
# entry of COMBINED_OUTPUT_DIRECTORY has its last four characters removed
# before comparison. Anything printed below is in the sheet but not on disk.
filenames_master_list = list(df_videoTimestamps['Filename'].values)
filenames_we_have = [entry_name[:-4] for entry_name in os.listdir(COMBINED_OUTPUT_DIRECTORY)]

print_difference(filenames_master_list, filenames_we_have)

18926Y00
18926Z00


In [61]:
df_videoTimestamps[-5:]

Unnamed: 0,Filename,VideoStart,VideoEnd
229,1892DB00,2021-03-01 05:53:51,2021-03-01 06:53:47
230,1892DC00,2021-03-01 06:53:51,2021-03-01 07:53:47
231,1892DD00,2021-03-01 07:53:51,2021-03-01 08:53:47
232,1892DE00,2021-03-01 08:53:51,2021-03-01 09:53:46
233,1892DJ00,2021-03-01 13:53:51,2021-03-01 14:36:00


# Danny's Labels (Smile, Laugh, etc.)

In [62]:
columns_to_keep = ['Filename', 'Time Start', 'Time End', 'Behavior']  # List of columns to keep

# Read this patient's annotation sheet (filenames forced to text), keep the
# four label columns, lower-case the behavior text and discard incomplete rows.
# Written as one chain so no intermediate slice is mutated in place.
df = (
    pd.read_excel(BEHAVIORAL_LABELS_SHEET_PATH, sheet_name=PAT_NOW, dtype={'Filename': str})[columns_to_keep]
    .assign(Behavior=lambda frame: frame['Behavior'].str.lower())
    .dropna(how='any')
)


# Function to correct the time format
def correct_time_format(time_str):
    """Parse a label time into a ``datetime.time``.

    The value is converted with ``str`` and stripped, then parsed as
    "HH:MM:SS"; if that fails it is parsed as "MM:SS" / "M:SS", which is
    treated as minutes and seconds within hour zero.

    Args:
        time_str: Time text (or any object whose ``str`` form is such text).

    Returns:
        datetime.time: The parsed time of day.

    Raises:
        ValueError: If the text matches neither format. The message carries
            the offending value so the bad spreadsheet cell can be found.
    """
    time_str = str(time_str).strip()

    for time_format in ('%H:%M:%S', '%M:%S'):
        parsed = pd.to_datetime(time_str, format=time_format, errors='coerce')
        if not pd.isna(parsed):
            # A successful "%M:%S" parse leaves the hour at 0, i.e. "00:MM:SS",
            # so no second parse with a "00:" prefix is needed.
            return parsed.time()

    raise ValueError(f"Invalid time format: {time_str!r}")


# Apply the correction to 'Time Start' and 'Time End'; both columns end up
# holding datetime.time objects.
df = df.assign(**{
    'Time Start': df['Time Start'].apply(correct_time_format),
    'Time End': df['Time End'].apply(correct_time_format),
})


Danny_Labels = df

In [63]:
import re

def _normalize_time_text(value):
    """Strip spaces from time text and zero-pad a lone digit between two colons.

    Example: "0: 5:16" -> "0:05:16". Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    return re.sub(r'(?<=:)(\d)(?=:)', r'0\1', value.replace(' ', ''))


# Drop incomplete rows, then normalise every column whose name contains 'Time'.
# Series.map is used per column because DataFrame.applymap is deprecated in
# recent pandas releases.
# NOTE(review): if correct_time_format has already been applied, these columns
# hold datetime.time objects and this pass presumably changes nothing - confirm.
Danny_Labels.dropna(how='any', inplace=True)
_time_columns = Danny_Labels.columns[Danny_Labels.columns.str.contains('Time')]
Danny_Labels[_time_columns] = Danny_Labels[_time_columns].apply(lambda column: column.map(_normalize_time_text))


In [64]:
Danny_Labels

Unnamed: 0,Filename,Time Start,Time End,Behavior
0,18927100,00:27:30,00:29:30,"phlebotomy draw, discomfort"
1,18927200,00:42:51,00:42:58,"grimace, discomfort"
2,18927200,00:43:15,00:43:46,"grimace, discomfort"
3,18927200,00:50:03,00:50:07,"grimace, discomfort"
4,18927R00,00:25:25,00:25:27,"said ow, starting to hurt, discomfort"
5,18927A00,00:05:16,00:05:53,"phlebotomy draw, discomfort"
6,18927Y00,00:14:13,00:14:55,"phlebotomy draw, discomfort"
7,18928N00,00:27:02,00:27:36,"phlebotomy draw, discomfort"
8,18928N00,00:27:37,00:27:42,smile with chuckle
9,18929B00,00:31:58,00:33:32,"phlebotomy draw, discomfort"


In [65]:
def filter_df_by_behavior(df, desired_string):
    """Return the rows of ``df`` whose 'Behavior' text contains ``desired_string``.

    Matching is delegated to ``Series.str.contains`` with its default
    settings. The input frame is not modified; the result is an independent
    frame whose index is renumbered from zero.
    """
    has_behavior = df['Behavior'].str.contains(desired_string)
    return df.copy()[has_behavior].reset_index(drop=True)

In [66]:
import datetime


def add_time_strings(t1, t2):
    """Add two durations and return the sum as text.

    Each argument is anything ``pd.to_timedelta`` accepts. The result is the
    ``str`` form of the summed pandas Timedelta, e.g. '0 days 00:01:30'.
    """
    first_seconds = pd.to_timedelta(t1).total_seconds()
    second_seconds = pd.to_timedelta(t2).total_seconds()
    combined = pd.to_timedelta(first_seconds + second_seconds, unit='s')
    return str(combined)


def convert_time(df1, df2):
    """Convert the 'Time Start' / 'Time End' values of ``df1`` into Timestamps.

    For each row, the calendar date comes from the 'VideoStart' entry of
    ``df2`` whose 'Filename' matches the row's filename, and the row's time
    value is added to midnight of that date.

    NOTE(review): only the *date* of VideoStart is used, not its time of day.
    If the label times are offsets from the beginning of each video rather
    than wall-clock times, the resulting Timestamps are shifted - confirm
    with whoever produced the labels.

    Args:
        df1: Label frame with 'Filename', 'Time Start' and 'Time End'.
        df2: Video frame with 'Filename' and 'VideoStart'.

    Returns:
        ``df1`` itself when it is empty; otherwise a copy whose two time
        columns hold Timestamps, with a missing value wherever the filename
        is unknown or the conversion fails.
    """
    if df1.empty:
        return df1

    start_by_filename = dict(zip(df2['Filename'], df2['VideoStart']))

    def handle_time_conversion(row, time_field):
        video_start = start_by_filename.get(str(row['Filename']), None)
        if video_start is None:
            return None

        try:
            # Only the calendar date of the video start is needed.
            if isinstance(video_start, datetime.datetime):
                base_date = video_start.date()
            else:
                base_date = pd.to_datetime(video_start).date()
        except Exception:
            return None

        time_value = row[time_field]
        if time_value is None:
            return None

        try:
            # datetime.time objects are rendered as text so to_timedelta accepts them.
            if isinstance(time_value, datetime.time):
                offset_text = time_value.strftime('%H:%M:%S')
            else:
                offset_text = time_value
            return pd.Timestamp(base_date) + pd.to_timedelta(offset_text)
        except Exception:
            return None

    modified_df = df1.copy()
    for time_field in ('Time Start', 'Time End'):
        modified_df[time_field] = modified_df.apply(handle_time_conversion, axis=1, args=(time_field,))

    return modified_df





In [67]:
import pandas as pd

def _valid_event_intervals(events_df):
    """Return a new frame of parsed ('Time Start', 'Time End') pairs with no missing ends."""
    if events_df is None or events_df.empty:
        return pd.DataFrame(columns=['Time Start', 'Time End'])
    intervals = pd.DataFrame({
        'Time Start': pd.to_datetime(events_df['Time Start'], errors='coerce'),
        'Time End': pd.to_datetime(events_df['Time End'], errors='coerce'),
    })
    return intervals.dropna(subset=['Time Start', 'Time End'])


def buffer_neither(smiles_df, sleep_df):
    """List the 10-second grid times that are at least a minute clear of every event.

    A grid of timestamps, 10 seconds apart, is laid from the earliest
    'Time Start' to the latest 'Time End' across both inputs. A grid time is
    dropped when any event interval, widened by one minute on each side,
    contains it.

    Args:
        smiles_df: Frame with 'Time Start' / 'Time End' for the target events.
        sleep_df: Frame with the same columns for intervals to exclude; may
            be empty or ``None``.

    Returns:
        A Series named 'Time' holding the remaining timestamps, index
        renumbered from zero. When neither input has a usable interval, an
        empty DataFrame with a single 'Time' column is returned instead.

    Neither input frame is modified; unparseable times are ignored.
    """
    smiles = _valid_event_intervals(smiles_df)
    sleep = _valid_event_intervals(sleep_df)

    if smiles.empty and sleep.empty:
        return pd.DataFrame(columns=['Time'])  # Return empty dataframe if no data available

    # At least one frame is non-empty and NaT-free, so min/max are well defined.
    earliest_start = min(smiles['Time Start'].tolist() + sleep['Time Start'].tolist())
    latest_end = max(smiles['Time End'].tolist() + sleep['Time End'].tolist())

    # A Timedelta frequency avoids the version-dependent 'S'/'s' offset aliases.
    candidate_times = pd.Series(
        pd.date_range(start=earliest_start, end=latest_end, freq=pd.Timedelta(seconds=10))
    )

    # start <= t + 1min and end >= t - 1min  <=>  start - 1min <= t <= end + 1min,
    # so each event can flag all nearby grid times in one vectorised comparison.
    buffer = pd.Timedelta(minutes=1)
    near_event = pd.Series(False, index=candidate_times.index, dtype=bool)
    for intervals in (smiles, sleep):
        for event_start, event_end in zip(intervals['Time Start'], intervals['Time End']):
            near_event |= (candidate_times >= event_start - buffer) & (candidate_times <= event_end + buffer)

    return candidate_times[~near_event].rename('Time').reset_index(drop=True)


def create_event_detection_df(smiles_df, safe_series, random_state=None):
    """Build a class-balanced table of positive and negative timestamps.

    Positives: for every row of ``smiles_df``, one timestamp per second in the
    half-open interval (Time Start, Time End]. Negatives: timestamps drawn
    from ``safe_series``. Both classes are sampled without replacement down
    to the size of the smaller one.

    Args:
        smiles_df: Frame with Timestamp columns 'Time Start' and 'Time End'.
        safe_series: Series of candidate negative timestamps. The empty
            one-column ('Time') DataFrame that ``buffer_neither`` can return
            is also accepted.
        random_state: Optional seed forwarded to both ``sample`` calls. The
            default ``None`` keeps the draw random on every call.

    Returns:
        DataFrame with columns 'Datetime' and 'EventDetected' (1 = event,
        0 = non-event), sorted by 'Datetime' with a fresh index.
    """
    # Gather every positive timestamp first and build the frame once;
    # concatenating row by row is quadratic in the number of seconds.
    event_times = []
    if not smiles_df.empty:
        for event_start, event_end in zip(smiles_df['Time Start'], smiles_df['Time End']):
            # A Timedelta frequency avoids the version-dependent 'S'/'s' aliases.
            event_times.extend(
                pd.date_range(start=event_start, end=event_end,
                              freq=pd.Timedelta(seconds=1), inclusive='right')
            )
    event_detection_df = pd.DataFrame({'Datetime': event_times, 'EventDetected': [1] * len(event_times)})

    if isinstance(safe_series, pd.DataFrame) and 'Time' in safe_series.columns:
        safe_series = safe_series['Time']

    num_of_each_class = min(len(safe_series), len(event_detection_df))

    # Randomly sample from the buffer safe Series, then from the events
    # (same order of draws as before, so a globally seeded RNG behaves alike).
    sampled_safe_series = safe_series.sample(n=num_of_each_class, replace=False, random_state=random_state)
    event_detection_df = event_detection_df.sample(
        n=num_of_each_class, replace=False, random_state=random_state
    ).reset_index(drop=True)

    nonsmile_nonsleep_df = pd.DataFrame({
        'Datetime': sampled_safe_series.reset_index(drop=True),
        'EventDetected': 0,
    })
    event_detection_df = pd.concat([event_detection_df, nonsmile_nonsleep_df], ignore_index=True)

    return event_detection_df.sort_values(by='Datetime').reset_index(drop=True)

def get_labels(smile_string, sleep_string, min_events=5):
    """Build the balanced label table (Datetime, EventDetected) for one behavior.

    Reads the globals ``Danny_Labels`` and ``df_videoTimestamps``, so both
    must be loaded before calling.

    Args:
        smile_string: Text identifying the behavior to detect (it need not be
            a smile; any label text works).
        sleep_string: Text identifying intervals that count as neither event
            nor non-event; times near them are kept out of the negative class.
        min_events: Minimum number of labelled events needed to return
            labels. Defaults to 5, the previously hard-coded threshold.

    Returns:
        The frame produced by ``create_event_detection_df``, or an empty
        DataFrame when fewer than ``min_events`` events are labelled.
    """
    Smile_Labels = filter_df_by_behavior(Danny_Labels, smile_string)
    smiles_df = convert_time(Smile_Labels, df_videoTimestamps)

    # We need at least 5 events (probably even more) to be able to train a model.
    # Checked before the buffer computation so rare behaviors return immediately.
    if len(smiles_df) < min_events:
        return pd.DataFrame()

    Sleep_Labels = filter_df_by_behavior(Danny_Labels, sleep_string)
    sleep_df = convert_time(Sleep_Labels, df_videoTimestamps)

    non_smile_non_sleep_times = buffer_neither(smiles_df, sleep_df)

    return create_event_detection_df(smiles_df, non_smile_non_sleep_times)




In [68]:
# NOTE: each labelled timestamp marks the END of a time window; the window covers the 1 s preceding it.

# TODO: eventually run this 500 times to get a range of AUROC values.
# (The class-balancing sample inside get_labels is unseeded, so every call draws a different subset.)

Final_Smile_Labels = get_labels('smile', 'sleep')

In [69]:
Final_Smile_Labels

Unnamed: 0,Datetime,EventDetected
0,2021-02-22 00:27:38,1
1,2021-02-22 00:27:39,1
2,2021-02-22 00:27:40,1
3,2021-02-22 00:27:41,1
4,2021-02-22 00:27:42,1
...,...,...
139,2021-03-01 00:13:47,1
140,2021-03-01 00:13:48,1
141,2021-03-01 00:13:49,1
142,2021-03-01 00:13:50,1


In [70]:
# Yawn labels; times near 'sleep' intervals are kept out of the negative class.
Final_Yawn_Labels = get_labels('yawn', 'sleep')

In [71]:
Final_Yawn_Labels

Unnamed: 0,Datetime,EventDetected
0,2021-02-26 00:02:02,1
1,2021-02-26 00:02:03,1
2,2021-02-26 00:02:04,1
3,2021-02-26 00:02:05,1
4,2021-02-26 00:02:06,1
...,...,...
83,2021-03-01 00:14:03,1
84,2021-03-01 00:14:04,1
85,2021-03-01 00:14:05,1
86,2021-03-01 00:14:06,1


In [72]:
# Discomfort labels; times near 'sleep' intervals are kept out of the negative class.
Final_Discomfort_Labels = get_labels('discomfort', 'sleep')

In [73]:
Final_Discomfort_Labels

Unnamed: 0,Datetime,EventDetected
0,2021-02-19 00:27:31,1
1,2021-02-19 00:27:32,1
2,2021-02-19 00:27:33,1
3,2021-02-19 00:27:34,1
4,2021-02-19 00:27:35,1
...,...,...
1725,2021-03-01 00:12:20,1
1726,2021-03-01 00:12:21,1
1727,2021-03-01 00:12:22,1
1728,2021-03-01 00:12:23,1


In [74]:
# Sad labels; get_labels returns an empty DataFrame when fewer than 5 events are labelled.
Final_Sad_Labels = get_labels('sad', 'sleep')

In [75]:
Final_Sad_Labels

# OpenFace

In [81]:
# DICTIONARY OF SEPARATE DFS

def get_dict_openface(output_dir):
    """Load every OpenFace CSV in ``output_dir`` into a dict keyed by file stem.

    For each '*.csv' file (in sorted order): read it, strip whitespace from
    the column names, keep the rows whose index is a multiple of 6, and keep
    only the wanted columns that are present ('frame', 'timestamp',
    'success', then the AU intensity '_r' and presence '_c' columns).

    Returns:
        dict mapping the file name without '.csv' to its DataFrame.
    """
    au_codes = ['01', '02', '04', '05', '06', '07', '09', '10', '12',
                '14', '15', '17', '20', '23', '25', '26', '45']
    columns_to_keep = (
        ['frame', 'timestamp', 'success']
        + [f'AU{code}_r' for code in au_codes]
        + [f'AU{code}_c' for code in au_codes]
    )

    # Keep every 6th row such that it's 5 fps!
    # (assumes the source CSV has one row per frame at 30 fps - TODO confirm)
    row_step = 6

    frames_by_video = {}
    for csv_name in sorted(name for name in os.listdir(output_dir) if name.endswith('.csv')):
        frame_table = pd.read_csv(os.path.join(output_dir, csv_name))

        # Column names can carry leading or trailing spaces.
        frame_table.columns = frame_table.columns.str.strip()

        on_step = frame_table.index % row_step == 0
        present_columns = [col for col in columns_to_keep if col in frame_table.columns]

        # csv_name[:-4] removes the '.csv' suffix.
        frames_by_video[csv_name[:-4]] = frame_table.loc[on_step, present_columns]

    return frames_by_video


def only_successful_frames(df):
    """Return the rows of ``df`` whose 'success' column equals 1."""
    detection_succeeded = df['success'] == 1
    return df[detection_succeeded]

def apply_function_to_dict(dictionary, func, **kwargs):
    """
    Run ``func`` on every value of ``dictionary`` and collect the results.

    Args:
        dictionary (dict): Mapping of keys to DataFrames.
        func (function): Callable invoked as ``func(value, **kwargs)``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        dict: A new dictionary with the same keys, in the same order, holding
        the results. The input dictionary itself is not modified.
    """
    transformed = {}
    for key, frame in dictionary.items():
        transformed[key] = func(frame, **kwargs)
    return transformed

In [None]:
# Load this patient's OpenFace CSVs and keep only the frames flagged as successful.
dfs_openface = apply_function_to_dict(
    get_dict_openface(OPENFACE_OUTPUT_DIRECTORY),
    only_successful_frames,
)

In [None]:
# SAVE THE OPENFACE DICTIONARY
# Pickled into RUNTIME_VAR_PATH with the file stem 'dfs_openface_<PAT_SHORT_NAME>'.

save_var(dfs_openface, forced_name=f'dfs_openface_{PAT_SHORT_NAME}')

In [None]:
# # LOAD THE OPENFACE DICTIONARY

# dfs_openface = load_var(f'dfs_openface_{PAT_SHORT_NAME}')


# HSEmotion & OpenGraphAU

In [None]:
import pandas as pd
import os

def get_dict(output_dir, file_now='outputs_hse.csv', filterOutLR=True):
    """Load one CSV per video subfolder of ``output_dir`` into a dict.

    Subfolders are visited in sorted order. Entries that are not directories,
    and directories that do not contain ``file_now``, are skipped.

    Args:
        output_dir: Directory containing one subfolder per video.
        file_now: Name of the CSV to read inside each subfolder.
        filterOutLR: When True, drop columns whose names start with 'AUL' or
            'AUR' (OpenGraphAU's left/right columns).

    Returns:
        dict mapping the subfolder name without its extension (e.g. '.mp4')
        to the loaded DataFrame. A CSV that cannot be read yields an empty
        frame with the fallback columns listed below.
    """
    fallback_columns = ['frame', 'timestamp', 'success', 'AU1', 'AU2', 'AU4', 'AU5', 'AU6', 'AU7', 'AU9',
                        'AU10', 'AU11', 'AU12', 'AU13', 'AU14', 'AU15', 'AU16', 'AU17', 'AU18',
                        'AU19', 'AU20', 'AU22', 'AU23', 'AU24', 'AU25', 'AU26', 'AU27', 'AU32',
                        'AU38', 'AU39']

    df_dict = {}

    for subfolder_name in sorted(os.listdir(output_dir)):
        subfolder_path = os.path.join(output_dir, subfolder_name)
        if not os.path.isdir(subfolder_path):
            continue

        csv_file_path = os.path.join(subfolder_path, file_now)
        if not os.path.isfile(csv_file_path):
            continue

        try:
            df_temp = pd.read_csv(csv_file_path)
        except Exception:
            # Best effort: an unreadable (e.g. zero-byte) CSV becomes an empty
            # frame. 'except Exception' rather than a bare 'except' so that
            # KeyboardInterrupt / SystemExit still stop the run.
            df_temp = pd.DataFrame(columns=fallback_columns)

        # OpenGraphAU - we are filtering out L and R!
        if filterOutLR:
            df_temp = df_temp.filter(regex='^(?!AUL|AUR)')

        # Drop the extension ('.mp4') from the folder name. splitext leaves a
        # name without an extension intact, where [:-4] would chop off four
        # real characters.
        df_dict[os.path.splitext(subfolder_name)[0]] = df_temp

    return df_dict

def create_binary_columns(df, threshold):
    """Add a thresholded 0/1 column for every column whose name starts with 'AU'.

    Each such column ``X`` is renamed to ``X_r`` and a new column ``X_c`` is
    appended, holding 1 where the original value is >= ``threshold`` and 0
    otherwise (missing values compare as False, so they give 0).

    Args:
        df: Input frame; it is not modified.
        threshold: Cut-off applied to every AU column.

    Returns:
        A new DataFrame: the original columns (AU ones renamed with '_r')
        followed by the '_c' columns in the same order.
    """
    au_columns = [col for col in df.columns if col.startswith('AU')]

    # Rename once up front, then append the classification columns using a
    # vectorised comparison instead of a per-element apply.
    df_copy = df.rename(columns={col: col + '_r' for col in au_columns})
    for col in au_columns:
        df_copy[col + '_c'] = (df[col] >= threshold).astype('int64')
    return df_copy

def remove_columns_ending_with_r(df):
    """Return a copy of ``df`` without the columns whose names end in '_r'."""
    return df.drop(columns=[name for name in df.columns if name.endswith('_r')])


def only_successful_frames(df):
    """Return the rows of ``df`` whose 'success' column equals 1.

    NOTE(review): this function is also defined in the OpenFace section,
    presumably so this section can run on its own - consider consolidating.
    """
    detection_succeeded = df['success'] == 1
    return df[detection_succeeded]


def apply_function_to_dict(dictionary, func, **kwargs):
    """
    Run ``func`` on every value of ``dictionary`` and collect the results.

    NOTE(review): this function is also defined in the OpenFace section,
    presumably so this section can run on its own - consider consolidating.

    Args:
        dictionary (dict): Mapping of keys to DataFrames.
        func (function): Callable invoked as ``func(value, **kwargs)``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        dict: A new dictionary with the same keys, in the same order, holding
        the results. The input dictionary itself is not modified.
    """
    transformed = {}
    for key, frame in dictionary.items():
        transformed[key] = func(frame, **kwargs)
    return transformed

In [None]:
# Load the HSEmotion outputs and keep only the frames flagged as successful.
dfs_hsemotion = apply_function_to_dict(
    get_dict(COMBINED_OUTPUT_DIRECTORY, file_now='outputs_hse.csv'),
    only_successful_frames,
)

## Smile, Yawn, Etc.

In [None]:
# Load the OpenGraphAU outputs (left/right AU columns filtered out by default)
# and keep only the frames flagged as successful.
dfs_opengraphau = apply_function_to_dict(
    get_dict(COMBINED_OUTPUT_DIRECTORY, file_now='outputs_ogau.csv'),
    only_successful_frames,
)


In [None]:
# SAVE DF HSEMOTION
# Pickled into RUNTIME_VAR_PATH with the file stem 'dfs_hsemotion_<PAT_SHORT_NAME>'.
save_var(dfs_hsemotion, forced_name=f'dfs_hsemotion_{PAT_SHORT_NAME}')


In [None]:
# SAVE DF OPENGRAPHAU WITHOUT ANY THRESHOLDING
# Pickled into RUNTIME_VAR_PATH with the file stem 'dfs_opengraphau_smile_<PAT_SHORT_NAME>'.
save_var(dfs_opengraphau, forced_name=f'dfs_opengraphau_smile_{PAT_SHORT_NAME}')

In [None]:
# # LOAD VARS FOR BEHAVIOR PREDICTION
# # NO THRESHOLDING FOR DF OPENGRAPHAU

# dfs_hsemotion = load_var(f'dfs_hsemotion_{PAT_SHORT_NAME}')

# dfs_opengraphau = load_var(f'dfs_opengraphau_smile_{PAT_SHORT_NAME}')

# Select Specific Times

## Setup

In [None]:
def get_data_within_duration(dfs_dict, df_video_timestamps, datetime, duration):
    """Collect the frames recorded in the ``duration`` minutes before ``datetime``.

    Args:
        dfs_dict: dict mapping video filename to a DataFrame whose 'timestamp'
            column is in seconds from the start of that video.
        df_video_timestamps: Frame with 'Filename', 'VideoStart', 'VideoEnd'.
        datetime: Timestamp marking the END of the window. (The parameter name
            shadows the ``datetime`` module inside this function only.)
        duration: Window length in minutes, counted back from ``datetime``.

    Returns:
        One DataFrame with every matching row from every overlapping video,
        'timestamp' converted to absolute datetimes and the 'frame' column
        removed when present. If no video in ``dfs_dict`` overlaps the
        window, a message is printed and an empty DataFrame is returned.
    """
    start_datetime = datetime - pd.Timedelta(minutes=duration)
    end_datetime = datetime

    # Parse the sheet's time columns once instead of once per video lookup.
    video_starts = pd.to_datetime(df_video_timestamps['VideoStart'])
    video_ends = pd.to_datetime(df_video_timestamps['VideoEnd'])
    overlaps_window = (video_ends >= start_datetime) & (video_starts <= end_datetime)

    relevant_dfs = []
    for key in df_video_timestamps.loc[overlaps_window, 'Filename'].values:
        if key not in dfs_dict:
            continue

        # First row carrying this filename supplies the video's start time.
        video_start = video_starts[df_video_timestamps['Filename'] == key].iloc[0]

        # Express the window as seconds relative to the start of this video.
        seconds_in_video = dfs_dict[key]['timestamp']
        time_mask = ((seconds_in_video >= (start_datetime - video_start).total_seconds()) &
                     (seconds_in_video <= (end_datetime - video_start).total_seconds()))

        df = dfs_dict[key].loc[time_mask].copy()
        df['timestamp'] = video_start + pd.to_timedelta(df['timestamp'], unit='s')
        relevant_dfs.append(df)

    if not relevant_dfs:
        print(f"MAJOR ERROR! ZERO RELEVANT DFS!! DATETIME: {datetime}")
        return pd.DataFrame()

    df_combined = pd.concat(relevant_dfs, ignore_index=True, sort=False)
    # errors='ignore': inputs without a 'frame' column are returned unchanged
    # instead of raising KeyError.
    return df_combined.drop(columns='frame', errors='ignore')

def get_radius_dict(TIME_RADIUS_IN_MINUTES, INPUT_DF, df_videoTimestamps, df_moodTracking, takeAll=True):
    """Map each report datetime to the frames recorded just before it.

    Args:
        TIME_RADIUS_IN_MINUTES: Length, in minutes, of the window before each
            report datetime.
        INPUT_DF: dict of per-video DataFrames from one pipeline (e.g.
            OpenFace or HSEmotion).
        df_videoTimestamps: Frame with the start/end time of each video.
        df_moodTracking: Frame with one row per report and a 'Datetime'
            column; an 'Anxiety' column is also read when ``takeAll`` is False.
        takeAll: True keeps every row. False keeps only rows whose 'Anxiety'
            value is a string and prints a message for each row skipped.

    Returns:
        dict mapping report Timestamp -> DataFrame of the relevant frames.
    """
    radius_df_dict = {}
    for oneIndex in range(len(df_moodTracking)):
        if not takeAll:
            # Let's make sure there's a value collected!
            val_now = df_moodTracking[oneIndex:oneIndex+1]['Anxiety'][oneIndex]
            if not isinstance(val_now, str):
                # No value collected!
                print('No value for Anxiety for index ', oneIndex, f'corresponding to {get_moodTracking_datetime(oneIndex, df_moodTracking=df_moodTracking)}')
                continue

        dt_now = get_moodTracking_datetime(oneIndex, df_moodTracking=df_moodTracking)
        radius_df_dict[dt_now] = get_data_within_duration(INPUT_DF, df_videoTimestamps, dt_now, TIME_RADIUS_IN_MINUTES)
    return radius_df_dict

def generate_number_list(start, interval, count):
    """Return ``count`` evenly spaced values: start, start + interval, start + 2*interval, ..."""
    number_list = []
    for step in range(count):
        number_list.append(start + step * interval)
    return number_list

def get_moodTracking_datetime(index, df_moodTracking):
    """Return the 'Datetime' of row ``index`` as a Timestamp with whole-second precision.

    The value is formatted as text ('%d-%b-%Y %H:%M:%S') and parsed back, so
    any sub-second part is discarded. The row is picked by position and then
    read by label, so the frame's index is assumed to be 0..n-1.
    """
    one_row = df_moodTracking[index:index+1]['Datetime']
    as_text = pd.to_datetime(one_row).dt.strftime('%d-%b-%Y %H:%M:%S')
    reparsed = pd.to_datetime(as_text)
    return pd.Timestamp(reparsed[index])

# LogReg Mapping (Smile, Yawn, etc.)

## Prep

In [None]:
def only_successful_frames(df):
    """Return the rows of ``df`` whose 'success' column equals 1.

    NOTE(review): this function is also defined in earlier sections,
    presumably so this section can run on its own - consider consolidating.
    """
    detection_succeeded = df['success'] == 1
    return df[detection_succeeded]


def clean_data(pipeline_emotion, labels):
    """Drop empty frames from ``pipeline_emotion`` and the matching label rows.

    Side effects (both inputs are modified in place):
      * ``labels['Datetime']`` is converted to datetimes;
      * keys whose DataFrame is empty are deleted from ``pipeline_emotion``.

    Returns:
        tuple: the same ``pipeline_emotion`` dict, and a filtered view of
        ``labels`` without the rows whose 'Datetime' equals a removed key.
    """
    # Convert the Datetime column so it can be compared against the dict keys
    labels['Datetime'] = pd.to_datetime(labels['Datetime'])

    # Collect first, delete afterwards: a dict cannot shrink while iterated.
    empty_keys = [key for key, frame in pipeline_emotion.items() if frame.empty]
    for key in empty_keys:
        del pipeline_emotion[key]

    labels_kept = labels[~labels['Datetime'].isin(empty_keys)]

    return pipeline_emotion, labels_kept

def preprocess_df_radius_dict(df_radius_dict, labels_now, columns_to_keep):
    """Keep successful frames and the chosen columns for every non-empty entry.

    ``clean_data`` is applied first, so empty frames are removed from
    ``df_radius_dict`` in place and ``labels_now['Datetime']`` is converted to
    datetimes. The filtered labels it returns are not used here.

    Returns:
        dict with the surviving keys, each mapped to the rows with
        success == 1 restricted to ``columns_to_keep``.
    """
    cleaned_dict, _filtered_labels = clean_data(df_radius_dict, labels_now)

    return {
        moment: only_successful_frames(frames).loc[:, columns_to_keep]
        for moment, frames in cleaned_dict.items()
    }

In [None]:
def shuffle_labels(df):
    """
    Return a copy of `df` whose 'EventDetected' values are randomly permuted.

    The permutation is drawn with a fixed seed (random_state=42), so repeated
    calls on the same input give the same result. All other columns and the
    index are left exactly as they were; the input frame is not modified.

    Args:
        df (pd.DataFrame): DataFrame containing 'EventDetected' column.

    Returns:
        shuffled_df (pd.DataFrame): A copy of the original DataFrame with shuffled 'EventDetected' column.
            An empty input yields an empty copy.
    """
    shuffled_df = df.copy()
    if df.empty:
        return shuffled_df

    permuted = df['EventDetected'].sample(frac=1, random_state=42)
    # Assign by position, not by label: handing pandas a Series here would
    # re-align it on the index, which undoes the shuffle or produces NaN
    # whenever df does not carry a default 0..n-1 index (e.g. after filtering).
    shuffled_df['EventDetected'] = permuted.to_numpy()
    return shuffled_df

In [None]:
# Smile labels with 'EventDetected' permuted; used by the "Smile Checking shuffled labels" section.
Shuffled_Smile_Labels = shuffle_labels(Final_Smile_Labels)

In [None]:
# Yawn labels with 'EventDetected' permuted; used by the "Yawn Checking shuffled labels" section.
Shuffled_Yawn_Labels = shuffle_labels(Final_Yawn_Labels)

In [None]:
# Discomfort labels with 'EventDetected' permuted; used by the "Discomfort Checking shuffled labels" section.
Shuffled_Discomfort_Labels = shuffle_labels(Final_Discomfort_Labels)

In [None]:
# Sad labels with 'EventDetected' permuted; used by the "Sad Checking shuffled labels" section.
Shuffled_Sad_Labels = shuffle_labels(Final_Sad_Labels)

## Behaviors

In [None]:
# SHORT EVENT DETECTION (SMILE, ETC.)
# Smile: collect, for each pipeline (OpenFace, HSEmotion, OpenGraphAU), the
# frames around every labelled smile event via get_radius_dict.
takeAll = True # we are taking all patient reports

# time radii are given in minutes
# example: 0.01666 is 1 second
TIME_RADIUS_LIST = [0.01666] # JUST one second

# Outer dicts are keyed by the radius rendered as a string (f'{i}', i.e. '0.01666').
# NOTE: these three names are reset and reused by the Yawn/Discomfort/Sad cells below.
openface_radius_dict = {}
hsemotion_radius_dict = {}
opengraphau_radius_dict = {}

# Labels for the behavior currently being processed; the following cells guard on it.
DANNY_LABELS_NOW = Final_Smile_Labels

# With no labelled events the dicts above simply stay empty.
if not(DANNY_LABELS_NOW.empty):
    
    for i in TIME_RADIUS_LIST:
      # NOTE(review): get_radius_dict is defined earlier in the notebook; its
      # exact windowing semantics are not visible here - confirm against it.
      openface_radius_now = get_radius_dict(i, dfs_openface, df_videoTimestamps, DANNY_LABELS_NOW, takeAll=takeAll)
      hsemotion_radius_now = get_radius_dict(i, dfs_hsemotion, df_videoTimestamps, DANNY_LABELS_NOW, takeAll=takeAll)
      opengraphau_radius_now = get_radius_dict(i, dfs_opengraphau, df_videoTimestamps, DANNY_LABELS_NOW, takeAll=takeAll)
    
      openface_radius_dict[f'{i}'] = openface_radius_now
      hsemotion_radius_dict[f'{i}'] = hsemotion_radius_now
      opengraphau_radius_dict[f'{i}'] = opengraphau_radius_now



In [None]:
# SAVE VARIABLES - Smile
# Persists the three radius-keyed dicts and the smile labels under patient-specific
# names. Skipped when there are no smile labels.
# NOTE(review): save_var is defined earlier in the notebook; where and how it
# stores the objects is not visible here.
if not(DANNY_LABELS_NOW.empty):
    save_var(openface_radius_dict, forced_name=f'openface_radius_dict_smile_1_{PAT_SHORT_NAME}')
    
    save_var(hsemotion_radius_dict, forced_name=f'hsemotion_radius_dict_smile_1_{PAT_SHORT_NAME}')
    
    save_var(opengraphau_radius_dict, forced_name=f'opengraphau_radius_dict_smile_1_{PAT_SHORT_NAME}')
    
    save_var(Final_Smile_Labels, forced_name=f'Final_Smile_Labels_{PAT_SHORT_NAME}')


In [None]:
# Smile: replace each radius-keyed dict by its single entry for the 1-second radius.
# NOTE: the key '0.01666' must match f'{i}' for the value in TIME_RADIUS_LIST.
# NOTE: not idempotent - the names are rebound to the inner dicts, so running this
# cell twice without re-running the radius cell raises KeyError.
if not(DANNY_LABELS_NOW.empty):
    openface_radius_dict = openface_radius_dict['0.01666']
    hsemotion_radius_dict = hsemotion_radius_dict['0.01666']
    opengraphau_radius_dict = opengraphau_radius_dict['0.01666']


In [None]:
# Build the per-pipeline smile feature dicts: successful frames only, restricted
# to the listed columns. When there are no smile labels these names are not
# defined here; the "Deal with missing variables" cell assigns placeholders.
if not(DANNY_LABELS_NOW.empty):
        
    # SMILE
    # OpenFace: the AU columns with suffix _r followed by those with suffix _c
    openface_smile = preprocess_df_radius_dict(openface_radius_dict, Final_Smile_Labels,
                                                    ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r',
           'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r',
           'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c',
           'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c',
           'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c',
           'AU26_c', 'AU45_c'])
    
    # OpenGraphAU: one column per AU
    opengraphau_smile = preprocess_df_radius_dict(opengraphau_radius_dict, Final_Smile_Labels,
                                                       ['AU1', 'AU2', 'AU4', 'AU5', 'AU6', 'AU7', 'AU9',
           'AU10', 'AU11', 'AU12', 'AU13', 'AU14', 'AU15', 'AU16', 'AU17', 'AU18',
           'AU19', 'AU20', 'AU22', 'AU23', 'AU24', 'AU25', 'AU26', 'AU27', 'AU32',
           'AU38', 'AU39'])
    
    # HSEmotion: smile uses only the 'Happiness' column
    hsemotion_smile = preprocess_df_radius_dict(hsemotion_radius_dict, Final_Smile_Labels,
                                                     ['Happiness'])


In [None]:
# SHORT EVENT DETECTION (SMILE, ETC.)
# Yawn: collect, for each pipeline (OpenFace, HSEmotion, OpenGraphAU), the
# frames around every labelled yawn event via get_radius_dict.
takeAll = True # we are taking all patient reports

# time radii are given in minutes
# example: 0.01666 is 1 second
TIME_RADIUS_LIST = [0.01666] # JUST one second

# Reset the shared radius dicts (the same names were used for Smile above).
# Outer dicts are keyed by the radius rendered as a string (f'{i}', i.e. '0.01666').
openface_radius_dict = {}
hsemotion_radius_dict = {}
opengraphau_radius_dict = {}

# Labels for the behavior currently being processed; the following cells guard on it.
DANNY_LABELS_NOW = Final_Yawn_Labels

# With no labelled events the dicts above simply stay empty.
if not(DANNY_LABELS_NOW.empty):
    for i in TIME_RADIUS_LIST:
      # NOTE(review): get_radius_dict is defined earlier in the notebook; its
      # exact windowing semantics are not visible here - confirm against it.
      openface_radius_now = get_radius_dict(i, dfs_openface, df_videoTimestamps, DANNY_LABELS_NOW, takeAll=takeAll)
      hsemotion_radius_now = get_radius_dict(i, dfs_hsemotion, df_videoTimestamps, DANNY_LABELS_NOW, takeAll=takeAll)
      opengraphau_radius_now = get_radius_dict(i, dfs_opengraphau, df_videoTimestamps, DANNY_LABELS_NOW, takeAll=takeAll)
    
      openface_radius_dict[f'{i}'] = openface_radius_now
      hsemotion_radius_dict[f'{i}'] = hsemotion_radius_now
      opengraphau_radius_dict[f'{i}'] = opengraphau_radius_now



In [None]:
# Persists the three radius-keyed dicts and the yawn labels under patient-specific
# names. Skipped when there are no yawn labels.
# NOTE(review): save_var is defined earlier in the notebook; where and how it
# stores the objects is not visible here.
if not(DANNY_LABELS_NOW.empty):
        
    # SAVE VARIABLES - Yawn
    
    save_var(openface_radius_dict, forced_name=f'openface_radius_dict_yawn_1_{PAT_SHORT_NAME}')
    
    save_var(hsemotion_radius_dict, forced_name=f'hsemotion_radius_dict_yawn_1_{PAT_SHORT_NAME}')
    
    save_var(opengraphau_radius_dict, forced_name=f'opengraphau_radius_dict_yawn_1_{PAT_SHORT_NAME}')
    
    save_var(Final_Yawn_Labels, forced_name=f'Final_Yawn_Labels_{PAT_SHORT_NAME}')


In [None]:
# Yawn: replace each radius-keyed dict by its single entry for the 1-second radius.
# NOTE: the key '0.01666' must match f'{i}' for the value in TIME_RADIUS_LIST.
# NOTE: not idempotent - the names are rebound to the inner dicts, so running this
# cell twice without re-running the radius cell raises KeyError.
if not(DANNY_LABELS_NOW.empty):
    
    openface_radius_dict = openface_radius_dict['0.01666']
    hsemotion_radius_dict = hsemotion_radius_dict['0.01666']
    opengraphau_radius_dict = opengraphau_radius_dict['0.01666']


In [None]:
# Build the per-pipeline yawn feature dicts: successful frames only, restricted
# to the listed columns. When there are no yawn labels these names are not
# defined here; the "Deal with missing variables" cell assigns placeholders.
if not(DANNY_LABELS_NOW.empty):
        
    # YAWN
    # OpenFace: the AU columns with suffix _r followed by those with suffix _c
    openface_yawn = preprocess_df_radius_dict(openface_radius_dict, Final_Yawn_Labels,
                                                    ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r',
           'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r',
           'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c',
           'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c',
           'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c',
           'AU26_c', 'AU45_c'])
    
    # OpenGraphAU: one column per AU
    opengraphau_yawn = preprocess_df_radius_dict(opengraphau_radius_dict, Final_Yawn_Labels,
                                                       ['AU1', 'AU2', 'AU4', 'AU5', 'AU6', 'AU7', 'AU9',
           'AU10', 'AU11', 'AU12', 'AU13', 'AU14', 'AU15', 'AU16', 'AU17', 'AU18',
           'AU19', 'AU20', 'AU22', 'AU23', 'AU24', 'AU25', 'AU26', 'AU27', 'AU32',
           'AU38', 'AU39'])
    
    # HSEmotion: yawn uses all seven emotion columns (unlike smile, which uses one)
    hsemotion_yawn = preprocess_df_radius_dict(hsemotion_radius_dict, Final_Yawn_Labels,
                                                     ['Anger', 'Disgust', 'Fear', 'Happiness',
           'Neutral', 'Sadness', 'Surprise'])


In [None]:
# SHORT EVENT DETECTION (SMILE, ETC.)
# Discomfort: collect, for each pipeline (OpenFace, HSEmotion, OpenGraphAU), the
# frames around every labelled discomfort event via get_radius_dict.
takeAll = True # we are taking all patient reports

# time radii are given in minutes
# example: 0.01666 is 1 second
TIME_RADIUS_LIST = [0.01666] # JUST one second

# Reset the shared radius dicts (the same names were used for the previous behaviors).
# Outer dicts are keyed by the radius rendered as a string (f'{i}', i.e. '0.01666').
openface_radius_dict = {}
hsemotion_radius_dict = {}
opengraphau_radius_dict = {}

# Labels for the behavior currently being processed; the following cells guard on it.
DANNY_LABELS_NOW = Final_Discomfort_Labels

# With no labelled events the dicts above simply stay empty.
if not(DANNY_LABELS_NOW.empty):
    for i in TIME_RADIUS_LIST:
      # NOTE(review): get_radius_dict is defined earlier in the notebook; its
      # exact windowing semantics are not visible here - confirm against it.
      openface_radius_now = get_radius_dict(i, dfs_openface, df_videoTimestamps, DANNY_LABELS_NOW, takeAll=takeAll)
      hsemotion_radius_now = get_radius_dict(i, dfs_hsemotion, df_videoTimestamps, DANNY_LABELS_NOW, takeAll=takeAll)
      opengraphau_radius_now = get_radius_dict(i, dfs_opengraphau, df_videoTimestamps, DANNY_LABELS_NOW, takeAll=takeAll)
    
      openface_radius_dict[f'{i}'] = openface_radius_now
      hsemotion_radius_dict[f'{i}'] = hsemotion_radius_now
      opengraphau_radius_dict[f'{i}'] = opengraphau_radius_now



In [None]:
# Persists the three radius-keyed dicts and the discomfort labels under
# patient-specific names. Skipped when there are no discomfort labels.
# NOTE(review): save_var is defined earlier in the notebook; where and how it
# stores the objects is not visible here.
if not(DANNY_LABELS_NOW.empty):
        
    # SAVE VARIABLES - Discomfort
    
    save_var(openface_radius_dict, forced_name=f'openface_radius_dict_discomfort_1_{PAT_SHORT_NAME}')
    
    save_var(hsemotion_radius_dict, forced_name=f'hsemotion_radius_dict_discomfort_1_{PAT_SHORT_NAME}')
    
    save_var(opengraphau_radius_dict, forced_name=f'opengraphau_radius_dict_discomfort_1_{PAT_SHORT_NAME}')
    
    save_var(Final_Discomfort_Labels, forced_name=f'Final_Discomfort_Labels_{PAT_SHORT_NAME}')


In [None]:
# Discomfort: replace each radius-keyed dict by its single entry for the 1-second radius.
# NOTE: the key '0.01666' must match f'{i}' for the value in TIME_RADIUS_LIST.
# NOTE: not idempotent - the names are rebound to the inner dicts, so running this
# cell twice without re-running the radius cell raises KeyError.
if not(DANNY_LABELS_NOW.empty):
    
    openface_radius_dict = openface_radius_dict['0.01666']
    hsemotion_radius_dict = hsemotion_radius_dict['0.01666']
    opengraphau_radius_dict = opengraphau_radius_dict['0.01666']


In [None]:
# Build the per-pipeline discomfort feature dicts: successful frames only,
# restricted to the listed columns. When there are no discomfort labels these
# names are not defined here; the "Deal with missing variables" cell assigns
# placeholders.
if not(DANNY_LABELS_NOW.empty):
        
    # DISCOMFORT
    # OpenFace: the AU columns with suffix _r followed by those with suffix _c
    openface_discomfort = preprocess_df_radius_dict(openface_radius_dict, Final_Discomfort_Labels,
                                                    ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r',
           'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r',
           'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c',
           'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c',
           'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c',
           'AU26_c', 'AU45_c'])
    # OpenGraphAU: one column per AU
    opengraphau_discomfort = preprocess_df_radius_dict(opengraphau_radius_dict, Final_Discomfort_Labels,
                                                       ['AU1', 'AU2', 'AU4', 'AU5', 'AU6', 'AU7', 'AU9',
           'AU10', 'AU11', 'AU12', 'AU13', 'AU14', 'AU15', 'AU16', 'AU17', 'AU18',
           'AU19', 'AU20', 'AU22', 'AU23', 'AU24', 'AU25', 'AU26', 'AU27', 'AU32',
           'AU38', 'AU39'])
    # HSEmotion: discomfort uses all seven emotion columns
    hsemotion_discomfort = preprocess_df_radius_dict(hsemotion_radius_dict, Final_Discomfort_Labels,
                                                     ['Anger', 'Disgust', 'Fear', 'Happiness',
           'Neutral', 'Sadness', 'Surprise'])

In [None]:
# SHORT EVENT DETECTION (SMILE, ETC.)
# Sad: collect, for each pipeline (OpenFace, HSEmotion, OpenGraphAU), the
# frames around every labelled sad event via get_radius_dict.
takeAll = True # we are taking all patient reports

# time radii are given in minutes
# example: 0.01666 is 1 second
TIME_RADIUS_LIST = [0.01666] # JUST one second

# Reset the shared radius dicts (the same names were used for the previous behaviors).
# Outer dicts are keyed by the radius rendered as a string (f'{i}', i.e. '0.01666').
openface_radius_dict = {}
hsemotion_radius_dict = {}
opengraphau_radius_dict = {}

# Labels for the behavior currently being processed; the following cells guard on it.
DANNY_LABELS_NOW = Final_Sad_Labels

# With no labelled events the dicts above simply stay empty.
if not(DANNY_LABELS_NOW.empty):
    
    for i in TIME_RADIUS_LIST:
      # NOTE(review): get_radius_dict is defined earlier in the notebook; its
      # exact windowing semantics are not visible here - confirm against it.
      openface_radius_now = get_radius_dict(i, dfs_openface, df_videoTimestamps, DANNY_LABELS_NOW, takeAll=takeAll)
      hsemotion_radius_now = get_radius_dict(i, dfs_hsemotion, df_videoTimestamps, DANNY_LABELS_NOW, takeAll=takeAll)
      opengraphau_radius_now = get_radius_dict(i, dfs_opengraphau, df_videoTimestamps, DANNY_LABELS_NOW, takeAll=takeAll)
    
      openface_radius_dict[f'{i}'] = openface_radius_now
      hsemotion_radius_dict[f'{i}'] = hsemotion_radius_now
      opengraphau_radius_dict[f'{i}'] = opengraphau_radius_now
    


In [None]:
# Persists the three radius-keyed dicts and the sad labels under patient-specific
# names. Skipped when there are no sad labels.
# NOTE(review): save_var is defined earlier in the notebook; where and how it
# stores the objects is not visible here.
if not(DANNY_LABELS_NOW.empty):
    
    # SAVE VARIABLES - Sad
    
    save_var(openface_radius_dict, forced_name=f'openface_radius_dict_sad_1_{PAT_SHORT_NAME}')
    
    save_var(hsemotion_radius_dict, forced_name=f'hsemotion_radius_dict_sad_1_{PAT_SHORT_NAME}')
    
    save_var(opengraphau_radius_dict, forced_name=f'opengraphau_radius_dict_sad_1_{PAT_SHORT_NAME}')
    
    save_var(Final_Sad_Labels, forced_name=f'Final_Sad_Labels_{PAT_SHORT_NAME}')


In [None]:
# Sad: replace each radius-keyed dict by its single entry for the 1-second radius.
# NOTE: the key '0.01666' must match f'{i}' for the value in TIME_RADIUS_LIST.
# NOTE: not idempotent - the names are rebound to the inner dicts, so running this
# cell twice without re-running the radius cell raises KeyError.
if not(DANNY_LABELS_NOW.empty):
    openface_radius_dict = openface_radius_dict['0.01666']
    hsemotion_radius_dict = hsemotion_radius_dict['0.01666']
    opengraphau_radius_dict = opengraphau_radius_dict['0.01666']


In [None]:
# Build the per-pipeline sad feature dicts: successful frames only, restricted
# to the listed columns. When there are no sad labels these names are not
# defined here; the "Deal with missing variables" cell assigns placeholders.
if not(DANNY_LABELS_NOW.empty):
    # SAD
    # OpenFace: the AU columns with suffix _r followed by those with suffix _c
    openface_sad = preprocess_df_radius_dict(openface_radius_dict, Final_Sad_Labels,
                                                    ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r',
           'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r',
           'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c',
           'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c',
           'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c',
           'AU26_c', 'AU45_c'])
    # OpenGraphAU: one column per AU
    opengraphau_sad = preprocess_df_radius_dict(opengraphau_radius_dict, Final_Sad_Labels,
                                                       ['AU1', 'AU2', 'AU4', 'AU5', 'AU6', 'AU7', 'AU9',
           'AU10', 'AU11', 'AU12', 'AU13', 'AU14', 'AU15', 'AU16', 'AU17', 'AU18',
           'AU19', 'AU20', 'AU22', 'AU23', 'AU24', 'AU25', 'AU26', 'AU27', 'AU32',
           'AU38', 'AU39'])
    # HSEmotion: sad uses only the 'Sadness' column
    hsemotion_sad = preprocess_df_radius_dict(hsemotion_radius_dict, Final_Sad_Labels,
                                                     ['Sadness'])

# Deal with missing variables

In [None]:
# The preprocessing cells only define the per-behavior feature variables when
# that behavior has labels. For a behavior without labels, bind all three of
# its variables to the placeholder 0 so the result cells can still reference
# them (train_and_evaluate returns early on empty labels).
if len(Final_Smile_Labels) == 0:
    openface_smile = opengraphau_smile = hsemotion_smile = 0

if len(Final_Discomfort_Labels) == 0:
    openface_discomfort = opengraphau_discomfort = hsemotion_discomfort = 0

if len(Final_Yawn_Labels) == 0:
    openface_yawn = opengraphau_yawn = hsemotion_yawn = 0

if len(Final_Sad_Labels) == 0:
    openface_sad = opengraphau_sad = hsemotion_sad = 0

## Func: Train + Eval (5-Fold CV)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, f1_score, average_precision_score
import matplotlib.pyplot as plt
from IPython.display import display
from scipy import interp


def train_and_evaluate(smile_dict, Final_Smile_Labels, pipeline_name='OpenFace', results_path=RESULTS_PATH_BASE + 'Smile/'):
    """Train and score a logistic-regression event detector with stratified 5-fold CV.

    Every per-event DataFrame in `smile_dict` is collapsed to its column means,
    the resulting feature rows are inner-joined to the labels on 'Datetime',
    and a LogisticRegression is fit and evaluated on each fold. The per-fold
    and averaged ROC curves are plotted and saved, and the metrics table is
    written to CSV.

    Args:
        smile_dict: dict mapping an event timestamp to a DataFrame of per-frame
            features. Not inspected when `Final_Smile_Labels` is empty, so a
            placeholder is acceptable in that case.
        Final_Smile_Labels: DataFrame with 'Datetime' and 'EventDetected'
            columns. Any further columns end up in the feature matrix.
        pipeline_name: Used in the plot title and in the output file names.
        results_path: Directory that receives '<pipeline_name> roc_curve.png'
            and '<pipeline_name> metrics.csv'; created if it does not exist.

    Returns:
        (metrics_df, mean_fpr, mean_tpr): metrics_df has one row per fold plus
        an 'Average' row; mean_fpr is the common FPR grid and mean_tpr the
        fold-averaged TPR on that grid.
        (pd.DataFrame(), 0, 0) is returned, and nothing is written, when no
        model can be evaluated: empty labels, no feature windows, fewer than
        two classes after the merge, or a class with fewer examples than folds.
    """
    NUMBER_OF_FOLDS = 5

    # If we don't have any events, can't train and eval a model!
    if Final_Smile_Labels.empty:
        return pd.DataFrame(), 0, 0

    # Every window may have been dropped upstream as empty; without this guard
    # next(iter(smile_dict)) below would raise StopIteration.
    if not smile_dict:
        return pd.DataFrame(), 0, 0

    # Averaging values across rows for each DataFrame in smile_dict
    averaged_values = {timestamp: df.mean() for timestamp, df in smile_dict.items()}

    # Convert the dictionary to a DataFrame (one row per event timestamp)
    feature_columns = smile_dict[next(iter(smile_dict))].columns
    averaged_df = pd.DataFrame.from_dict(averaged_values, orient='index', columns=feature_columns)

    # Inner join: labels without a feature window (and vice versa) are dropped
    merged_df = pd.merge(Final_Smile_Labels, averaged_df, left_on='Datetime', right_index=True)
    merged_df = merged_df.fillna(0)

    # Split features and labels
    X = merged_df.drop(['Datetime', 'EventDetected'], axis=1)
    y = merged_df['EventDetected'].astype('int')

    if len(np.unique(y)) < 2:
        return pd.DataFrame(), 0, 0

    # With fewer examples of a class than folds, some test fold contains a
    # single class and the ROC-based metrics are undefined for it.
    if y.value_counts().min() < NUMBER_OF_FOLDS:
        print(f'{pipeline_name}: fewer than {NUMBER_OF_FOLDS} examples of one class; skipping cross-validation')
        return pd.DataFrame(), 0, 0

    # Initialize StratifiedKFold and LogisticRegression
    skf = StratifiedKFold(n_splits=NUMBER_OF_FOLDS, shuffle=True, random_state=42)
    log_reg = LogisticRegression()

    # Lists to hold metrics and ROC curve values across folds
    auroc_list, accuracy_list, f1_list, auprc_list = [], [], [], []
    mean_fpr = np.linspace(0, 1, 100)  # Common grid of FPR values for averaging the ROC curves
    tprs = []  # Interpolated TPR values, one array per fold

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        log_reg.fit(X_train, y_train)
        y_pred = log_reg.predict(X_test)
        y_proba = log_reg.predict_proba(X_test)[:, 1]

        # Metrics computation
        auroc_list.append(roc_auc_score(y_test, y_proba))
        accuracy_list.append(accuracy_score(y_test, y_pred))
        f1_list.append(f1_score(y_test, y_pred))
        auprc_list.append(average_precision_score(y_test, y_proba))

        # ROC curve of this fold, resampled onto the common FPR grid.
        # np.interp is used directly: scipy's top-level `interp` was only a
        # deprecated alias of it and newer SciPy versions no longer export it.
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        tprs.append(np.interp(mean_fpr, fpr, tpr))

    # Compute the mean TPR values at each FPR to get the "averaged" ROC curve
    mean_tpr = np.mean(tprs, axis=0)

    # Plot on an explicit figure so repeated calls never draw onto each other
    fig, ax = plt.subplots()
    for fold_tpr in tprs:
        ax.plot(mean_fpr, fold_tpr, color='b', alpha=0.1)  # each fold's ROC curve, faint
    ax.plot(mean_fpr, mean_tpr, color='b', linewidth=2)  # the "averaged" ROC curve in bold
    ax.set_title(f'{pipeline_name} {NUMBER_OF_FOLDS}-Fold CV LogReg')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')

    # Save the figure to a file
    os.makedirs(results_path, exist_ok=True)
    output_file_path = os.path.join(results_path, f'{pipeline_name} roc_curve.png')
    fig.savefig(output_file_path)

    plt.show()
    plt.close(fig)  # release the figure; this function is called many times per run

    # Reporting metrics: one row per fold followed by the average
    metrics_df = pd.DataFrame({
        'Fold': list(range(1, NUMBER_OF_FOLDS + 1)) + ['Average'],
        'Accuracy': accuracy_list + [np.mean(accuracy_list)],
        'F1 Score': f1_list + [np.mean(f1_list)],
        'AUPRC': auprc_list + [np.mean(auprc_list)],
        'AUROC': auroc_list + [np.mean(auroc_list)]
    })

    # Save the DataFrame to a CSV file next to the plot
    csv_file_path = os.path.join(results_path, f'{pipeline_name} metrics.csv')
    metrics_df.to_csv(csv_file_path, index=False)

    return metrics_df, mean_fpr, mean_tpr



## Smile Results: Train + Eval (5-Fold CV)

In [None]:
# Smile / OpenFace, real labels. Plot and CSV are written under RESULTS_PATH_BASE + 'Smile/'.
of_metrics_df, of_mean_fpr, of_mean_tpr = train_and_evaluate(openface_smile, Final_Smile_Labels, pipeline_name='OpenFace', results_path=RESULTS_PATH_BASE + 'Smile/')
print('OPENFACE: ')
display(of_metrics_df)

In [None]:
# Smile / OpenGraphAU, real labels. Plot and CSV are written under RESULTS_PATH_BASE + 'Smile/'.
ogau_metrics_df, ogau_mean_fpr, ogau_mean_tpr = train_and_evaluate(opengraphau_smile, Final_Smile_Labels, pipeline_name='OpenGraphAU', results_path=RESULTS_PATH_BASE + 'Smile/')
print('OPENGRAPHAU: ')
display(ogau_metrics_df)

In [None]:
# Smile / HSEmotion, real labels. Plot and CSV are written under RESULTS_PATH_BASE + 'Smile/'.
hse_metrics_df, hse_mean_fpr, hse_mean_tpr = train_and_evaluate(hsemotion_smile, Final_Smile_Labels, pipeline_name='HSEmotion', results_path=RESULTS_PATH_BASE + 'Smile/')
print('HSEMOTION: ')
display(hse_metrics_df)

## Smile Checking shuffled labels

In [None]:
# Smile / OpenFace, shuffled labels. Files carry the 'Shuffled ' prefix under RESULTS_PATH_BASE + 'Smile/'.
s_of_metrics_df, s_of_mean_fpr, s_of_mean_tpr = train_and_evaluate(openface_smile, Shuffled_Smile_Labels, pipeline_name='Shuffled OpenFace', results_path=RESULTS_PATH_BASE + 'Smile/')
print('OPENFACE SHUFFLED: ')
display(s_of_metrics_df)

In [None]:
# Smile / OpenGraphAU, shuffled labels. Files carry the 'Shuffled ' prefix under RESULTS_PATH_BASE + 'Smile/'.
s_ogau_metrics_df, s_ogau_mean_fpr, s_ogau_mean_tpr = train_and_evaluate(opengraphau_smile, Shuffled_Smile_Labels, pipeline_name='Shuffled OpenGraphAU', results_path=RESULTS_PATH_BASE + 'Smile/')
print('OPENGRAPHAU SHUFFLED: ')
display(s_ogau_metrics_df)

In [None]:
# Smile / HSEmotion, shuffled labels. Files carry the 'Shuffled ' prefix under RESULTS_PATH_BASE + 'Smile/'.
s_hse_metrics_df, s_hse_mean_fpr, s_hse_mean_tpr = train_and_evaluate(hsemotion_smile, Shuffled_Smile_Labels, pipeline_name='Shuffled HSEmotion', results_path=RESULTS_PATH_BASE + 'Smile/')
print('HSEMOTION SHUFFLED: ')
display(s_hse_metrics_df)

## Yawn Results: Train + Eval (5-Fold CV)

In [None]:
# Yawn / OpenFace, real labels. NOTE: rebinds the of_* names used by the Smile section.
of_metrics_df, of_mean_fpr, of_mean_tpr = train_and_evaluate(openface_yawn, Final_Yawn_Labels, pipeline_name='OpenFace', results_path=RESULTS_PATH_BASE + 'Yawn/')
print('OPENFACE: ')
display(of_metrics_df)

In [None]:
# Yawn / OpenGraphAU, real labels. NOTE: rebinds the ogau_* names used by the Smile section.
ogau_metrics_df, ogau_mean_fpr, ogau_mean_tpr = train_and_evaluate(opengraphau_yawn, Final_Yawn_Labels, pipeline_name='OpenGraphAU', results_path=RESULTS_PATH_BASE + 'Yawn/')
print('OPENGRAPHAU: ')
display(ogau_metrics_df)

In [None]:
# Yawn / HSEmotion, real labels. NOTE: rebinds the hse_* names used by the Smile section.
hse_metrics_df, hse_mean_fpr, hse_mean_tpr = train_and_evaluate(hsemotion_yawn, Final_Yawn_Labels, pipeline_name='HSEmotion', results_path=RESULTS_PATH_BASE + 'Yawn/')
print('HSEMOTION: ')
display(hse_metrics_df)

## Yawn Checking shuffled labels

In [None]:
# Yawn / OpenFace, shuffled labels. Files carry the 'Shuffled ' prefix under RESULTS_PATH_BASE + 'Yawn/'.
s_of_metrics_df, s_of_mean_fpr, s_of_mean_tpr = train_and_evaluate(openface_yawn, Shuffled_Yawn_Labels, pipeline_name='Shuffled OpenFace', results_path=RESULTS_PATH_BASE + 'Yawn/')
print('OPENFACE SHUFFLED: ')
display(s_of_metrics_df)

In [None]:
# Yawn / OpenGraphAU, shuffled labels. Files carry the 'Shuffled ' prefix under RESULTS_PATH_BASE + 'Yawn/'.
s_ogau_metrics_df, s_ogau_mean_fpr, s_ogau_mean_tpr = train_and_evaluate(opengraphau_yawn, Shuffled_Yawn_Labels, pipeline_name='Shuffled OpenGraphAU', results_path=RESULTS_PATH_BASE + 'Yawn/')
print('OPENGRAPHAU SHUFFLED: ')
display(s_ogau_metrics_df)

In [None]:
# Yawn / HSEmotion, shuffled labels. Files carry the 'Shuffled ' prefix under RESULTS_PATH_BASE + 'Yawn/'.
s_hse_metrics_df, s_hse_mean_fpr, s_hse_mean_tpr = train_and_evaluate(hsemotion_yawn, Shuffled_Yawn_Labels, pipeline_name='Shuffled HSEmotion', results_path=RESULTS_PATH_BASE + 'Yawn/')
print('HSEMOTION SHUFFLED: ')
display(s_hse_metrics_df)

## Discomfort Results: Train + Eval (5-Fold CV)

In [None]:
# Discomfort / OpenFace, real labels. NOTE: rebinds the of_* names used by the earlier sections.
of_metrics_df, of_mean_fpr, of_mean_tpr = train_and_evaluate(openface_discomfort, Final_Discomfort_Labels, pipeline_name='OpenFace', results_path=RESULTS_PATH_BASE + 'Discomfort/')
print('OPENFACE: ')
display(of_metrics_df)

In [None]:
# Discomfort / OpenGraphAU, real labels. NOTE: rebinds the ogau_* names used by the earlier sections.
ogau_metrics_df, ogau_mean_fpr, ogau_mean_tpr = train_and_evaluate(opengraphau_discomfort, Final_Discomfort_Labels, pipeline_name='OpenGraphAU', results_path=RESULTS_PATH_BASE + 'Discomfort/')
print('OPENGRAPHAU: ')
display(ogau_metrics_df)

In [None]:
# Discomfort / HSEmotion, real labels. NOTE: rebinds the hse_* names used by the earlier sections.
hse_metrics_df, hse_mean_fpr, hse_mean_tpr = train_and_evaluate(hsemotion_discomfort, Final_Discomfort_Labels, pipeline_name='HSEmotion', results_path=RESULTS_PATH_BASE + 'Discomfort/')
print('HSEMOTION: ')
display(hse_metrics_df)

## Discomfort Checking shuffled labels

In [None]:
# Discomfort / OpenFace, shuffled labels. Files carry the 'Shuffled ' prefix under RESULTS_PATH_BASE + 'Discomfort/'.
s_of_metrics_df, s_of_mean_fpr, s_of_mean_tpr = train_and_evaluate(openface_discomfort, Shuffled_Discomfort_Labels, pipeline_name='Shuffled OpenFace', results_path=RESULTS_PATH_BASE + 'Discomfort/')
print('OPENFACE SHUFFLED: ')
display(s_of_metrics_df)

In [None]:
# Discomfort / OpenGraphAU, shuffled labels. Files carry the 'Shuffled ' prefix under RESULTS_PATH_BASE + 'Discomfort/'.
s_ogau_metrics_df, s_ogau_mean_fpr, s_ogau_mean_tpr = train_and_evaluate(opengraphau_discomfort, Shuffled_Discomfort_Labels, pipeline_name='Shuffled OpenGraphAU', results_path=RESULTS_PATH_BASE + 'Discomfort/')
print('OPENGRAPHAU SHUFFLED: ')
display(s_ogau_metrics_df)

In [None]:
# Discomfort / HSEmotion, shuffled labels. Files carry the 'Shuffled ' prefix under RESULTS_PATH_BASE + 'Discomfort/'.
s_hse_metrics_df, s_hse_mean_fpr, s_hse_mean_tpr = train_and_evaluate(hsemotion_discomfort, Shuffled_Discomfort_Labels, pipeline_name='Shuffled HSEmotion', results_path=RESULTS_PATH_BASE + 'Discomfort/')
print('HSEMOTION SHUFFLED: ')
display(s_hse_metrics_df)

## Sad Results: Train + Eval (5-Fold CV)

In [None]:
# Sad / OpenFace, real labels. NOTE: rebinds the of_* names used by the earlier sections.
of_metrics_df, of_mean_fpr, of_mean_tpr = train_and_evaluate(openface_sad, Final_Sad_Labels, pipeline_name='OpenFace', results_path=RESULTS_PATH_BASE + 'Sad/')
print('OPENFACE: ')
display(of_metrics_df)

In [None]:
# Sad / OpenGraphAU, real labels. NOTE: rebinds the ogau_* names used by the earlier sections.
ogau_metrics_df, ogau_mean_fpr, ogau_mean_tpr = train_and_evaluate(opengraphau_sad, Final_Sad_Labels, pipeline_name='OpenGraphAU', results_path=RESULTS_PATH_BASE + 'Sad/')
print('OPENGRAPHAU: ')
display(ogau_metrics_df)

In [None]:
# Sad / HSEmotion, real labels. NOTE: rebinds the hse_* names used by the earlier sections.
hse_metrics_df, hse_mean_fpr, hse_mean_tpr = train_and_evaluate(hsemotion_sad, Final_Sad_Labels, pipeline_name='HSEmotion', results_path=RESULTS_PATH_BASE + 'Sad/')
print('HSEMOTION: ')
display(hse_metrics_df)

## Sad Checking shuffled labels

In [None]:
# Sad / OpenFace, shuffled labels. Files carry the 'Shuffled ' prefix under RESULTS_PATH_BASE + 'Sad/'.
s_of_metrics_df, s_of_mean_fpr, s_of_mean_tpr = train_and_evaluate(openface_sad, Shuffled_Sad_Labels, pipeline_name='Shuffled OpenFace', results_path=RESULTS_PATH_BASE + 'Sad/')
print('OPENFACE SHUFFLED: ')
display(s_of_metrics_df)

In [None]:
# Sad / OpenGraphAU, shuffled labels. Files carry the 'Shuffled ' prefix under RESULTS_PATH_BASE + 'Sad/'.
s_ogau_metrics_df, s_ogau_mean_fpr, s_ogau_mean_tpr = train_and_evaluate(opengraphau_sad, Shuffled_Sad_Labels, pipeline_name='Shuffled OpenGraphAU', results_path=RESULTS_PATH_BASE + 'Sad/')
print('OPENGRAPHAU SHUFFLED: ')
display(s_ogau_metrics_df)

In [None]:
# Sad / HSEmotion, shuffled labels. Files carry the 'Shuffled ' prefix under RESULTS_PATH_BASE + 'Sad/'.
s_hse_metrics_df, s_hse_mean_fpr, s_hse_mean_tpr = train_and_evaluate(hsemotion_sad, Shuffled_Sad_Labels, pipeline_name='Shuffled HSEmotion', results_path=RESULTS_PATH_BASE + 'Sad/')
print('HSEMOTION SHUFFLED: ')
display(s_hse_metrics_df)