# Parsing

In [139]:
# Default directory that retrieve_files() scans for per-participant sub-folders.
base_directory = 'data'

# File-name suffixes grouped by modality. retrieve_files() looks each one up
# as "<folder prefix>_<suffix>" inside a participant folder.
text_files = ['TRANSCRIPT.csv']
audio_files = ['AUDIO.wav', 'FORMANT.csv', 'COVAREP.csv']
face_files = [
    'CLNF_gaze.txt',
    'CLNF_AUs.txt',
    'CLNF_hog.bin',
    'CLNF_features.txt',
    'CLNF_pose.txt',
    'CLNF_features3D.txt',
]

# Every known file, in text -> audio -> face order.
all_files = [*text_files, *audio_files, *face_files]

In [140]:
import os
import pandas as pd

# Function to retrieve the necessary files
def retrieve_files(base_dir = base_directory, required_files=None, folder_ids=None):
    """Build a table of file paths, one row per participant sub-folder.

    Parameters
    ----------
    base_dir : str
        Directory whose immediate sub-folders are scanned.
    required_files : list of str, optional
        File-name suffixes to look for in every sub-folder (for example
        'TRANSCRIPT.csv'). Each one is searched as "<prefix>_<suffix>", where
        <prefix> is the part of the folder name before the first underscore.
        ``None`` (the default) means no files, so only the ID column is built.
    folder_ids : collection of int, optional
        When non-empty, only sub-folders whose numeric prefix is in this
        collection are kept. ``None`` or an empty collection disables the
        filter, so every sub-folder is returned.

    Returns
    -------
    pandas.DataFrame
        An integer 'ID' column plus one column per required file, named after
        the suffix up to its first dot. A cell holds the path when the file
        exists and None otherwise. Rows follow ``os.listdir`` order.
    """
    required_files = [] if required_files is None else list(required_files)
    # Deliberately truthiness-based: an empty ID list means "no filter".
    wanted_ids = set(folder_ids) if folder_ids else None

    data = []
    for subfolder in os.listdir(base_dir):
        subfolder_path = os.path.join(base_dir, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        # Folder names look like "<id>_P"; keep only the part before the underscore.
        prefix = subfolder.split('_')[0]
        try:
            participant_id = int(prefix)
        except ValueError:
            # Not a participant folder (e.g. ".ipynb_checkpoints"): skip it
            # instead of failing the whole scan.
            continue

        if wanted_ids is not None and participant_id not in wanted_ids:
            continue

        subfolder_files = {'ID': participant_id}
        for file_name in required_files:
            # Use the same prefix as the ID so IDs of any length resolve correctly.
            file_path = os.path.join(subfolder_path, f"{prefix}_{file_name}")
            formatted_file_name = file_name.split('.')[0]
            subfolder_files[formatted_file_name] = file_path if os.path.exists(file_path) else None

        data.append(subfolder_files)

    # Passing the columns explicitly keeps the schema stable even when no
    # folder matched, so callers can still rely on df['ID'].
    columns = list(dict.fromkeys(['ID'] + [name.split('.')[0] for name in required_files]))
    df = pd.DataFrame(data, columns=columns)
    df['ID'] = df['ID'].astype(int)
    return df

In [141]:
# CSV files that hold the depression labels, one file per data split.
phq_paths = [
    'testing/dev_split_Depression_AVEC2017.csv',
    'testing/full_test_split.csv',
    'testing/train_split_Depression_AVEC2017.csv'
]

def append_PHQ_Binary(df):
    """Attach the binary PHQ label to each row of ``df`` and drop unlabelled rows.

    Every file in ``phq_paths`` is read. The label is taken from the
    'PHQ_Binary' column or, failing that, 'PHQ8_Binary'. Files with neither
    column are ignored. Labels are matched on ``df['ID']`` against the
    files' 'Participant_ID' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'ID' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added 'PHQ_Binary' column, restricted to the rows
        for which a label was found. The original row index values are kept,
        so the index may have gaps.

    Raises
    ------
    ValueError
        If none of the files in ``phq_paths`` provides a label column.
    """
    phq_dataframes = []

    for path in phq_paths:
        phq_df = pd.read_csv(path)

        # The split files do not agree on the label column name.
        phq_column = next(
            (name for name in ('PHQ_Binary', 'PHQ8_Binary') if name in phq_df.columns),
            None,
        )
        if phq_column is None:
            continue

        # rename() without inplace returns a fresh frame, which avoids the
        # SettingWithCopyWarning raised when renaming a column slice in place.
        phq_dataframes.append(
            phq_df[['Participant_ID', phq_column]].rename(columns={phq_column: 'PHQ_Binary'})
        )

    if not phq_dataframes:
        raise ValueError(
            "None of the files in phq_paths contains a 'PHQ_Binary' or 'PHQ8_Binary' column"
        )

    combined_phq_df = pd.concat(phq_dataframes, ignore_index=True)
    merged = df.merge(combined_phq_df, how='left', left_on='ID', right_on='Participant_ID')

    return merged.drop(columns=['Participant_ID']).dropna(subset=['PHQ_Binary'])



In [142]:
def get_balanced_subset(df, percentage, random_state):
    """Draw a shuffled subset with equally many PHQ_Binary == 0 and == 1 rows.

    The subset aims for ``int(len(df) * percentage)`` rows in total, half per
    class. The per-class count is additionally capped by the size of the
    smaller class. The same ``random_state`` is used for both class draws and
    for the final shuffle. The result has a fresh 0..n-1 index.
    """
    # Half of the requested total goes to each class.
    per_class_budget = int(len(df) * percentage) // 2

    class_frames = [df[df['PHQ_Binary'] == label] for label in (0, 1)]
    n_per_class = min(len(class_frames[0]), len(class_frames[1]), per_class_budget)

    # Class 0 is drawn first, then class 1, then the stacked result is shuffled.
    draws = [frame.sample(n=n_per_class, random_state=random_state) for frame in class_frames]
    shuffled = pd.concat(draws).sample(frac=1, random_state=random_state)

    return shuffled.reset_index(drop=True)

In [143]:
import util.pp_text as pp_text
import util.pp_audio as pp_audio
import util.pp_face as pp_face

def get_feature_subset_df(base_dir=base_directory, required_files=all_files, folder_ids=None):
    """Load and preprocess the requested files for every selected participant.

    The file paths come from ``retrieve_files(base_dir, required_files,
    folder_ids)``. For each participant, every available modality is passed
    through its preprocessing function, the resulting columns are prefixed
    with the modality name and placed side by side, and the participant's ID
    is added. The per-participant frames are then stacked.

    Returns
    -------
    pandas.DataFrame
        Stacked features with a fresh 0..n-1 index and an 'ID' column. Every
        character that is not a word character is removed from the column
        names. When no participant is selected, an empty frame with only an
        'ID' column is returned.
    """
    path_map = retrieve_files(base_dir, required_files, folder_ids)
    feature_map = {
        'TRANSCRIPT': (pp_text.preprocess_TRANSCRIPT, 'TRANSCRIPT_'),
        'AUDIO': (pp_audio.preprocess_AUDIO, 'AUDIO_'),
        'FORMANT': (pp_audio.preprocess_FORMANT, 'FORMANT_'),
        'COVAREP': (pp_audio.preprocess_COVAREP, 'COVAREP_'),
        'CLNF_gaze': (pp_face.preprocess_CLNF_gaze, 'CLNFgaze_'),
        'CLNF_AUs': (pp_face.preprocess_CLNF_AUs, 'CLNFAUs_'),
        'CLNF_hog': (pp_face.preprocess_CLNF_hog, 'CLNFhog_'),
        'CLNF_features': (pp_face.preprocess_CLNF_features, 'CLNFfeatures_'),
        'CLNF_pose': (pp_face.preprocess_CLNF_pose, 'CLNFpose_'),
        'CLNF_features3D': (pp_face.preprocess_CLNF_features3D, 'CLNFfeatures3D_')
    }

    # Only the modalities for which retrieve_files produced a path column.
    active_features = [
        (column, preprocess_func, prefix)
        for column, (preprocess_func, prefix) in feature_map.items()
        if column in path_map.columns
    ]

    # Collect the frames and concatenate once. Growing a DataFrame with
    # pd.concat inside the loop re-copies everything on every iteration.
    participant_frames = []
    for position in range(len(path_map)):
        # NOTE(review): a path is None when the file is missing on disk; the
        # preprocess_* helpers are assumed to cope with that - confirm.
        feature_frames = [
            preprocess_func(path_map[column].iloc[position]).add_prefix(prefix)
            for column, preprocess_func, prefix in active_features
        ]
        participant_df = pd.concat(feature_frames, axis=1) if feature_frames else pd.DataFrame()
        # Add ID
        participant_df['ID'] = path_map['ID'].iloc[position]
        participant_frames.append(participant_df)

    if not participant_frames:
        # Nothing selected: keep the 'ID' column so callers can still index on it.
        return pd.DataFrame(columns=['ID'])

    df = pd.concat(participant_frames, ignore_index=True)

    # Format column names properly
    df.columns = df.columns.str.replace(r'[^\w]', '', regex=True)

    return df


# testing method above:

In [144]:
def get_subset(percentage, random_state):
    """Return the participant IDs of a class-balanced random subset.

    All participant folders are listed with retrieve_files(), labelled with
    append_PHQ_Binary(), and sampled with get_balanced_subset(). The result is
    a plain list taken from the 'ID' column of that sample.
    """
    labelled = append_PHQ_Binary(retrieve_files())
    balanced = get_balanced_subset(labelled, percentage, random_state)
    return balanced['ID'].to_list()

In [150]:
def get_id(percentage = 0, random_state = 42):
    """Return the PHQ_Binary label of each sampled participant, indexed by ID.

    The IDs come from get_subset(percentage, random_state). An empty ID list
    is falsy, so retrieve_files() applies no folder filter in that case.
    """
    chosen_ids = get_subset(percentage, random_state)
    labelled = append_PHQ_Binary(retrieve_files(folder_ids=chosen_ids))
    return labelled.set_index('ID')


get_id(0.1)

Unnamed: 0_level_0,PHQ_Binary
ID,Unnamed: 1_level_1
386,1
388,1
421,1
476,0
413,1
366,0
322,0
391,0
353,1
443,0


In [146]:
def get_text(percentage = 0, random_state = 42):
    """Return the preprocessed transcript features of the sampled participants.

    Only 'TRANSCRIPT.csv' is requested from get_feature_subset_df(). The
    resulting frame is indexed by participant ID.
    """
    chosen_ids = get_subset(percentage, random_state)
    transcripts = get_feature_subset_df(required_files=['TRANSCRIPT.csv'], folder_ids=chosen_ids)
    return transcripts.set_index('ID')


get_text(0.1)


Unnamed: 0_level_0,TRANSCRIPT_text
ID,Unnamed: 1_level_1
386,synch introv4confirmation hi im ellie thanks f...
388,sync introv4confirmation hi im ellie thanks fo...
421,synch introv4confirmation hi im ellie thanks f...
476,synch introv4confirmation hi im ellie thanks f...
413,sync introv4confirmation hi im ellie thanks fo...
366,synch introv4confirmation hi im ellie thanks f...
322,hi im ellie thanks for coming in today i was c...
391,sync introv4confirmation hi im ellie thanks fo...
353,hi im ellie thanks for coming in today i was c...
443,synch introv4confirmation hi im ellie thanks f...


In [147]:
def get_audio(percentage = 0, random_state = 42):
    """Return down-sampled audio features indexed by (ID, TIMESTAMP).

    FORMANT and COVAREP features are loaded for the sampled participants. The
    FORMANT timestamp (read as seconds) becomes the single TIMESTAMP column,
    and every other timestamp column is dropped. Features are then averaged
    per participant over 33.3311 ms bins, and the bin timestamps are rounded
    to 10 ms.
    """
    audio_test = [
        #'AUDIO.wav',
        'FORMANT.csv',
        'COVAREP.csv'
    ]

    df = get_feature_subset_df(required_files=audio_test, folder_ids=get_subset(percentage, random_state))
    # Handle duplicate timestamp columns: keep FORMANT's as the reference.
    timestamps = pd.to_timedelta(df['FORMANT_timestamp'], unit='s')
    df = df.drop(columns=[col for col in df.columns if 'timestamp' in col])
    df['TIMESTAMP'] = timestamps
    df = df.set_index(['ID', 'TIMESTAMP'])
    # down-sampling the high frequency audio data to match the low frequency facial data for multi-modality
    df = df.groupby('ID').resample('33.3311ms', level='TIMESTAMP').mean()
    # Rounding TIMESTAMP for consistency between audio & face data.
    # .dt.round is vectorised; it replaces a per-element Python lambda.
    df = df.reset_index()
    df['TIMESTAMP'] = df['TIMESTAMP'].dt.round('10ms')

    return df.set_index(['ID', 'TIMESTAMP'])


#get_audio()


In [148]:
def get_face(percentage = 0, random_state = 42):
    """Return facial features indexed by (ID, TIMESTAMP).

    The CLNF gaze, AUs, features, pose and features3D files are loaded for the
    sampled participants. Columns whose name contains 'frame', 'confidence' or
    'success' are removed. The gaze timestamp (read as seconds and rounded to
    10 ms) becomes the single TIMESTAMP column, and every other timestamp
    column is dropped.
    """
    face_test = [
        'CLNF_gaze.txt',
        'CLNF_AUs.txt',
        #'CLNF_hog.bin',
        'CLNF_features.txt',
        'CLNF_pose.txt',
        'CLNF_features3D.txt'
    ]

    df = get_feature_subset_df(required_files=face_test, folder_ids=get_subset(percentage, random_state))
    # Get rid of unnecessary columns
    df = df.drop(columns=[col for col in df.columns if
                          any(substring in col for substring in ['frame', 'confidence', 'success'])])
    # Handle duplicate timestamp columns: keep the gaze one as the reference.
    # Rounding TIMESTAMP for consistency between audio & face data; .dt.round
    # is vectorised and replaces a per-element Python lambda.
    timestamps = pd.to_timedelta(df['CLNFgaze_timestamp'], unit='s').dt.round('10ms')
    df = df.drop(columns=[col for col in df.columns if 'timestamp' in col])
    df['TIMESTAMP'] = timestamps

    # Index once, after rounding; the earlier set_index/reset_index round trip
    # had no effect on the final frame.
    return df.set_index(['ID', 'TIMESTAMP'])


#get_face()

In [149]:
def sliding_window(df, user_id_col, feature_cols, window_size):
    """Replace the feature columns by their per-user rolling mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. ``user_id_col`` may be a regular column or the name of an
        index level.
    user_id_col : str
        Key that identifies a user; windows never span two users.
    feature_cols : list of str
        Columns to smooth.
    window_size : int
        Number of consecutive rows per window. The first ``window_size - 1``
        rows of every user are NaN because their window is incomplete.

    Returns
    -------
    pandas.DataFrame
        Same index and row order as ``df``. Contains the smoothed feature
        columns followed by every other column of ``df`` unchanged. When
        ``user_id_col`` is a regular column it is not copied into the result;
        when it is an index level it stays in the index.
        NOTE(review): confirm that leaving the user-id column out is intended.
    """
    def calculate_rolling_mean(group):
        return group.rolling(window=window_size).mean()

    # transform() returns a frame aligned to df's own index on every pandas
    # version. groupby.apply() plus reset_index(level=0) depended on whether
    # apply prepends the group key, which changed between pandas releases.
    result_df = df.groupby(user_id_col)[feature_cols].transform(calculate_rolling_mean)

    # Carry over the columns that were neither smoothed nor used as the key.
    for col in df.columns:
        if col != user_id_col and col not in feature_cols:
            result_df[col] = df[col]

    return result_df