# Parsing       

In [3]:
import os
import pandas as pd
# Define the directory containing the subfolders
base_directory = 'data'

# Function to retrieve the necessary files
def retrieve_files(base_dir, required_files):
    # Create a list to store the data for the DataFrame
    data = []

    # Iterate through each subfolder in the base directory
    for subfolder in os.listdir(base_dir):
        subfolder_path = os.path.join(base_dir, subfolder)

        # Ensure the path is a directory
        if os.path.isdir(subfolder_path):
            # Create a dictionary to store the file paths for the current subfolder
            formatted_subfolder = subfolder.split('_')[0] #getting rid of _P
            subfolder_files = {'ID': formatted_subfolder}

            # Iterate through each required file
            for file_name in required_files:
                file_path = os.path.join(subfolder_path, f"{subfolder[:3]}_{file_name}")

                # Check if the file exists and add it to the dictionary
                formatted_file_name = file_name.split('.')[0]
                subfolder_files[formatted_file_name] = file_path if os.path.exists(file_path) else None

            # Append the dictionary to the data list
            data.append(subfolder_files)

    # Create a DataFrame from the data list
    df = pd.DataFrame(data)
    return df

In [4]:
#phq_path = 'testing/full_test_split.csv'
phq_paths = [
    'testing/dev_split_Depression_AVEC2017.csv', 
    'testing/full_test_split.csv',
    'testing/test_split_Depression_AVEC2017.csv',
    'testing/train_split_Depression_AVEC2017.csv'
]

def append_PHQ_Binary(df):
    # TODO: make sure to go through all of the testing files
    for path in phq_paths:
        phq_df = pd.read_csv(path)
        phq_df = phq_df[['Participant_ID', 'PHQ_Binary']]
    
        df = df.merge(phq_df, how='left', left_on='ID', right_on='Participant_ID')
        df.drop(columns=['Participant_ID'], inplace=True)
        #df.dropna(subset=['PHQ_Binary'], inplace=True)
    return df

# Preprocessing text

In [88]:
text_files = [
    'TRANSCRIPT.csv'
]

df_text_files = retrieve_files(base_directory, text_files)
df_text_files

Unnamed: 0,ID,TRANSCRIPT
0,475,data/475_P/475_TRANSCRIPT.csv
1,386,data/386_P/386_TRANSCRIPT.csv
2,361,data/361_P/361_TRANSCRIPT.csv
3,492,data/492_P/492_TRANSCRIPT.csv
4,414,data/414_P/414_TRANSCRIPT.csv
...,...,...
184,464,data/464_P/464_TRANSCRIPT.csv
185,420,data/420_P/420_TRANSCRIPT.csv
186,334,data/334_P/334_TRANSCRIPT.csv
187,441,data/441_P/441_TRANSCRIPT.csv


In [89]:
import re

def preprocess_TRANSCRIPT(path):
    df = pd.read_csv(path, delimiter='\t')
    # Combine all rows of the 'value' column into one string
    transcript = ' '.join(df['value'].astype(str).tolist())
    # Clean the text
    transcript = re.sub(r'[^\w\s]', '', transcript.lower())
    return transcript


In [89]:
df_text = []

for i in df_text_files:
    # preprocess all the features
    transcript = preprocess_TRANSCRIPT(i.TRANSCRIPT)

    # concatenate all the features and add ID 
    df_concat['ID'] = df_text_files['ID']

    # add PHQ binary
    df_concat = append_PHQ_Binary(df_concat)
    
    # append
    df_text.append(df_concat)
    
df_text

# Preprocessing audio

In [90]:
audio_files = [
    'AUDIO.wav',
    'FORMANT.csv',
    'COVAREP.csv'
]

df_audio_files = retrieve_files(base_directory, audio_files)
df_audio_files

Unnamed: 0,ID,AUDIO,FORMANT,COVAREP
0,475,data/475_P/475_AUDIO.wav,data/475_P/475_FORMANT.csv,data/475_P/475_COVAREP.csv
1,386,data/386_P/386_AUDIO.wav,data/386_P/386_FORMANT.csv,data/386_P/386_COVAREP.csv
2,361,data/361_P/361_AUDIO.wav,data/361_P/361_FORMANT.csv,data/361_P/361_COVAREP.csv
3,492,data/492_P/492_AUDIO.wav,data/492_P/492_FORMANT.csv,data/492_P/492_COVAREP.csv
4,414,data/414_P/414_AUDIO.wav,data/414_P/414_FORMANT.csv,data/414_P/414_COVAREP.csv
...,...,...,...,...
184,464,data/464_P/464_AUDIO.wav,data/464_P/464_FORMANT.csv,data/464_P/464_COVAREP.csv
185,420,data/420_P/420_AUDIO.wav,data/420_P/420_FORMANT.csv,data/420_P/420_COVAREP.csv
186,334,data/334_P/334_AUDIO.wav,data/334_P/334_FORMANT.csv,data/334_P/334_COVAREP.csv
187,441,data/441_P/441_AUDIO.wav,data/441_P/441_FORMANT.csv,data/441_P/441_COVAREP.csv


In [91]:
import numpy as np
import librosa


def preprocess_AUDIO(path):
    y, sr = librosa.load(path, sr=None)

    # Extract various features
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)

    # Aggregate the features
    mfccs_mean = np.mean(mfccs, axis=1)
    chroma_mean = np.mean(chroma, axis=1)
    zcr_mean = np.mean(zcr, axis=1)

    a = np.concatenate((mfccs_mean, chroma_mean, zcr_mean), axis=0)
    return 0

def preprocess_FORMANT(path):
    # formant_features = pd.read_csv(path, delimiter='\t')
    # return formant_features


def preprocess_COVAREP(path):
    # TODO: I probably need to use the COVAREP library for this
    # covarep_features = pd.read_csv(path, delimiter='\t')
    # return covarep_features

In [91]:
df_audio = []

for i in df_audio_files:
    # preprocess all the features
    audio = preprocess_AUDIO(i.AUDIO)
    formant = preprocess_FORMANT(i.FORMANT)
    covarep = preprocess_COVAREP(i.COVAREP)

    # concatenate all the features and add ID 
    df_concat = pd.concat([audio, formant, covarep], axis=1)
    df_concat['ID'] = df_audio_files['ID']
    
    # add PHQ binary
    df_concat = append_PHQ_Binary(df_concat)
    
    # append
    df_audio.append(df_concat)

df_audio

In [26]:
import pandas as pd
        
df = pd.read_csv('data/300_P/300_FORMANT.csv', header=None)
FORMANT_FEATURES = ['F1', 'F2', 'F3', 'F4', 'F5']
df.columns = FORMANT_FEATURES
df


Unnamed: 0,F1,F2,F3,F4,F5
0,753.36,2405.3,3081.5,3865.9,4280.8
1,812.41,2203.2,3121.1,3598.4,4116.2
2,827.90,2176.6,3047.9,3523.0,4095.6
3,832.05,2090.4,2807.2,3303.3,4184.8
4,882.31,2097.5,2636.9,3078.0,4321.1
...,...,...,...,...,...
64845,457.00,1473.0,2597.5,3160.0,3629.0
64846,500.00,1527.5,2562.5,3261.5,4059.0
64847,523.50,1566.5,2461.0,3429.5,4566.5
64848,531.00,1531.5,2406.0,3554.5,4679.5


In [27]:
import pandas as pd
        
df = pd.read_csv('data/300_P/300_COVAREP.csv', header=None)
# something is off here
COVAREP_FEATURES = [
    'F0',
    'VUV',
    'NAQ',
    'QOQ',
    'H1H2',
    'PSP',
    'MDQ',
    'peakSlope',
    'Rd',
    'Rd_conf',
    
    'MCEP_0',
    'MCEP_1',
    'MCEP_2',
    'MCEP_3',
    'MCEP_4',
    'MCEP_5',
    'MCEP_6',
    'MCEP_7',
    'MCEP_8',
    'MCEP_9',
    'MCEP_10',
    'MCEP_11',
    'MCEP_12',
    'MCEP_13',
    'MCEP_14',
    'MCEP_15',
    'MCEP_16',
    'MCEP_17',
    'MCEP_18',
    'MCEP_19',
    'MCEP_20',
    'MCEP_21',
    'MCEP_22',
    'MCEP_23',
    'MCEP_24',

    'HMPDM_0',
    'HMPDM_1',
    'HMPDM_2',
    'HMPDM_3',
    'HMPDM_4',
    'HMPDM_5',
    'HMPDM_6',
    'HMPDM_7',
    'HMPDM_8',
    'HMPDM_9',
    'HMPDM_10',
    'HMPDM_11',
    'HMPDM_12',
    'HMPDM_13',
    'HMPDM_14',
    'HMPDM_15',
    'HMPDM_16',
    'HMPDM_17',
    'HMPDM_18',
    'HMPDM_19',
    'HMPDM_20',
    'HMPDM_21',
    'HMPDM_22',
    'HMPDM_23',
    'HMPDM_24',

    'HMPDD_0',
    'HMPDD_1',
    'HMPDD_2',
    'HMPDD_3',
    'HMPDD_4',
    'HMPDD_5',
    'HMPDD_6',
    'HMPDD_7',
    'HMPDD_8',
    'HMPDD_9',
    'HMPDD_10',
    'HMPDD_11',
    'HMPDD_12',
    'HMPDD_13',
]
df.columns = COVAREP_FEATURES
df

Unnamed: 0,F0,VUV,NAQ,QOQ,H1H2,PSP,MDQ,peakSlope,Rd,Rd_conf,...,HMPDD_4,HMPDD_5,HMPDD_6,HMPDD_7,HMPDD_8,HMPDD_9,HMPDD_10,HMPDD_11,HMPDD_12,HMPDD_13
0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0,0.0,0.0,0.0,0.0,0.0,-0.30805,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0,0.0,0.0,0.0,0.0,0.0,-0.30553,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0,0.0,0.0,0.0,0.0,0.0,-0.32064,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64846,0.0,0,0.0,0.0,0.0,0.0,0.0,-0.44373,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64847,0.0,0,0.0,0.0,0.0,0.0,0.0,-0.44239,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64848,0.0,0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64849,0.0,0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Preprocessing face

In [92]:
face_files = [
    'CLNF_gaze.txt',
    'CLNF_AUs.txt',
    'CLNF_hog.bin', #TODO: 300 is a .txt file, not a .bin file
    'CLNF_features.txt',
    'CLNF_pose.txt',
    'CLNF_features3D.txt',
]

df_face_files = retrieve_files(base_directory, face_files)
df_face_files

Unnamed: 0,ID,CLNF_gaze,CLNF_AUs,CLNF_hog,CLNF_features,CLNF_pose,CLNF_features3D
0,475,data/475_P/475_CLNF_gaze.txt,data/475_P/475_CLNF_AUs.txt,data/475_P/475_CLNF_hog.bin,data/475_P/475_CLNF_features.txt,data/475_P/475_CLNF_pose.txt,data/475_P/475_CLNF_features3D.txt
1,386,data/386_P/386_CLNF_gaze.txt,data/386_P/386_CLNF_AUs.txt,data/386_P/386_CLNF_hog.bin,data/386_P/386_CLNF_features.txt,data/386_P/386_CLNF_pose.txt,data/386_P/386_CLNF_features3D.txt
2,361,data/361_P/361_CLNF_gaze.txt,data/361_P/361_CLNF_AUs.txt,data/361_P/361_CLNF_hog.bin,data/361_P/361_CLNF_features.txt,data/361_P/361_CLNF_pose.txt,data/361_P/361_CLNF_features3D.txt
3,492,data/492_P/492_CLNF_gaze.txt,data/492_P/492_CLNF_AUs.txt,data/492_P/492_CLNF_hog.bin,data/492_P/492_CLNF_features.txt,data/492_P/492_CLNF_pose.txt,data/492_P/492_CLNF_features3D.txt
4,414,data/414_P/414_CLNF_gaze.txt,data/414_P/414_CLNF_AUs.txt,data/414_P/414_CLNF_hog.bin,data/414_P/414_CLNF_features.txt,data/414_P/414_CLNF_pose.txt,data/414_P/414_CLNF_features3D.txt
...,...,...,...,...,...,...,...
184,464,data/464_P/464_CLNF_gaze.txt,data/464_P/464_CLNF_AUs.txt,data/464_P/464_CLNF_hog.bin,data/464_P/464_CLNF_features.txt,data/464_P/464_CLNF_pose.txt,data/464_P/464_CLNF_features3D.txt
185,420,data/420_P/420_CLNF_gaze.txt,data/420_P/420_CLNF_AUs.txt,data/420_P/420_CLNF_hog.bin,data/420_P/420_CLNF_features.txt,data/420_P/420_CLNF_pose.txt,data/420_P/420_CLNF_features3D.txt
186,334,data/334_P/334_CLNF_gaze.txt,data/334_P/334_CLNF_AUs.txt,data/334_P/334_CLNF_hog.bin,data/334_P/334_CLNF_features.txt,data/334_P/334_CLNF_pose.txt,data/334_P/334_CLNF_features3D.txt
187,441,data/441_P/441_CLNF_gaze.txt,data/441_P/441_CLNF_AUs.txt,data/441_P/441_CLNF_hog.bin,data/441_P/441_CLNF_features.txt,data/441_P/441_CLNF_pose.txt,data/441_P/441_CLNF_features3D.txt


In [38]:
def preprocess_CLNF_gaze(path):
    gaze_df = pd.read_csv(path)
    return gaze_df

def preprocess_CLNF_AUs(path):
    au_df = pd.read_csv(path)
    return au_df

def preprocess_CLNF_hog(path):
    with open(path, 'rb') as f:
        curr_data = []
        curr_ind = 0

        while True:
            if curr_ind == 0:
                num_cols = np.fromfile(f, dtype=np.int32, count=1)
                if num_cols.size == 0:
                    break

                num_rows = np.fromfile(f, dtype=np.int32, count=1)[0]
                num_chan = np.fromfile(f, dtype=np.int32, count=1)[0]

                curr_ind += 1

                if curr_ind == 1:
                    curr_data = np.zeros((1000, 1 + num_rows * num_cols * num_chan))
                    num_feats = 1 + num_rows * num_cols * num_chan

                if curr_ind > curr_data.shape[0]:
                    curr_data = np.vstack([curr_data, np.zeros((1000, num_feats))])

                feature_vec = np.fromfile(f, dtype=np.float32, count=1 + num_rows * num_cols * num_chan)
                curr_data[curr_ind - 1, :] = feature_vec
            else:
                feature_vec = np.fromfile(f, dtype=np.float32, count=(4 + num_rows * num_cols * num_chan) * 5000)
                feature_vec = feature_vec.reshape((-1, 4 + num_rows * num_cols * num_chan))[:, 4:]

                num_rows_read = feature_vec.shape[0]

                if feature_vec.size > 0:
                    if curr_ind + num_rows_read > curr_data.shape[0]:
                        curr_data = np.vstack([curr_data, np.zeros((num_rows_read, num_feats))])
                    curr_data[curr_ind:curr_ind + num_rows_read, :] = feature_vec
                    curr_ind += num_rows_read
                else:
                    break

        curr_data = curr_data[:curr_ind, :]

    # Convert to DataFrame
    df = pd.DataFrame(curr_data)
    
    return df


def preprocess_CLNF_features(path):
    features_df = pd.read_csv(path)
    return features_df

def preprocess_CLNF_pose(path):
    pose_df = pd.read_csv(path)
    return pose_df

def preprocess_CLNF_features3D(path):
    features3D_df = pd.read_csv(path)
    return features3D_df

In [93]:
df_face = []

for i in df_face_files:
    # preprocess all the features
    clnf_gaze = preprocess_CLNF_gaze(i.CLNF_gaze)
    clnf_aus = preprocess_CLNF_AUs(i.CLNF_AUs)
    clnf_hog = preprocess_CLNF_hog(i.CLNF_hog)
    clnf_features = preprocess_CLNF_features(i.CLNF_features)
    clnf_pose = preprocess_CLNF_pose(i.CLNF_pose)
    clnf_features3d = preprocess_CLNF_features3D(i.CLNF_features3D)
    
    # concatenate all the features and add ID 
    df_concat = pd.concat([clnf_gaze, clnf_aus, clnf_hog, clnf_features, clnf_pose, clnf_features3d], axis=1)
    df_concat['ID'] = df_face_files['ID']
    
    # add PHQ binary
    df_concat = append_PHQ_Binary(df_concat)
    
    # append
    df_face.append(df_concat)
    
df_face


In [1]:
# READ HOG FILES

import numpy as np
import struct
import pandas as pd

def read_hog(filename, batch_size=5000):
    all_feature_vectors = []
    with open(filename, "rb") as f:
        num_cols, = struct.unpack("i", f.read(4))
        num_rows, = struct.unpack("i", f.read(4))
        num_channels, = struct.unpack("i", f.read(4))

        # The first four bytes encode a boolean value whether the frame is valid
        num_features = 1 + num_rows * num_cols * num_channels
        feature_vector = struct.unpack("{}f".format(num_features), f.read(num_features * 4))
        feature_vector = np.array(feature_vector).reshape((1, num_features))
        all_feature_vectors.append(feature_vector)

        # Every frame contains a header of four float values: num_cols, num_rows, num_channels, is_valid
        num_floats_per_feature_vector = 4 + num_rows * num_cols * num_channels
        # Read in batches of given batch_size
        num_floats_to_read = num_floats_per_feature_vector * batch_size
        # Multiply by 4 because of float32
        num_bytes_to_read = num_floats_to_read * 4

        while True:
            bytes = f.read(num_bytes_to_read)
            # For comparison how many bytes were actually read
            num_bytes_read = len(bytes)
            if num_bytes_read == 0:
                break

            assert num_bytes_read % 4 == 0, "Number of bytes read does not match with float size"
            num_floats_read = num_bytes_read // 4
            assert num_floats_read % num_floats_per_feature_vector == 0, "Number of bytes read does not match with feature vector size"
            num_feature_vectors_read = num_floats_read // num_floats_per_feature_vector

            feature_vectors = struct.unpack("{}f".format(num_floats_read), bytes)
            # Convert to array
            feature_vectors = np.array(feature_vectors).reshape((num_feature_vectors_read, num_floats_per_feature_vector))
            # Discard the first three values in each row (num_cols, num_rows, num_channels)
            feature_vectors = feature_vectors[:, 3:]
            # Append to list of all feature vectors that have been read so far
            all_feature_vectors.append(feature_vectors)

            if num_bytes_read < num_bytes_to_read:
                break

        # Concatenate batches
        all_feature_vectors = np.concatenate(all_feature_vectors, axis=0)

        # Split into is-valid and feature vectors
        is_valid = all_feature_vectors[:, 0]
        hog_features = all_feature_vectors[:, 1:]

        # Create DataFrame
        df = pd.DataFrame({
            'is_valid': is_valid,
            'hog_features': list(hog_features)  # Store each row as a list in a DataFrame cell
        })

        return df
    
# Example usage
df = read_hog('data/301_P/301_CLNF_hog.bin')
print(df)

       is_valid                                       hog_features
0           1.0  [0.2149130403995514, 0.29190343618392944, 0.40...
1           1.0  [0.1428118348121643, 0.2792850732803345, 0.400...
2           1.0  [0.16635194420814514, 0.2612675428390503, 0.40...
3           1.0  [0.10758522897958755, 0.2900768220424652, 0.40...
4           1.0  [0.08714132010936737, 0.2694597542285919, 0.40...
...         ...                                                ...
24716      -1.0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
24717      -1.0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
24718      -1.0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
24719      -1.0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
24720      -1.0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...

[24721 rows x 2 columns]
