In [1]:
%run UTILS.ipynb

import pandas as pd
import numpy as np
import struct

In [2]:
# File-name suffixes of the facial-feature files to look up for each session.
face_files = [
    'CLNF_gaze.txt',
    'CLNF_AUs.txt',
    'CLNF_hog.bin', #TODO: 300 is a .txt file, not a .bin file
    'CLNF_features.txt',
    'CLNF_pose.txt',
    'CLNF_features3D.txt',
]

# NOTE(review): retrieve_files and base_directory are not defined in this
# notebook; presumably they come from `%run UTILS.ipynb` - confirm.
# Judging by the displayed result, the returned frame has an ID column plus one
# path column per entry of face_files; num_folders=1 appears to limit the scan
# to a single session folder - TODO confirm against UTILS.
df_face_files = retrieve_files(base_directory, face_files, num_folders=1)
df_face_files  

Unnamed: 0,ID,CLNF_gaze,CLNF_AUs,CLNF_hog,CLNF_features,CLNF_pose,CLNF_features3D
0,475,data/475_P/475_CLNF_gaze.txt,data/475_P/475_CLNF_AUs.txt,data/475_P/475_CLNF_hog.bin,data/475_P/475_CLNF_features.txt,data/475_P/475_CLNF_pose.txt,data/475_P/475_CLNF_features3D.txt


In [3]:
# READ HOG FILES
def read_hog(filename, batch_size=5000):
    """Read a binary HOG feature file into a DataFrame with one row per frame.

    Layout as decoded here: every frame is stored as three 4-byte integers
    (num_cols, num_rows, num_channels), followed by one float32 validity flag
    and ``num_rows * num_cols * num_channels`` float32 feature values. The
    dimensions are taken from the first frame; later frames are assumed to
    have the same size and their three leading header values are discarded.

    Parameters
    ----------
    filename : str or path-like
        Path of the binary HOG file.
    batch_size : int, default 5000
        Number of frames requested from the file per read (after the first
        frame). It only bounds the size of each read, not the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``'is_valid'`` (float64 flag per frame) and ``'hog_features'``
        (one 1-D float64 array per row).

    Raises
    ------
    ValueError
        If the file is too short to hold the header and the first frame, if
        the header contains a negative dimension, or if the remaining data is
        not a whole number of frames.
    """
    header_format = "3i"
    header_size = struct.calcsize(header_format)
    float_size = np.dtype(np.float32).itemsize

    with open(filename, "rb") as f:
        header = f.read(header_size)
        if len(header) != header_size:
            raise ValueError(
                "HOG file {!r} is too short to contain a frame header".format(filename))
        num_cols, num_rows, num_channels = struct.unpack(header_format, header)
        if min(num_cols, num_rows, num_channels) < 0:
            raise ValueError(
                "HOG file {!r} has a negative dimension in its header: {}".format(
                    filename, (num_cols, num_rows, num_channels)))

        num_values = num_rows * num_cols * num_channels

        # The header of the first frame is already consumed, so what is left
        # of that frame is the validity flag followed by the feature values.
        first_frame_len = 1 + num_values
        first_frame = f.read(first_frame_len * float_size)
        if len(first_frame) != first_frame_len * float_size:
            raise ValueError(
                "HOG file {!r} is truncated inside the first frame".format(filename))
        # Decode as float32 and widen to float64 so the result keeps the dtype
        # produced by the previous struct.unpack-based implementation.
        batches = [
            np.frombuffer(first_frame, dtype=np.float32)
            .astype(np.float64)
            .reshape(1, first_frame_len)
        ]

        # Every later frame is read as 4 + num_values 4-byte items: the three
        # header values (discarded below), the validity flag and the features.
        frame_len = 4 + num_values
        frame_size = frame_len * float_size
        bytes_per_batch = frame_size * batch_size

        while True:
            chunk = f.read(bytes_per_batch)
            if not chunk:
                break
            if len(chunk) % frame_size != 0:
                raise ValueError(
                    "HOG file {!r} is truncated: read {} bytes, which is not a "
                    "multiple of the frame size ({} bytes)".format(
                        filename, len(chunk), frame_size))

            # np.frombuffer decodes the buffer directly instead of building a
            # Python tuple holding one float object per value.
            frames = np.frombuffer(chunk, dtype=np.float32).reshape(-1, frame_len)
            # Drop the per-frame header (num_cols, num_rows, num_channels).
            batches.append(frames[:, 3:].astype(np.float64))

            # A short read means the end of the file was reached.
            if len(chunk) < bytes_per_batch:
                break

    all_feature_vectors = np.concatenate(batches, axis=0)

    # Column 0 is the validity flag, the remaining columns are the features.
    is_valid = all_feature_vectors[:, 0]
    hog_features = all_feature_vectors[:, 1:]

    return pd.DataFrame({
        'is_valid': is_valid,
        'hog_features': list(hog_features)  # one feature array per DataFrame cell
    })

In [4]:
def preprocess_CLNF_gaze(path):
    """Load the CLNF gaze file at ``path`` and return it as a DataFrame.

    The file is parsed with the default ``pd.read_csv`` options, so column
    names are kept exactly as written in the file, including any leading
    space after the comma separator.
    """
    return pd.read_csv(path)

def preprocess_CLNF_AUs(path):
    """Load the CLNF action-unit file at ``path`` and return it as a DataFrame.

    Parsed with the default ``pd.read_csv`` options; no columns are renamed
    or removed here.
    """
    return pd.read_csv(path)

def preprocess_CLNF_hog(path): #TODO: 300 is .txt when it should be .bin
    """Load the binary CLNF HOG file at ``path`` via ``read_hog``.

    The frame returned by ``read_hog`` is passed through unchanged: one row
    per frame with the columns 'is_valid' and 'hog_features'.
    """
    return read_hog(path)

def preprocess_CLNF_features(path):
    """Load the CLNF features file at ``path`` and return it as a DataFrame.

    Parsed with the default ``pd.read_csv`` options; no columns are renamed
    or removed here.
    """
    return pd.read_csv(path)

def preprocess_CLNF_pose(path):
    """Load the CLNF head-pose file at ``path`` and return it as a DataFrame.

    Parsed with the default ``pd.read_csv`` options; no columns are renamed
    or removed here.
    """
    return pd.read_csv(path)

def preprocess_CLNF_features3D(path):
    """Load the CLNF 3-D features file at ``path`` and return it as a DataFrame.

    Parsed with the default ``pd.read_csv`` options, so column names are kept
    exactly as written in the file, including any leading space.
    """
    return pd.read_csv(path)

In [5]:
# Smoke test: read the HOG file of a single session and inspect the result.

clnf_hog = preprocess_CLNF_hog("data/475_P/475_CLNF_hog.bin")
print(clnf_hog)

# TODO: decide how to merge the HOG data with the other face features.
# read_hog returns one row per frame with an 'is_valid' flag and a
# 'hog_features' cell that holds the whole feature array, so the values are
# not available as separate numeric columns.


       is_valid                                       hog_features
0           1.0  [0.4000000059604645, 0.4000000059604645, 0.197...
1           1.0  [0.3999999761581421, 0.3999999761581421, 0.141...
2           1.0  [0.40000003576278687, 0.3326006531715393, 0.14...
3           1.0  [0.40000003576278687, 0.33028799295425415, 0.1...
4           1.0  [0.3999999761581421, 0.36977845430374146, 0.15...
...         ...                                                ...
17612       1.0  [0.2243298888206482, 0.0, 0.04327547177672386,...
17613       1.0  [0.22342944145202637, 0.0, 0.0, 0.0, 0.0, 0.0,...
17614       1.0  [0.23053738474845886, 0.0, 0.10868234187364578...
17615       1.0  [0.22569669783115387, 0.0, 0.0, 0.009045038372...
17616       1.0  [0.22470024228096008, 0.0, 0.0, 0.0, 0.0, 0.0,...

[17617 rows x 2 columns]


In [6]:
from IPython.display import clear_output

# One fully preprocessed frame per session. They are combined once after the
# loop instead of growing df_face on every iteration, which avoids the private
# DataFrame._append method and the repeated copying it causes.
face_frames = []

for i in df_face_files.index:
    # preprocess all the features; every lookup is label-based (.loc) so the
    # paths and the ID always come from the same row of df_face_files
    clnf_gaze = preprocess_CLNF_gaze(df_face_files.loc[i, 'CLNF_gaze']).add_prefix('CLNFgaze_')
    clnf_aus = preprocess_CLNF_AUs(df_face_files.loc[i, 'CLNF_AUs']).add_prefix('CLNFaus_')
    # TODO: add hog later - it is stored as one array per row, not as numeric columns
    # clnf_hog = preprocess_CLNF_hog(df_face_files.loc[i, 'CLNF_hog']).add_prefix('CLNFhog_')
    clnf_features = preprocess_CLNF_features(df_face_files.loc[i, 'CLNF_features']).add_prefix('CLNFfeatures_')
    clnf_pose = preprocess_CLNF_pose(df_face_files.loc[i, 'CLNF_pose']).add_prefix('CLNFpose_')
    clnf_features3d = preprocess_CLNF_features3D(df_face_files.loc[i, 'CLNF_features3D']).add_prefix('CLNFfeatures3D_')

    # concatenate all the features side by side (rows are aligned on the index)
    df_concat = pd.concat([clnf_gaze, clnf_aus, clnf_features, clnf_pose, clnf_features3d], axis=1)

    # tag every row with the ID of the session it came from
    df_concat['ID'] = df_face_files.loc[i, 'ID']

    # add the PHQ binary label
    # NOTE(review): append_PHQ_Binary is defined outside this notebook
    # (presumably in UTILS.ipynb); it must run while the 'ID' column exists - confirm.
    df_concat = append_PHQ_Binary(df_concat)

    # get rid of duplicate / unnecessary columns: anything whose name contains
    # 'frame', 'confidence' or 'success'
    df_concat = df_concat.drop(columns=[col for col in df_concat.columns if any(substring in col for substring in ['frame', 'confidence', 'success'])])
    # keep a single TIMESTAMP column, taken from the gaze file (the source
    # column name contains a leading space), and drop the per-file timestamps
    df_concat['TIMESTAMP'] = df_concat['CLNFgaze_ timestamp']
    df_concat = df_concat.drop(columns=[col for col in df_concat.columns if 'timestamp' in col and col != 'TIMESTAMP'])

    face_frames.append(df_concat)

    clear_output()
    print("finished processing file", i)
clear_output()

# pd.concat raises on an empty list, so fall back to an empty frame when no
# session was processed.
df_face = pd.concat(face_frames, ignore_index=True) if face_frames else pd.DataFrame()
df_face


Unnamed: 0,CLNFgaze_ x_0,CLNFgaze_ y_0,CLNFgaze_ z_0,CLNFgaze_ x_1,CLNFgaze_ y_1,CLNFgaze_ z_1,CLNFgaze_ x_h0,CLNFgaze_ y_h0,CLNFgaze_ z_h0,CLNFgaze_ x_h1,...,CLNFfeatures3D_ Z61,CLNFfeatures3D_ Z62,CLNFfeatures3D_ Z63,CLNFfeatures3D_ Z64,CLNFfeatures3D_ Z65,CLNFfeatures3D_ Z66,CLNFfeatures3D_ Z67,ID,PHQ_Binary,TIMESTAMP
0,0.019020,0.102879,-0.994512,-0.166677,0.134669,-0.976772,-0.203146,0.427972,-0.880665,-0.375722,...,661.085,658.375,656.659,663.261,655.168,656.344,659.501,475,0,0.000000
1,0.010488,0.105077,-0.994409,-0.169342,0.144812,-0.974860,-0.205663,0.437989,-0.875139,-0.372074,...,660.457,657.665,655.841,662.499,654.080,655.411,658.638,475,0,0.033333
2,-0.017352,0.085372,-0.996198,-0.108857,0.120194,-0.986764,-0.232787,0.427253,-0.873650,-0.316472,...,657.360,654.482,652.691,659.181,650.928,652.349,655.616,475,0,0.066667
3,-0.018266,0.086938,-0.996046,-0.103383,0.119994,-0.987377,-0.236072,0.427561,-0.872618,-0.313786,...,656.710,653.820,652.018,658.557,650.212,651.638,654.912,475,0,0.100000
4,-0.015493,0.086983,-0.996089,-0.103319,0.119921,-0.987393,-0.232449,0.428643,-0.873059,-0.312821,...,656.515,653.652,651.869,658.403,650.050,651.455,654.694,475,0,0.133333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17612,0.012627,0.151015,-0.988451,-0.129446,0.132676,-0.982670,-0.277152,0.403833,-0.871840,-0.409767,...,663.700,659.822,657.028,661.173,656.726,659.404,663.441,475,0,587.067000
17613,0.013936,0.150270,-0.988547,-0.128132,0.141222,-0.981651,-0.279752,0.404866,-0.870530,-0.409672,...,663.162,659.305,656.482,660.629,656.209,658.897,662.923,475,0,587.100000
17614,0.025188,0.154982,-0.987596,-0.131980,0.141585,-0.981089,-0.265609,0.410126,-0.872495,-0.410603,...,664.279,660.413,657.604,661.697,657.413,660.099,664.129,475,0,587.133000
17615,0.014218,0.143477,-0.989551,-0.139226,0.136743,-0.980774,-0.279082,0.401677,-0.872221,-0.418339,...,663.436,659.615,656.822,661.135,656.738,659.388,663.372,475,0,587.167000


In [ ]:
# Index the combined frame by TIMESTAMP. The guard makes the cell safe to
# re-run: once TIMESTAMP has become the index it is no longer a column, and a
# second set_index call would raise a KeyError.
if 'TIMESTAMP' in df_face.columns:
    df_face = df_face.set_index('TIMESTAMP')
