In [8]:
import pandas as pd
import numpy as np
import struct

def read_hog(filename, batch_size=5000):
    """Read a binary HOG feature file into a DataFrame with one row per frame.

    File layout, as parsed below: every frame is stored as three native-endian
    int32 values (num_cols, num_rows, num_channels) followed by
    ``1 + num_rows * num_cols * num_channels`` float32 values -- an is-valid
    flag and then the HOG features. The dimensions are taken from the first
    frame and are assumed to be identical for every later frame.

    Parameters
    ----------
    filename : str or path-like
        Path of the binary HOG file.
    batch_size : int, default 5000
        Number of frames read from disk per iteration. Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        float64 columns ``hog_feature_0 .. hog_feature_{n-1}`` followed by an
        ``is_valid`` column, one row per frame in file order.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1, if the file is empty or ends in
        the middle of a frame, or if the header holds negative dimensions.
    """
    # A zero batch would make f.read(0) return b"" and look like end-of-file,
    # silently dropping every frame after the first one.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got {!r}".format(batch_size))

    frame_batches = []
    with open(filename, "rb") as f:
        header = f.read(12)
        if len(header) != 12:
            raise ValueError("HOG file is empty or has a truncated header: {}".format(filename))
        num_cols, num_rows, num_channels = struct.unpack("3i", header)
        if min(num_cols, num_rows, num_channels) < 0:
            raise ValueError(
                "HOG header contains negative dimensions: cols={}, rows={}, channels={}".format(
                    num_cols, num_rows, num_channels))

        # The first float of a frame is the is-valid flag, the rest are features.
        num_features = 1 + num_rows * num_cols * num_channels
        first_frame = f.read(num_features * 4)
        if len(first_frame) != num_features * 4:
            raise ValueError("HOG file ends in the middle of the first frame: {}".format(filename))
        frame_batches.append(
            np.frombuffer(first_frame, dtype=np.float32).reshape(1, num_features))

        # Later frames are decoded as a flat run of float32 values, so the three
        # int32 header fields of each frame occupy the first three slots of a
        # row; their (meaningless) float interpretation is discarded below.
        num_floats_per_feature_vector = 3 + num_features
        num_bytes_per_feature_vector = 4 * num_floats_per_feature_vector
        num_bytes_to_read = num_bytes_per_feature_vector * batch_size

        while True:
            chunk = f.read(num_bytes_to_read)
            if not chunk:
                break
            if len(chunk) % num_bytes_per_feature_vector != 0:
                raise ValueError("Number of bytes read does not match with feature vector size")

            # frombuffer decodes the raw bytes without creating one Python
            # float object per value, unlike struct.unpack.
            frames = np.frombuffer(chunk, dtype=np.float32).reshape(
                -1, num_floats_per_feature_vector)
            frame_batches.append(frames[:, 3:])

            # A short read means the end of the file has been reached.
            if len(chunk) < num_bytes_to_read:
                break

    # Concatenate while still float32 and widen once; float32 -> float64 is
    # exact, so the values match what struct.unpack would have produced.
    all_feature_vectors = np.concatenate(frame_batches, axis=0).astype(np.float64)

    # Split into is-valid and feature vectors
    is_valid = all_feature_vectors[:, 0]
    hog_features = all_feature_vectors[:, 1:]

    # One column per HOG feature, with is_valid appended as the last column
    feature_columns = [f'hog_feature_{i}' for i in range(hog_features.shape[1])]
    df = pd.DataFrame(hog_features, columns=feature_columns)
    df['is_valid'] = is_valid

    return df

In [9]:
# read_hog was taken from GitHub, so it is treated as correct here.
# TODO: work out what HOG actually is, how the features are structured,
# and what can be done with them.
#
# Plan: drop the is_valid column and keep everything else as is.
# NOTE(review): read_hog returns is_valid as a column, so it is NOT removed
# by default -- it has to be dropped explicitly.

hog = read_hog("/Users/karlo/College/Diplomski/Code/multimodal_depression_detection/DepressionDetection/data/301_P/301_CLNF_hog.bin")
# Synthetic timestamps, one per frame, spaced 33.333 ms apart (30 fps).
# NOTE(review): 0.033333 is a truncated 1/30, so the last timestamps lag true
# 30 fps time by a few milliseconds; use np.arange(len(hog)) / 30 if exact
# spacing matters.
hog['timestamp'] = pd.Series(0.033333 * np.arange(len(hog)))

hog

Unnamed: 0,hog_feature_0,hog_feature_1,hog_feature_2,hog_feature_3,hog_feature_4,hog_feature_5,hog_feature_6,hog_feature_7,hog_feature_8,hog_feature_9,...,hog_feature_4456,hog_feature_4457,hog_feature_4458,hog_feature_4459,hog_feature_4460,hog_feature_4461,hog_feature_4462,hog_feature_4463,is_valid,timestamp
0,0.214913,0.291903,0.4,0.280950,0.4,0.168004,0.001957,0.002418,0.012968,0.026675,...,0.0,0.0,0.0,0.0,0.09428,0.003171,0.001390,0.000563,1.0,0.000000
1,0.142812,0.279285,0.4,0.285664,0.4,0.000000,0.012717,0.001853,0.004988,0.008050,...,0.0,0.0,0.0,0.0,0.09428,0.003286,0.001372,0.000551,1.0,0.033333
2,0.166352,0.261268,0.4,0.329642,0.4,0.009248,0.004123,0.002451,0.000000,0.022759,...,0.0,0.0,0.0,0.0,0.04714,0.000599,0.000273,0.000087,1.0,0.066666
3,0.107585,0.290077,0.4,0.273153,0.4,0.000887,0.001638,0.001351,0.010300,0.007058,...,0.0,0.0,0.0,0.0,0.04714,0.000754,0.000288,0.000089,1.0,0.099999
4,0.087141,0.269460,0.4,0.153291,0.4,0.000000,0.002502,0.001225,0.004605,0.010587,...,0.0,0.0,0.0,0.0,0.04714,0.000643,0.000202,0.000065,1.0,0.133332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24716,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,-1.0,823.858428
24717,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,-1.0,823.891761
24718,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,-1.0,823.925094
24719,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,-1.0,823.958427
