In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import re

In [2]:
labels = pd.read_csv("empathy_scores.csv", encoding="ISO-8859-1")
labels = labels[['Participant nr', 'Total Score original']]

avg_score = labels['Total Score original'].mean()

labels['Total Score original'] = labels['Total Score original'].apply(lambda x: 1 if x > avg_score else 0)

labels = labels.rename(columns={"Participant nr":"Participant", "Total Score original":"Label"})

In [3]:
dataframes = []
data_labels = []
max_length = 10000
data_path = 'data/grey_blue'
filename_pattern = re.compile(r"grey_blue_participant_(\d+)_trial_(\d+)")


# Making a list of dataframes and adding labels
for filename in os.listdir(data_path):
    match = filename_pattern.match(filename)
    if match:
        participant_id = int(match.group(1))
        matched_rows = labels.loc[labels['Participant'] == participant_id]
        if len(matched_rows) > 0:
            label = matched_rows.iloc[0].values
            data_labels.append(label)
        else:
            print(f"Warning: no matching row found for participant {participant_id}")
        
    df = pd.read_csv(os.path.join(data_path, filename))
    dataframes.append(df)

In [4]:
df = dataframes[1]
df[['Validity right']]

Unnamed: 0,Validity right
0,1
1,1
2,1
3,1
4,1
...,...
7201,1
7202,1
7203,1
7204,1


In [5]:
data_labels = pd.DataFrame(data_labels)
data_labels.shape

(86, 2)

In [6]:
data_labels = data_labels[[1]]

In [7]:
data_labels

Unnamed: 0,1
0,0
1,0
2,0
3,0
4,1
...,...
81,0
82,0
83,0
84,0


In [8]:
num_files = len(dataframes)
# Number of features is based on the first DataFrame's columns
num_features = dataframes[0].shape[1] if num_files > 0 else 0

print(num_files)
print(num_features)

86
40


In [9]:
cleaned_arrays = []
fill_value = 0
for df in dataframes:
    arr = df.to_numpy()  # shape: (time, features)
    T, F = arr.shape
    if T >= max_length:
        arr = arr[:max_length, :]
    elif T < max_length:
        padded = np.full((max_length, F), fill_value, dtype=arr.dtype)
        padded[:T, :] = arr
        arr = padded
    cleaned_arrays.append(arr)


In [10]:
data_tensor = np.stack(cleaned_arrays, axis=0)
print(data_tensor.shape)

(86, 10000, 40)


**Making all the sliding windows**

In [34]:
def apply_sliding_window(data, window_size=100, step_size=50):
    num_samples, time_length, num_features = data.shape
    windows = []

    for sample in range(num_samples):
        for start in range(0, time_length - window_size + 1, step_size):
            windows.append(data[sample, start:start + window_size, :])

    return np.array(windows)

window_size = 500
step_size = 50

# Apply sliding window
windowed_data = apply_sliding_window(data_tensor, window_size, step_size)
print(windowed_data.shape)  # Expected: (86 * num_windows, 100, 40)


(16426, 500, 40)


In [35]:
# 70/30 train test split
num_train = int(0.7 * windowed_data.shape[0])

data_train = windowed_data[:num_train, :, :] # :43 if train and val
data_train.shape

(11498, 500, 40)

In [36]:
data_test = windowed_data[num_train:, :, :]

data_test.shape

(4928, 500, 40)

In [37]:
np.save("data/tensors/data_train.npy", data_train)
np.save("data/tensors/data_test.npy", data_test)

In [39]:
num_windows_per_sample = int(((data_tensor.shape[1] - window_size) / step_size) + 1)
expanded_labels = np.repeat(data_labels, num_windows_per_sample, axis=0)

In [42]:
Y_train = expanded_labels[:num_train]
Y_test = expanded_labels[num_train:]

In [43]:
np.save("labels/Y_train.npy", Y_train)
np.save("labels/Y_test.npy", Y_test)