# Purpose of this file: create datasets of various sizes by removing certain sensors, for the experiment exploring the impact of misalignment with a decreased number of sensors

In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
import gc

# -----------------------------
# Purpose:
# This script loads and preprocesses the REALDISP dataset from Google Drive.
# It performs the following steps:
# 1. Loads sensor data from 17 subjects and splits it into train, validation, and test sets.
# 2. Removes rows labeled as '0' (non-activity/background).
# 3. Converts data to float and organizes it into DataFrames.
# 4. Extracts features (excluding timestamps and labels).
# 5. Removes quaternion-related columns.
# -----------------------------

# Mount Google Drive so the subject log files under /content/drive are readable
drive.mount('/content/drive')

# One row container per dataset split
train_data = []
valid_data = []
test_data = []

# Walk subjects 1 through 17 and route every kept row to its split
for i in range(1, 18):
    # Subjects 1-11 -> train, 12-14 -> validation, 15-17 -> test
    if i < 12:
        target_rows = train_data
    elif i < 15:
        target_rows = valid_data
    else:
        target_rows = test_data

    file_path = f"/content/drive/My Drive/PROJECT/REALDISP/subject{i}_ideal.log"
    with open(file_path, "r") as log_file:
        # Keep only rows whose last whitespace-separated field (the label) is not '0'
        target_rows.extend(
            fields for fields in map(str.split, log_file) if fields[-1] != "0"
        )

# Turn the collected string rows into float DataFrames, one per split
df_train = pd.DataFrame(train_data).astype(float)
df_valid = pd.DataFrame(valid_data).astype(float)
df_test = pd.DataFrame(test_data).astype(float)

# Report the size of each split
for split_name, frame in (
    ("Train data shape:", df_train),
    ("Validation data shape:", df_valid),
    ("Test data shape:", df_test),
):
    print(split_name, frame.shape)

# Step 4: Separate features from labels for every split.
# Features are all columns except the first two and the last one
# (timestamp and label columns excluded); the label is the last column.
train_features, train_labels = df_train.iloc[:, 2:-1], df_train.iloc[:, -1]

valid_features, valid_labels = df_valid.iloc[:, 2:-1], df_valid.iloc[:, -1]

test_features, test_labels = df_test.iloc[:, 2:-1], df_test.iloc[:, -1]

# -----------------------------
# Step 5: Identify and remove quaternion columns
# The feature columns repeat in 13-column blocks; the 4 columns starting at
# position 9 of each block are the quaternion features to be dropped.
n_feature_cols = train_features.shape[1]
quat_offset = 9    # Position of the first quaternion column (0-based)
block_width = 13   # Distance between consecutive quaternion groups
quat_width = 4     # Consecutive columns dropped per group

# Positional indices of every quaternion column that lies inside the frame
columns_to_remove = [
    col
    for block_start in range(quat_offset, n_feature_cols + 1, block_width)
    for col in range(block_start, block_start + quat_width)
    if col < n_feature_cols
]

# Drop the same column positions from all three splits
train_features, valid_features, test_features = (
    frame.drop(columns=frame.columns[columns_to_remove])
    for frame in (train_features, valid_features, test_features)
)

# Final shapes for verification
for split_name, frame in (
    ("Train features shape:", train_features),
    ("Validation features shape:", valid_features),
    ("Test features shape:", test_features),
):
    print(split_name, frame.shape)

Mounted at /content/drive
(435064, 120)
(153507, 120)
(123251, 120)
(435064, 81)
(153507, 81)
(123251, 81)


#Remove Sensors

In [None]:
# Sensors (1-based indices, 1 to 9) to drop for each dataset variant

dataset_1_sensors_to_remove = [2, 7, 9]  # Right and Left Upper Arms, Left Thigh
dataset_2_sensors_to_remove = [3, 6, 8]  # Right and Left Lower Arms, Right Calf

# Datasets 3 and 4 drop exactly the sensors that datasets 1 and 2 keep
dataset_3_sensors_to_remove = [
    sensor for sensor in range(1, 10) if sensor not in dataset_1_sensors_to_remove
]  # Opposite to dataset 1
dataset_4_sensors_to_remove = [
    sensor for sensor in range(1, 10) if sensor not in dataset_2_sensors_to_remove
]  # Opposite to dataset 2

In [None]:
def remove_sensors(features, sensor_indices, sensor_length=9):
    """
    Removes the columns of the specified sensors from a 2-D feature array.

    Sensor k (1-based) occupies the contiguous columns
    [(k - 1) * sensor_length, k * sensor_length).

    Args:
        features (array-like): 2-D feature data of shape (n_samples, n_columns),
            e.g. (n_samples, 81) for 9 sensors with 9 features each.
        sensor_indices (iterable of int): 1-based sensor indices to remove.
        sensor_length (int): Number of consecutive columns per sensor (default 9).

    Returns:
        np.ndarray: Copy of the features without the columns of the given sensors.

    Raises:
        IndexError: If a sensor index is smaller than 1 or larger than the
            number of whole sensors that fit into the feature columns.
    """
    features = np.asarray(features)

    # Number of complete sensors present in the given feature columns
    num_sensors = features.shape[1] // sensor_length

    # Convert sensor indices (1-based) to column indices
    cols_to_remove = []
    for sensor in sensor_indices:
        # Reject out-of-range values explicitly: an index <= 0 would otherwise
        # turn into negative column indices and silently delete columns
        # counted from the end of the array.
        if not 1 <= sensor <= num_sensors:
            raise IndexError(
                f"sensor index {sensor} is out of range 1..{num_sensors}"
            )
        start_col = (sensor - 1) * sensor_length
        cols_to_remove.extend(range(start_col, start_col + sensor_length))

    # Remove selected sensor columns
    return np.delete(features, cols_to_remove, axis=1)

In [None]:
# Drop the dataset-1 sensor columns from every split
train_features, valid_features, test_features = (
    remove_sensors(split, dataset_1_sensors_to_remove)
    for split in (train_features, valid_features, test_features)
)

In [None]:
# Show the shape of each split after sensor removal
for split in (train_features, valid_features, test_features):
    print(split.shape)

(435064, 54)
(153507, 54)
(123251, 54)


#Segment and shuffle data

In [None]:
def create_variable_size_windows(features, labels, min_size=96, max_size=128):
    """
    Cuts the data into consecutive windows, shuffles the windows, and flattens them back.

    Starting at row 0, a window length is drawn uniformly from
    [min_size, max_size] for each window. Windows do not overlap. Rows left
    over at the end that do not fit the drawn window length are discarded.
    The order of the rows inside a window is preserved; only the order of
    the windows is shuffled.

    Args:
        features (array-like): 2-D data of shape (n_samples, n_features)
            (NumPy array, DataFrame, ...).
        labels (array-like): Per-row labels with n_samples entries.
        min_size (int): Smallest window length, must be at least 1.
        max_size (int): Largest window length (inclusive).

    Returns:
        tuple: (shuffled_features, shuffled_labels) as float32 NumPy arrays.
        If no window fits, arrays of shape (0, n_features) and (0,) are returned.

    Raises:
        ValueError: If min_size is smaller than 1 (a zero-length window would
            never advance through the data).
    """
    if min_size < 1:
        raise ValueError("min_size must be at least 1")

    # Accept DataFrames / Series / lists as well as NumPy arrays
    features = np.asarray(features)
    labels = np.asarray(labels)

    n_samples, n_features = features.shape

    # Step 1: Determine the (start, stop) row bounds of each window.
    # "<=" lets a final window that fits exactly be used as well.
    bounds = []
    i = 0
    while i + min_size <= n_samples:
        # Randomly choose a window size between min_size and max_size
        window_size = np.random.randint(min_size, max_size + 1)

        # Ensure we do not exceed dataset length
        if i + window_size > n_samples:
            break

        bounds.append((i, i + window_size))

        # Move to the next window
        i += window_size

    # Nothing to shuffle: return correctly shaped empty outputs
    if not bounds:
        return (
            np.empty((0, n_features), dtype=np.float32),
            np.empty((0,), dtype=np.float32),
        )

    # Step 2: Shuffle the window order.
    # Slicing by bounds avoids building dtype=object arrays, which box every
    # value when all windows happen to have the same length.
    order = np.random.permutation(len(bounds))

    # Step 3: Concatenate the windows in shuffled order back into flat arrays
    shuffled_features = np.concatenate(
        [features[bounds[k][0]:bounds[k][1], :] for k in order], axis=0
    )
    shuffled_labels = np.concatenate(
        [labels[bounds[k][0]:bounds[k][1]] for k in order]
    )

    return shuffled_features.astype(np.float32), shuffled_labels.astype(np.float32)

# Segment and shuffle every split with fixed-length windows of 128 rows.
# Requires train/valid/test features and labels from the earlier cells.
window_kwargs = {"min_size": 128, "max_size": 128}

shuffled_train_features, shuffled_train_labels = create_variable_size_windows(
    train_features, train_labels, **window_kwargs
)
shuffled_valid_features, shuffled_valid_labels = create_variable_size_windows(
    valid_features, valid_labels, **window_kwargs
)
shuffled_test_features, shuffled_test_labels = create_variable_size_windows(
    test_features, test_labels, **window_kwargs
)

# Aliases under which the shuffled data is used from here on
X_train_normal, y_train_normal = shuffled_train_features, shuffled_train_labels
X_valid, y_valid = shuffled_valid_features, shuffled_valid_labels
X_test, y_test = shuffled_test_features, shuffled_test_labels

In [None]:
# Show feature and label shapes of every split after windowing
for data, targets in (
    (X_train_normal, y_train_normal),
    (X_valid, y_valid),
    (X_test, y_test),
):
    print(data.shape)
    print(targets.shape)

(434944, 54)
(434944,)
(153472, 54)
(153472,)
(123136, 54)
(123136,)


In [None]:
# Drop every intermediate name that is no longer needed, then run the
# garbage collector so the memory can be reclaimed

del (
    shuffled_train_features, shuffled_train_labels,
    shuffled_valid_features, shuffled_valid_labels,
    shuffled_test_features, shuffled_test_labels,
    train_features, train_labels,
    valid_features, valid_labels,
    test_features, test_labels,
    df_train, df_valid, df_test,
)

gc.collect()

192

# Save processed dataset with required number of sensors

In [None]:
from scipy.io import savemat

# Bundle the processed splits under the keys stored in the .mat file.
# The training arrays are named X_train_normal / y_train_normal in this file;
# the previously referenced X_train / y_train are never defined and raised
# a NameError.
data_dict = {
    "trainData": X_train_normal,
    "valData": X_valid,
    "testData": X_test,
    "trainLabels": y_train_normal,
    "valLabels": y_valid,
    "testLabels": y_test,
}

# Write all arrays into a single compressed MATLAB file
savemat("REALDISP_6_SENSOR_SET.mat", data_dict, do_compression=True)