In [3]:
import pandas as pd
import numpy as np
import os
import sys

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [5]:
def preprocess_data(x_train, y_train, x_test, y_test):
    """
    One-hot encode the labels and give univariate inputs an explicit channel axis.

    The input values themselves are not scaled or normalized; the only change
    made to them is an optional reshape.

    Args:
        x_train: Training inputs (array-like). When this is 2-D it is treated
            as univariate and a trailing axis of length 1 is appended.
        y_train: Training labels (array-like); flattened to a single column
            before encoding.
        x_test: Test inputs (array-like). Reshaped together with x_train, i.e.
            the decision is based on the rank of x_train only.
        y_test: Test labels (array-like); flattened to a single column before
            encoding.

    Returns:
        Tuple (x_train, y_train, x_test, y_test, enc) where y_train and y_test
        are dense one-hot arrays and enc is the fitted OneHotEncoder.
    """
    # Accept any array-like input (lists, etc.); ndarrays pass through as-is.
    x_train, x_test = np.asarray(x_train), np.asarray(x_test)
    y_train, y_test = np.asarray(y_train), np.asarray(y_test)

    # Transform the labels into one-hot vectors. The encoder is fitted on the
    # union of train and test labels so both splits share the same columns.
    enc = OneHotEncoder(categories='auto')
    enc.fit(np.concatenate((y_train, y_test), axis=0).reshape(-1, 1))
    # transform() output is converted to a dense array via toarray()
    y_train = enc.transform(y_train.reshape(-1, 1)).toarray()
    y_test = enc.transform(y_test.reshape(-1, 1)).toarray()

    if x_train.ndim == 2:  # if univariate
        # add a dimension to make it multivariate with one dimension
        x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
        x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1))

    return x_train, y_train, x_test, y_test, enc

In [6]:
# Where the saved numpy splits of the chosen dataset live.
cur_root_dir = "/home/fmg2/v-thanh/Code/datasets"
archive_name = "NILMArchive_2025"
dataset_name = "iAWE"
root_dir_dataset = f"{cur_root_dir}/{archive_name}/{dataset_name}/train_test_np/"

# Read the four saved arrays, in order: train inputs, train labels,
# test inputs, test labels.
x_train, y_train, x_test, y_test = (
    np.load(root_dir_dataset + file_name)
    for file_name in (
        'X_train_target.npy',
        'y_train_target.npy',
        'X_test_target.npy',
        'y_test_target.npy',
    )
)
print(f"Shape of dataset: {x_train.shape}, {y_train.shape}, {x_test.shape}, {y_test.shape}")

# Join the train and test parts back together along the first axis so the
# data can be re-split further down.
X_total = np.concatenate([x_train, x_test], axis=0)
y_total = np.concatenate([y_train, y_test], axis=0)
print(f"Shape of total dataset: {X_total.shape}, {y_total.shape}")

Shape of dataset: (3149291, 120, 5), (3149291,), (787323, 120, 5), (787323,)
Shape of total dataset: (3936614, 120, 5), (3936614,)


In [7]:
# One-hot encode the labels of the loaded split (and add a channel axis if the
# inputs are 2-D); keep the fitted encoder for later use.
(
    preprocessed_x_train,
    preprocessed_y_train,
    preprocessed_x_test,
    preprocessed_y_test,
    enc,
) = preprocess_data(x_train, y_train, x_test, y_test)

In [None]:
# Display the data with index 2 of the last axis left out (keeps 0, 1, 3, 4).
# The selection is only shown as the cell output; it is not assigned.
X_total[..., [0, 1, 3, 4]]

In [None]:
# Divide the full dataset into a target part (80%) and a surrogate part (20%).
# The split is stratified on the labels and seeded.
x_target, x_surro, y_target, y_surro = train_test_split(
    X_total,
    y_total,
    test_size=0.2,
    random_state=42,
    stratify=y_total,
)

print(f"Shape of the target data: {x_target.shape}, {y_target.shape}")
print(f"Shape of the surrogate data: {x_surro.shape}, {y_surro.shape}")

In [None]:
# Split the target subset (x_target / y_target, produced by the
# target-vs-surrogate split) into train and test parts. Splitting X_total /
# y_total here would write the entire dataset, surrogate samples included,
# into the target files.
x_train_target, x_test_target, y_train_target, y_test_target = train_test_split(
    x_target, y_target, test_size=0.2, random_state=42
)
print(f"Shape of the target train data: {x_train_target.shape}, {y_train_target.shape}")
print(f"Shape of the target test data: {x_test_target.shape}, {y_test_target.shape}")

# Save the target data
# NOTE(review): the loading cell reads these same *_target.npy files to build
# X_total, so running the whole notebook again after this save would re-split
# data that has already been split. Confirm which files hold the unsplit
# dataset before re-running.
np.save(root_dir_dataset + 'X_train_target.npy', x_train_target)
np.save(root_dir_dataset + 'y_train_target.npy', y_train_target)
np.save(root_dir_dataset + 'X_test_target.npy', x_test_target)
np.save(root_dir_dataset + 'y_test_target.npy', y_test_target)

In [None]:
# Break the surrogate subset into its own train (80%) and test (20%) parts,
# using a fixed seed.
x_train_surro, x_test_surro, y_train_surro, y_test_surro = train_test_split(
    x_surro,
    y_surro,
    test_size=0.2,
    random_state=42,
)
print(f"Shape of the surrogate train data: {x_train_surro.shape}, {y_train_surro.shape}")
print(f"Shape of the surrogate test data: {x_test_surro.shape}, {y_test_surro.shape}")

# Write the four surrogate arrays next to the other splits of this dataset.
np.save(root_dir_dataset + 'X_train_surro.npy', x_train_surro)
np.save(root_dir_dataset + 'y_train_surro.npy', y_train_surro)
np.save(root_dir_dataset + 'X_test_surro.npy', x_test_surro)
np.save(root_dir_dataset + 'y_test_surro.npy', y_test_surro)