In [93]:
import os
from collections.abc import Iterator

import h5py
import numpy as np
from scipy.signal import decimate

In [106]:
VERBOSE = 0  # constant for debugging, higher number means more printing
# Files whose window number (the last '_'-separated field of the file name)
# is <= this value go to the training split; all others go to validation.
TRAIN_VAL_SPLIT = 6
# Decimation factor handed to downsample_and_save when preprocessing.
DOWNSAMPLING_FACTOR = 10

In [94]:
def log(message: tuple, verbose_true: int, verbose_min):
    """Print the entries of ``message`` on one line if verbosity permits.

    Nothing is printed unless ``verbose_true >= verbose_min``. Every entry is
    followed by a single space (including the last one) and the line is
    terminated by a newline.
    """
    if not verbose_true >= verbose_min:
        return
    line = ''.join(str(entry) + ' ' for entry in message)
    print(line)

In [95]:
def load_data(filename):
    """Open an .h5 file read-only and return the contents stored under its first key."""
    with h5py.File(filename, 'r') as h5_file:
        first_key = next(iter(h5_file.keys()))
        entry = h5_file[first_key]
        # Indexing with an empty tuple reads the whole entry into memory,
        # so the result stays valid after the file is closed.
        return entry[()]

In [96]:
def data_loader(input_dir: str) -> Iterator[tuple[np.ndarray, str]]:
    """Yield ``(data, filename)`` for every .h5 file in a directory.

    Args:
        input_dir: Directory to scan (not recursive). Entries whose name does
            not end in ``.h5`` are skipped.

    Yields:
        A pair of the data returned by ``load_data`` for the file and the
        file's base name (without the directory part).

    Files are visited in sorted name order so that anything built from the
    stream (concatenations, stacked samples) is reproducible across runs and
    platforms.
    """
    # os.listdir returns entries in arbitrary order, hence the explicit sort.
    for file in sorted(os.listdir(input_dir)):
        if not file.endswith('.h5'):
            continue
        yield load_data(os.path.join(input_dir, file)), file

In [97]:
def create_data_file(data, output_dir, filename: str):
    """Write ``data`` to ``output_dir/filename`` as a single-dataset .h5 file.

    The output directory is created if it does not exist. The dataset inside
    the file is named after ``filename`` with its last extension removed.
    """
    os.makedirs(output_dir, exist_ok=True)
    dataset_name = filename.rsplit('.', maxsplit=1)[0]
    target = os.path.join(output_dir, filename)
    with h5py.File(target, 'w') as h5_file:
        h5_file.create_dataset(dataset_name, data=data)

In [98]:
def get_file_info(filename: str) -> tuple[str, int]:
    """Extract the task label and window number from a data file name.

    The last extension is dropped and the remainder is split on ``'_'``.
    The third-from-last field is returned as the task and the last field,
    converted to ``int``, as the window number; the field in between is
    ignored.
    """
    stem = filename.rsplit('.', 1)[0]
    fields = stem.split('_')
    task = fields[-3]
    window = int(fields[-1])
    return task, window

### Preprocessing

In [99]:
def downsample(data, factor):
    """Reduce the sampling rate of ``data`` along axis 1 by ``factor``.

    Delegates to ``scipy.signal.decimate`` with its default filter settings.
    """
    reduced = decimate(data, q=factor, axis=1)
    return reduced

In [100]:
def get_means(data) -> np.ndarray:
    """Return the mean over axis 1, keeping that axis with length 1.

    Keeping the reduced axis lets the result broadcast against ``data``.
    """
    row_means = np.mean(a=data, axis=1, keepdims=True)
    return row_means

In [101]:
def get_stds(data) -> np.ndarray:
    """Return the standard deviation over axis 1, keeping that axis with length 1.

    Uses NumPy's default (population, ``ddof=0``) standard deviation.
    """
    row_stds = np.std(a=data, axis=1, keepdims=True)
    return row_stds

In [102]:
def z_score_normalize(data, means, stds):
    """Z-score normalization: ``(data - means) / stds`` with broadcasting.

    Args:
        data: Values to normalize.
        means: Means to subtract; must broadcast against ``data``.
        stds: Standard deviations to divide by; must broadcast against
            ``data``.

    Returns:
        The normalized data. Where a standard deviation is exactly zero
        (a constant row), the divisor is replaced by 1 so the result is the
        centered value instead of NaN/inf.
    """
    stds = np.asarray(stds)
    # Guard against division by zero; ones_like keeps the dtype of stds.
    safe_stds = np.where(stds == 0, np.ones_like(stds), stds)
    return (data - means) / safe_stds

In [103]:
def downsample_and_save(input_dir, output_dir, factor, verbose):
    """Downsample the data of every .h5 file in a directory and save the results.

    Args:
        input_dir: Directory containing the source .h5 files.
        output_dir: Directory the downsampled files are written to.
        factor: Downsampling factor passed to ``downsample``.
        verbose: Verbosity level; with ``verbose >= 1`` one line is printed
            per processed file, with 0 nothing is printed.

    Each output file is named ``ds_<original file name>``.
    """
    for data, file in data_loader(input_dir):
        reduced = downsample(data, factor)
        out_name = f"ds_{file}"
        create_data_file(reduced, output_dir, out_name)
        log(("Downsampled", file, "->", out_name,
             np.shape(data), "->", np.shape(reduced)), verbose, 1)

In [104]:
def normalize_and_save(input_dir, output_dir, means, stds, verbose):
    """Z-score normalize the data of every .h5 file in a directory and save the results.

    Args:
        input_dir: Directory containing the source .h5 files.
        output_dir: Directory the normalized files are written to.
        means: Means passed to ``z_score_normalize``.
        stds: Standard deviations passed to ``z_score_normalize``.
        verbose: Verbosity level; with ``verbose >= 1`` one line is printed
            per processed file, with 0 nothing is printed.

    Each output file is named ``norm_<original file name>``.
    """
    for data, file in data_loader(input_dir):
        normalized = z_score_normalize(data, means, stds)
        out_name = f"norm_{file}"
        create_data_file(normalized, output_dir, out_name)
        log(("Normalized", file, "->", out_name), verbose, 1)

In [105]:
def concat_together(input_dir):
    """
    Concatenate data from files into a single time window of sensor readings.
    Also split to a train and validation sets.

    Files whose window number (see ``get_file_info``) is at most
    ``TRAIN_VAL_SPLIT`` are joined along axis 1 into the training array; all
    other files are joined along axis 1 into the validation array.

    Returns:
        ``(train_data, val_data)``. An element is ``None`` when no file fell
        into that split.
    """
    train_parts = []
    val_parts = []
    for data, file in data_loader(input_dir):
        _, window = get_file_info(file)
        if window <= TRAIN_VAL_SPLIT:
            train_parts.append(data)
        else:
            # Validation data is kept strictly separate from training data.
            val_parts.append(data)
    # Concatenate once per split instead of re-copying the growing array on
    # every file; np.concatenate is available in all NumPy versions.
    train_data = np.concatenate(train_parts, axis=1) if train_parts else None
    val_data = np.concatenate(val_parts, axis=1) if val_parts else None
    return train_data, val_data

In [107]:
# Downsample every .h5 file in Intra/train by DOWNSAMPLING_FACTOR and write
# the results (prefixed with "ds_") to Intra/train_ds.
downsample_and_save('Intra/train', 'Intra/train_ds', DOWNSAMPLING_FACTOR, VERBOSE)

In [108]:
# Only the training split (windows <= TRAIN_VAL_SPLIT) is used for the
# normalization statistics; the validation part is discarded here.
data_train, _ = concat_together('Intra/train_ds')
# Statistics are reduced over axis 1 and keep that axis for broadcasting.
means, stds = get_means(data_train), get_stds(data_train)

In [111]:
# Normalize every downsampled file (training and validation windows alike)
# with the training-split statistics and write them to Intra/train_ds_norm.
normalize_and_save('Intra/train_ds', 'Intra/train_ds_norm', means, stds, VERBOSE)

### Training

In [112]:
def concat_as_samples(input_dir):
    """
    Build sample arrays in which every data file is one sample.

    Files whose window number is at most ``TRAIN_VAL_SPLIT`` become training
    samples; all remaining files become validation samples. Each group is
    stacked along a new leading axis.

    Returns a ``(train_samples, val_samples)`` pair of arrays.
    """
    train_bucket, val_bucket = [], []
    for data, file in data_loader(input_dir):
        window = get_file_info(file)[1]
        bucket = train_bucket if window <= TRAIN_VAL_SPLIT else val_bucket
        bucket.append(data)
    stacked_train = np.stack(train_bucket)
    stacked_val = np.stack(val_bucket)
    return stacked_train, stacked_val

In [113]:
# Stack the normalized files into per-split sample arrays (one sample per
# file along the leading axis) and print their shapes as a sanity check.
data_train, data_val = concat_as_samples('Intra/train_ds_norm')
print(np.shape(data_train))
print(np.shape(data_val))

(24, 248, 3563)
(8, 248, 3563)
