In [57]:
import os
from collections.abc import Iterator

import h5py
import numpy as np
from scipy.signal import decimate

In [58]:
def log(message: tuple, verbose_true: int, verbose_min):
    """Print the entries of ``message`` when the verbosity level allows it.

    Nothing is printed unless ``verbose_true >= verbose_min``. Otherwise every
    entry is written followed by a single space, and the line is terminated
    with a newline.
    """
    if verbose_true < verbose_min:
        return
    # Each entry carries its own trailing space, matching print(entry, end=' ').
    line = ''.join(str(entry) + ' ' for entry in message)
    print(line)

In [59]:
def load_data(filename):
    """Load the content stored under the first key of an .h5 file.

    Args:
        filename: Path of the HDF5 file, opened read-only.

    Returns:
        The object stored under the first key reported by the file, read
        fully into memory via ``[()]``.

    Raises:
        ValueError: If the file contains no keys at all.
    """
    with h5py.File(filename, 'r') as f:
        # A bare next() would raise StopIteration on an empty file; when this
        # function is called from inside a generator, that surfaces as an
        # opaque RuntimeError instead of pointing at the offending file.
        first_key = next(iter(f.keys()), None)
        if first_key is None:
            raise ValueError(f"no datasets found in {filename!r}")
        data = f[first_key][()]
    return data

In [76]:
def data_loader(input_dir: str) -> Iterator[tuple[np.ndarray, str]]:
    """Yield ``(data, filename)`` for every .h5 file in a directory.

    Files are visited in sorted name order so that results are reproducible;
    ``os.listdir`` on its own makes no ordering guarantee.

    Args:
        input_dir: Directory to scan. Subdirectories are not descended into.

    Yields:
        The data loaded from the file and the bare file name (without the
        directory part). Entries not ending in ``.h5`` are skipped.
    """
    for file in sorted(os.listdir(input_dir)):
        if not file.endswith('.h5'):
            continue
        path = os.path.join(input_dir, file)
        yield load_data(path), file

In [61]:
def create_data_file(data, output_dir, filename: str):
    """Write ``data`` into ``output_dir/filename`` as a single-dataset .h5 file.

    The output directory is created when it does not exist yet. The dataset
    is named after ``filename`` with everything from its last dot onward
    removed (the whole name when it contains no dot).
    """
    os.makedirs(output_dir, exist_ok=True)
    dataset_name, *_ = filename.rsplit('.', 1)
    target = os.path.join(output_dir, filename)
    with h5py.File(target, 'w') as h5_file:
        h5_file.create_dataset(dataset_name, data=data)

### Preprocessing

In [62]:
def downsample(data, factor):
    """Reduce the sampling rate of ``data`` along axis 1 by ``factor``.

    Delegates to ``scipy.signal.decimate`` and leaves all of its filter
    settings at their defaults.
    """
    decimated = decimate(data, q=factor, axis=1)
    return decimated

In [63]:
def find_means_and_stds(data) -> tuple[np.ndarray, np.ndarray]:
    """Compute the mean and standard deviation of ``data`` along axis 1.

    Both results keep the reduced axis with length 1, so they broadcast
    directly against ``data``.
    """
    reduce_args = {'axis': 1, 'keepdims': True}
    return np.mean(data, **reduce_args), np.std(data, **reduce_args)

In [64]:
def z_score_normalize(data, means, stds):
    """Standardize ``data`` as ``(data - means) / stds``.

    Args:
        data: Values to normalize.
        means: Means to subtract; must broadcast against ``data``.
        stds: Standard deviations to divide by; must broadcast against
            ``data``.

    Returns:
        The normalized values. Wherever ``stds`` is zero the centered value
        ``data - means`` is returned unscaled instead of NaN/inf.
    """
    stds = np.asarray(stds)
    # A constant channel has zero std; dividing by it would fill that channel
    # with NaN/inf. Use a unit divisor there and leave other entries untouched.
    safe_stds = np.where(stds == 0, np.ones_like(stds), stds)
    return (data - means) / safe_stds

In [65]:
def downsample_and_save(input_dir, output_dir, factor, verbose):
    """Downsample the data of each .h5 file and save it to a new file.

    Args:
        input_dir: Directory whose .h5 files are read through ``data_loader``.
        output_dir: Directory that receives the results.
        factor: Downsampling factor handed to ``downsample``.
        verbose: Verbosity level; at 1 or higher every written file is
            reported through ``log``.

    Each output file is named ``ds_<original file name>``.
    """
    for data, file in data_loader(input_dir):
        data = downsample(data, factor)
        out_name = f"ds_{file}"
        create_data_file(data, output_dir, out_name)
        # Silent at verbosity 0, so the default run prints nothing.
        log(("Downsampled", file, "->", out_name, np.shape(data)), verbose, 1)

In [66]:
def normalize_and_save(input_dir, output_dir, means, stds, verbose):
    """Z-score normalize the data of each .h5 file and save it to a new file.

    Args:
        input_dir: Directory whose .h5 files are read through ``data_loader``.
        output_dir: Directory that receives the results.
        means: Means handed to ``z_score_normalize``.
        stds: Standard deviations handed to ``z_score_normalize``.
        verbose: Verbosity level; at 1 or higher every written file is
            reported through ``log``.

    Each output file is named ``norm_<original file name>``.
    """
    for data, file in data_loader(input_dir):
        data = z_score_normalize(data, means, stds)
        out_name = f"norm_{file}"
        create_data_file(data, output_dir, out_name)
        # Silent at verbosity 0, so the default run prints nothing.
        log(("Normalized", file, "->", out_name), verbose, 1)

In [67]:
def concat_together(input_dir):
    """Concatenate the data of all .h5 files in a directory along axis 1.

    Args:
        input_dir: Directory whose .h5 files are read through ``data_loader``.

    Returns:
        A single array with every file's data joined along axis 1, or
        ``None`` when the directory yields no .h5 files.
    """
    # Collect first and join once: concatenating pairwise inside the loop
    # re-copies the growing array for every additional file.
    chunks = [data for data, _ in data_loader(input_dir)]
    if not chunks:
        return None
    # np.concatenate is available in every NumPy release, whereas the
    # np.concat alias only exists from NumPy 2.0 on.
    return np.concatenate(chunks, axis=1)

In [69]:
# Verbosity level handed to downsample_and_save and normalize_and_save.
VERBOSE = 0  # constant for debugging, higher number means more printing

In [70]:
# Downsample every .h5 file in 'Intra/train' by a factor of 10 and write the
# results to 'Intra/train_ds'.
downsample_and_save('Intra/train', 'Intra/train_ds', 10, VERBOSE)

In [71]:
# Join all downsampled training files along axis 1, then compute the means and
# stds along that axis for use in normalization.
data_train = concat_together('Intra/train_ds')
means, stds = find_means_and_stds(data_train)

In [72]:
# Normalize each downsampled file with the training-set means and stds and
# write the results to 'Intra/train_ds_norm'.
normalize_and_save('Intra/train_ds', 'Intra/train_ds_norm', means, stds, VERBOSE)

### Training

In [88]:
def concat_as_samples(input_dir, max_train_index=6):
    """
    Stack the data of each file as one sample and split into train and validation sets.

    The split is decided by the integer that follows the last underscore in
    the part of the file name before its first dot (e.g. ``..._7.h5`` -> 7).

    Args:
        input_dir: Directory whose .h5 files are read through ``data_loader``.
        max_train_index: Files whose index is at most this value go to the
            training set; all remaining files go to the validation set.

    Returns:
        Tuple ``(train, val)``; each array stacks its samples along a new
        leading axis.

    Raises:
        ValueError: If a file name does not end in an integer index, or if
            either split ends up empty (``np.stack`` needs at least one array).
    """
    samples_train = []
    samples_val = []
    for data, file in data_loader(input_dir):
        # Parse into a separate name rather than rebinding ``file`` to an int.
        index = int(file.split('.')[0].split('_')[-1])
        target = samples_train if index <= max_train_index else samples_val
        target.append(data)
    return np.stack(samples_train), np.stack(samples_val)

In [89]:
# Build the train/validation sample arrays from the normalized files and print
# their shapes.
data_train, data_val = concat_as_samples('Intra/train_ds_norm')
print(np.shape(data_train))
print(np.shape(data_val))

(24, 248, 3563)
(8, 248, 3563)
