In [1]:
import os
import h5py
import time
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
from matplotlib import gridspec
%matplotlib inline

In [2]:
# N-CMAPSS source files, in the order in which they are merged.
files = [
    "../../data/N-CMAPSS_DS01-005.h5",
    "../../data/N-CMAPSS_DS02-006.h5",
    "../../data/N-CMAPSS_DS03-012.h5",
    "../../data/N-CMAPSS_DS04.h5",
    "../../data/N-CMAPSS_DS05.h5",
    "../../data/N-CMAPSS_DS06.h5",
    "../../data/N-CMAPSS_DS07.h5",
    "../../data/N-CMAPSS_DS08a-009.h5",
    "../../data/N-CMAPSS_DS08c-008.h5",
]

In [3]:
def extract_columns(filename):
    """Return the variable names stored in an HDF5 file as one flat list.

    The names are read from the ``W_var``, ``X_s_var``, ``X_v_var``, ``T_var``
    and ``A_var`` datasets and concatenated in exactly that order, which is
    the same order `extract_data` uses when it stacks the feature blocks.

    Parameters:
        filename (str): Path of the HDF5 file to read.

    Returns:
        list: Names cast to a fixed-width 20-character unicode dtype.
    """
    name_keys = ('W_var', 'X_s_var', 'X_v_var', 'T_var', 'A_var')

    # Pull every name array out while the file is open; convert afterwards.
    with h5py.File(filename, 'r') as infile:
        raw_names = [np.array(infile.get(key)) for key in name_keys]

    columns = []
    for raw in raw_names:
        # Re-cast to unicode so the names compare equal to plain str values.
        columns += list(np.array(raw, dtype='U20'))
    return columns

def extract_data(filename, get_cols=False):
    """Read the train ("dev") and test splits of one HDF5 file.

    For each split the ``W``, ``X_s``, ``X_v``, ``T`` and ``A`` blocks are
    concatenated column-wise (in that order) into a single feature matrix;
    the ``Y`` block is returned separately as the target.

    Parameters:
        filename (str): Path of the HDF5 file to read.
        get_cols (bool): When True, also read the column names through
            `extract_columns`; otherwise "columns" is None.

    Returns:
        dict: {"data": (x_train, x_test, y_train, y_test), "columns": list or None}
    """
    train_keys = ('W_dev', 'X_s_dev', 'X_v_dev', 'T_dev', 'A_dev')
    test_keys = ('W_test', 'X_s_test', 'X_v_test', 'T_test', 'A_test')

    def load_split(h5, feature_keys, target_key):
        # Stack the feature blocks side by side; the target stays separate.
        blocks = tuple(np.array(h5.get(key)) for key in feature_keys)
        target = np.array(h5.get(target_key))
        return np.concatenate(blocks, axis=1), target

    with h5py.File(filename, 'r') as infile:
        x_train, y_train = load_split(infile, train_keys, 'Y_dev')
        x_test, y_test = load_split(infile, test_keys, 'Y_test')

    # Column names are optional because they require a second pass over the file.
    columns = extract_columns(filename) if get_cols else None

    return {"data": (x_train, x_test, y_train, y_test), "columns": columns}

def read_data(files):
    """Load every file and return all rows (train and test) as one dataset.

    `extract_data` returns ``(x_train, x_test, y_train, y_test)``; for each
    file the train rows are followed by the test rows, and files are stacked
    in the order given. Column names are taken from the first file only.

    Parameters:
        files (list of str): HDF5 file paths to read.

    Returns:
        tuple: (X, y, columns) where X and y are row-wise concatenations over
        all files and columns is the list of feature names.

    Raises:
        ValueError: if `files` is empty.
    """
    if not files:
        raise ValueError("read_data needs at least one file")

    columns = None
    X_parts, y_parts = [], []
    for position, filename in enumerate(files):
        extracted = extract_data(filename, get_cols=(position == 0))
        if position == 0:
            columns = extracted["columns"]
        x_train, x_test, y_train, y_test = extracted["data"]
        X_parts += [x_train, x_test]
        y_parts += [y_train, y_test]

    # Concatenate once at the end instead of re-copying the growing array per file.
    X = np.concatenate(X_parts, axis=0)
    y = np.concatenate(y_parts, axis=0)
    return X, y, columns

In [4]:
def merge_files(files, output_file, id_cols=None, chunk_size=10000):
    """
    Merges multiple HDF5 files into one train and one test HDF5 file.

    Each output file holds a resizable "X" dataset (rows x features) and a
    resizable "y" dataset, both float32 and gzip-compressed, plus a "columns"
    attribute with the feature names of the first input file.

    Parameters:
        files (list): List of HDF5 file paths to merge.
        output_file (str): Output path prefix; "_train.h5" and "_test.h5" are appended.
        id_cols (list or None): Optional column names identifying a time series.
            They are only validated against the first file's columns here.
        chunk_size (int): Number of rows per chunk in the output datasets.

    Raises:
        ValueError: if a name in `id_cols` is not a column of the first file.
    """
    def create_outputs(h5_file, n_features, columns):
        # Empty, row-extendable datasets; rows are appended file by file.
        h5_file.create_dataset("X", shape=(0, n_features), maxshape=(None, n_features),
                               dtype='float32', compression="gzip", chunks=(chunk_size, n_features))
        h5_file.create_dataset("y", shape=(0,), maxshape=(None,), dtype='float32',
                               compression="gzip", chunks=(chunk_size,))
        h5_file.attrs["columns"] = np.array(columns, dtype="S")

    def append_rows(dset, rows):
        # Grow the dataset along axis 0 and write the new rows at its end.
        if rows.shape[0] == 0:
            return
        start = dset.shape[0]
        dset.resize(start + rows.shape[0], axis=0)
        dset[start:] = rows

    train_file = output_file + "_train.h5"
    test_file = output_file + "_test.h5"

    # Context managers close both files even if a read fails or the run is interrupted.
    with h5py.File(train_file, "w") as h5_train, h5py.File(test_file, "w") as h5_test:
        for i, filename in enumerate(files):
            print(f"Processing {filename} ({i+1}/{len(files)})...")

            data = extract_data(filename, get_cols=(i == 0))  # Read one file at a time
            X_train, X_test, y_train, y_test = data["data"]
            # atleast_1d keeps a single-row target 1-D after squeeze().
            y_train = np.atleast_1d(y_train.squeeze())
            y_test = np.atleast_1d(y_test.squeeze())

            if i == 0:
                columns = data["columns"]
                if id_cols:
                    missing = [col for col in id_cols if col not in columns]
                    if missing:
                        raise ValueError(f"id_cols not found in columns: {missing}")
                create_outputs(h5_train, X_train.shape[1], columns)
                create_outputs(h5_test, X_test.shape[1], columns)

            # Append incrementally so only one input file is in memory at a time.
            append_rows(h5_train["X"], X_train)
            append_rows(h5_train["y"], y_train)
            append_rows(h5_test["X"], X_test)
            append_rows(h5_test["y"], y_test)


In [5]:
# merge_files requires the id columns that identify one time series.
merge_files(files, "engine_data", id_cols=["unit", "cycle"])

Processing N-CMAPSS_DS01-005.h5 (1/9)...
Processing N-CMAPSS_DS02-006.h5 (2/9)...
Processing N-CMAPSS_DS03-012.h5 (3/9)...
Processing N-CMAPSS_DS04.h5 (4/9)...
Processing N-CMAPSS_DS05.h5 (5/9)...
Processing N-CMAPSS_DS06.h5 (6/9)...
Processing N-CMAPSS_DS07.h5 (7/9)...
Processing N-CMAPSS_DS08a-009.h5 (8/9)...
Processing N-CMAPSS_DS08c-008.h5 (9/9)...


In [6]:
def merge_files(files, output_file, id_cols, chunk_size=10000):
    """
    Merges multiple HDF5 files into a single train/test HDF5 dataset and creates an index CSV file.

    Each output HDF5 file holds a resizable "X" dataset (rows x features) and a
    resizable "y" dataset, both float32 and gzip-compressed, plus a "columns"
    attribute with the feature names of the first input file. The CSV lists,
    per time series, the first and last row position of that series inside
    the merged train or test "X" dataset.

    Parameters:
        files (list): List of HDF5 file paths to merge.
        output_file (str): Output path prefix; "_train.h5", "_test.h5" and
            "_index.csv" are appended.
        id_cols (list): List of column names identifying a time series (e.g., ["unit", "cycle"]).
            Any number of id columns is supported; they become the leading CSV columns.
        chunk_size (int): Number of rows per chunk in the output datasets.

    Raises:
        ValueError: if a name in `id_cols` is not a column of the first file.
    """
    def create_outputs(h5_file, n_features, columns):
        # Empty, row-extendable datasets; rows are appended file by file.
        h5_file.create_dataset("X", shape=(0, n_features), maxshape=(None, n_features),
                               dtype='float32', compression="gzip", chunks=(chunk_size, n_features))
        h5_file.create_dataset("y", shape=(0,), maxshape=(None,), dtype='float32',
                               compression="gzip", chunks=(chunk_size,))
        h5_file.attrs["columns"] = np.array(columns, dtype="S")

    def append_rows(dset, rows):
        # Grow the dataset along axis 0 and write the new rows at its end.
        if rows.shape[0] == 0:
            return
        start = dset.shape[0]
        dset.resize(start + rows.shape[0], axis=0)
        dset[start:] = rows

    def series_spans(X_data, id_idxs, offset, dataset_type):
        # One index row per unique id combination: [*ids, start, stop, dataset_type].
        # start/stop are the min/max row position of the series, shifted by the
        # number of rows already written to the merged dataset.
        if X_data.shape[0] == 0:
            return []
        id_df = pd.DataFrame(X_data[:, id_idxs], columns=id_cols)
        row_pos = pd.Series(np.arange(len(id_df)), index=id_df.index)
        spans = row_pos.groupby([id_df[col] for col in id_cols]).agg(["min", "max"])
        rows = []
        for key, first, last in zip(spans.index, spans["min"], spans["max"]):
            # A single id column yields scalar keys; normalise to a tuple.
            key = key if isinstance(key, tuple) else (key,)
            rows.append([*key, int(first) + offset, int(last) + offset, dataset_type])
        return rows

    train_file = output_file + "_train.h5"
    test_file = output_file + "_test.h5"
    index_data = []  # Rows of (*id values, start, stop, dataset_type)

    # Context managers close both files even if a read fails or the run is interrupted.
    with h5py.File(train_file, "w") as h5_train, h5py.File(test_file, "w") as h5_test:
        for i, filename in enumerate(files):
            print(f"Processing {filename} ({i+1}/{len(files)})...")

            data = extract_data(filename, get_cols=(i == 0))  # Read one file at a time
            X_train, X_test, y_train, y_test = data["data"]
            # atleast_1d keeps a single-row target 1-D after squeeze().
            y_train = np.atleast_1d(y_train.squeeze())
            y_test = np.atleast_1d(y_test.squeeze())

            if i == 0:
                columns = data["columns"]
                # Resolved once: every file is assumed to share the first file's column order.
                id_idxs = [columns.index(col) for col in id_cols]
                create_outputs(h5_train, X_train.shape[1], columns)
                create_outputs(h5_test, X_test.shape[1], columns)

            # Record series positions before appending, using the current dataset length as offset.
            index_data += series_spans(X_train, id_idxs, h5_train["X"].shape[0], "train")
            index_data += series_spans(X_test, id_idxs, h5_test["X"].shape[0], "test")

            # Append incrementally so only one input file is in memory at a time.
            append_rows(h5_train["X"], X_train)
            append_rows(h5_train["y"], y_train)
            append_rows(h5_test["X"], X_test)
            append_rows(h5_test["y"], y_test)

    # ** Save the index file as CSV **
    index_df = pd.DataFrame(index_data, columns=[*id_cols, "start_idx", "stop_idx", "dataset"])
    index_df.to_csv(output_file + "_index.csv", index=False)
    print(f"✅ Index file saved: {output_file}_index.csv")

In [7]:
# Writes engine_train.h5, engine_test.h5 and engine_index.csv.
merge_files(
    files=files,
    output_file="engine",
    id_cols=["unit", "cycle"],
)

Processing N-CMAPSS_DS01-005.h5 (1/9)...
Processing N-CMAPSS_DS02-006.h5 (2/9)...
Processing N-CMAPSS_DS03-012.h5 (3/9)...
Processing N-CMAPSS_DS04.h5 (4/9)...
Processing N-CMAPSS_DS05.h5 (5/9)...
Processing N-CMAPSS_DS06.h5 (6/9)...
Processing N-CMAPSS_DS07.h5 (7/9)...
Processing N-CMAPSS_DS08a-009.h5 (8/9)...
Processing N-CMAPSS_DS08c-008.h5 (9/9)...
✅ Index file saved: engine_index.csv


In [6]:
import numpy as np
import pandas as pd
import h5py

def merge_files_timewindows(
    files,
    output_file,
    id_cols,
    window_size=50,
    overlap=10,
    chunk_size=10000
):
    """
    Merges multiple HDF5 files into train/test windowed HDF5 datasets.
    The output files contain "X" with shape (num_windows, window_size, num_features)
    and "y" with shape (num_windows,), both float32 and gzip-compressed.

    Windows never span two time series: rows are grouped by `id_cols` first
    and windows are cut inside each group. The label of a window is the
    target value of its last row. Groups shorter than `window_size` produce
    no windows.

    Parameters:
        files (list of str): List of HDF5 file paths to merge.
        output_file (str): Basename for output HDF5 files (no ".h5" extension).
                           This will produce "{output_file}_train_windows.h5" and
                           "{output_file}_test_windows.h5".
        id_cols (list of str): Column names identifying each time series, e.g. ["unit", "cycle"].
                               These columns (and "RUL", if present) are excluded from X.
        window_size (int): Size of each time window (# of observations).
        overlap (int): Number of overlapping rows between consecutive windows.
        chunk_size (int): Number of windows per chunk in the output datasets.
                          It is also the number of windows buffered in memory
                          before each write. It does NOT control how the input
                          files are read.

    Raises:
        ValueError: if `window_size` is not positive, if `overlap` is not
                    smaller than `window_size`, if an id column is missing,
                    or if a later file's columns differ from the first file's.
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")
    # Step size for time windows (controls overlap); must advance by at least one row.
    step_size = window_size - overlap
    if step_size <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than window_size ({window_size})"
        )

    train_file = output_file + "_train_windows.h5"
    test_file = output_file + "_test_windows.h5"

    def create_time_windows(X_sub, y_sub):
        # Cut one series into windows of window_size rows, step_size apart.
        # Returns (Xw, yw); both are empty arrays when the series is too short.
        starts = range(0, X_sub.shape[0] - window_size + 1, step_size)
        if len(starts) == 0:
            return np.array([]), np.array([])
        Xw = np.stack([X_sub[s:s + window_size] for s in starts], axis=0)
        # Label is taken from the last row of each window.
        yw = np.array([y_sub[s + window_size - 1] for s in starts], dtype=Xw.dtype)
        return Xw, yw

    def create_outputs(h5_file, feature_names):
        # Empty, window-extendable datasets plus the kept feature names.
        num_features = len(feature_names)
        X_dset = h5_file.create_dataset(
            "X", shape=(0, window_size, num_features),
            maxshape=(None, window_size, num_features),
            dtype='float32',
            compression="gzip",
            chunks=(chunk_size, window_size, num_features)
        )
        y_dset = h5_file.create_dataset(
            "y", shape=(0,),
            maxshape=(None,),
            dtype='float32',
            compression="gzip",
            chunks=(chunk_size,)
        )
        h5_file.attrs["columns"] = np.array(feature_names, dtype="S")
        return X_dset, y_dset

    def flush(dsets, buffers):
        # Append the buffered arrays to their datasets; return windows written.
        if not buffers[0]:
            return 0
        n_new = 0
        for dset, parts in zip(dsets, buffers):
            block = np.concatenate(parts, axis=0)
            start = dset.shape[0]
            dset.resize(start + block.shape[0], axis=0)
            dset[start:] = block
            parts.clear()
            n_new = block.shape[0]
        return n_new

    def append_split(X_raw, y_raw, dsets, id_idxs, feature_cols):
        # Window one split (train or test) of one file and append it to dsets.
        if X_raw.shape[0] == 0:
            return 0
        y_flat = np.atleast_1d(np.squeeze(y_raw))
        ids = pd.DataFrame(X_raw[:, id_idxs], columns=id_cols)
        X_feat = X_raw[:, feature_cols]

        buffers = [[], []]
        pending = 0
        written = 0
        # sort=False keeps series in order of first appearance.
        for _, group in ids.groupby(id_cols, sort=False):
            rows = group.index.to_numpy()
            Xw, yw = create_time_windows(X_feat[rows], y_flat[rows])
            if Xw.shape[0] == 0:
                continue
            buffers[0].append(Xw)
            buffers[1].append(yw)
            pending += Xw.shape[0]
            # Write in batches: per-series writes would rewrite the same
            # compressed chunk many times.
            if pending >= chunk_size:
                written += flush(dsets, buffers)
                pending = 0
        written += flush(dsets, buffers)
        return written

    train_count = 0  # How many train windows we've appended so far
    test_count = 0   # How many test windows we've appended so far

    # Context managers close both files even on errors or a KeyboardInterrupt.
    with h5py.File(train_file, "w") as h5_train, h5py.File(test_file, "w") as h5_test:
        for i, filename in enumerate(files):
            print(f"Processing {filename} ({i+1}/{len(files)})...")

            data_dict = extract_data(filename, get_cols=True)
            X_train_raw, X_test_raw, y_train_raw, y_test_raw = data_dict["data"]
            file_columns = list(data_dict["columns"])

            if i == 0:
                columns = file_columns
                print(columns)

                # We remove the ID columns + target from the "feature" set
                id_idxs = [columns.index(col) for col in id_cols]
                target_idx = columns.index("RUL") if "RUL" in columns else None
                feature_cols = [
                    c for c in range(len(columns))
                    if (c not in id_idxs) and (c != target_idx)
                ]
                kept_column_names = [columns[c] for c in feature_cols]

                train_dsets = create_outputs(h5_train, kept_column_names)
                test_dsets = create_outputs(h5_test, kept_column_names)
            elif file_columns != columns:
                # Feature positions come from the first file; a different
                # layout would silently mix up features.
                raise ValueError(f"Columns of {filename} differ from those of {files[0]}")

            train_count += append_split(X_train_raw, y_train_raw, train_dsets, id_idxs, feature_cols)
            test_count += append_split(X_test_raw, y_test_raw, test_dsets, id_idxs, feature_cols)

            print(f"  ...done. Train windows so far: {train_count}, Test windows so far: {test_count}")

    print(f"✅ Train windows file saved: {train_file}  (total windows: {train_count})")
    print(f"✅ Test windows file saved: {test_file}   (total windows: {test_count})")


In [10]:
merge_files_timewindows(
    files=files,
    output_file="engine",         # -> produces "engine_train_windows.h5" and "engine_test_windows.h5"
    id_cols=["unit", "cycle"],
    window_size=50,
    overlap=5,                           # windows will have step_size=45
    chunk_size=1000                      # windows per chunk in the output datasets
)

Processing ../../N-CMAPSS_DS01-005.h5 (1/9)...
['alt', 'Mach', 'TRA', 'T2', 'T24', 'T30', 'T48', 'T50', 'P15', 'P2', 'P21', 'P24', 'Ps30', 'P40', 'P50', 'Nf', 'Nc', 'Wf', 'T40', 'P30', 'P45', 'W21', 'W22', 'W25', 'W31', 'W32', 'W48', 'W50', 'SmFan', 'SmLPC', 'SmHPC', 'phi', 'fan_eff_mod', 'fan_flow_mod', 'LPC_eff_mod', 'LPC_flow_mod', 'HPC_eff_mod', 'HPC_flow_mod', 'HPT_eff_mod', 'HPT_flow_mod', 'LPT_eff_mod', 'LPT_flow_mod', 'unit', 'cycle', 'Fc', 'hs']


KeyboardInterrupt: 

In [8]:
def load_from_hdf5(filename="data.h5"):
    """Read the complete "X" and "y" datasets of an HDF5 file into memory.

    Parameters:
        filename (str): Path of the HDF5 file to open read-only.

    Returns:
        tuple: (X, y) as in-memory arrays.
    """
    with h5py.File(filename, "r") as h5_file:
        # Slicing with [:] copies the whole dataset out before the file closes.
        features = h5_file["X"][:]
        targets = h5_file["y"][:]
        return features, targets

In [9]:
# Load the windowed training set completely into memory.
X, y = load_from_hdf5(filename="engine_train_windows.h5")

In [10]:
# Dimensions of the loaded array: (num_windows, window_size, num_features).
X.shape

(980793, 50, 44)

In [4]:
import numpy as np
import pandas as pd
import h5py

def merge_files_timewindows(
    files,
    output_file,
    id_cols,
    window_size=50,
    overlap=10,
    chunk_size=10000,
    metadata_cols=None,
    keep_metadata=True,
):
    """
    Merges multiple HDF5 files into train/test windowed HDF5 datasets,
    storing X, y, and (optionally) additional metadata in separate datasets.

    The output files each contain:
        - "X":   shape (num_windows, window_size, num_features)
        - "y":   shape (num_windows,)
        - "meta": shape (num_windows, window_size, num_metadata_features)
                  (only if keep_metadata=True and metadata_cols is non-empty)

    Windows never span two time series: rows are grouped by `id_cols` first
    and windows are cut inside each group. The label of a window is the
    target value of its last row. Groups shorter than `window_size` produce
    no windows.

    Parameters:
        files (list of str): List of HDF5 file paths to merge.
        output_file (str): Basename for output HDF5 files (no ".h5" extension).
                           This will produce "{output_file}_train_windows.h5"
                           and "{output_file}_test_windows.h5".
        id_cols (list of str): Column names identifying each time series
                               (e.g. ["unit", "cycle"]). Excluded from X.
        window_size (int): Size of each time window (# of observations).
        overlap (int): Number of overlapping rows between consecutive windows.
        chunk_size (int): Number of windows per chunk in the output datasets.
                          It is also the number of windows buffered in memory
                          before each write.
        metadata_cols (list of str): Column names to store as metadata.
                                     These are stored in a separate dataset
                                     and NOT fed to the model (not in X).
                                     Names not present in the file are ignored.
        keep_metadata (bool): If False, no "meta" dataset is written. The
                              metadata columns are still excluded from X.

    Raises:
        ValueError: if `window_size` is not positive, if `overlap` is not
                    smaller than `window_size`, if an id column is missing,
                    or if a later file's columns differ from the first file's.

    Example usage:
        merge_files_timewindows(
            files=["engine1.h5", "engine2.h5"],
            output_file="merged_engine",
            id_cols=["unit", "cycle"],
            window_size=50,
            overlap=5,
            chunk_size=2000,
            metadata_cols=["unit", "cycle", "health_status"],
            keep_metadata=True
        )
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")
    # Step size for time windows (controls overlap); must advance by at least one row.
    step_size = window_size - overlap
    if step_size <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than window_size ({window_size})"
        )

    if metadata_cols is None:
        metadata_cols = []

    train_file = output_file + "_train_windows.h5"
    test_file = output_file + "_test_windows.h5"

    def create_time_windows(X_sub, y_sub=None):
        # Cut one series into windows of window_size rows, step_size apart.
        # Returns (Xw, yw); yw is None when no y_sub is given. Both are empty
        # arrays (or empty/None) when the series is too short.
        starts = range(0, X_sub.shape[0] - window_size + 1, step_size)
        if len(starts) == 0:
            return np.array([]), (np.array([]) if y_sub is not None else None)
        Xw = np.stack([X_sub[s:s + window_size] for s in starts], axis=0)
        if y_sub is None:
            return Xw, None
        # Label is taken from the last row of each window.
        yw = np.array([y_sub[s + window_size - 1] for s in starts], dtype=Xw.dtype)
        return Xw, yw

    def create_window_dataset(h5_file, name, width):
        # Empty 3-D dataset that can grow along the window axis.
        return h5_file.create_dataset(
            name,
            shape=(0, window_size, width),
            maxshape=(None, window_size, width),
            dtype='float32',
            compression="gzip",
            chunks=(chunk_size, window_size, width)
        )

    def create_outputs(h5_file, feature_names, meta_names):
        # Create X, y and (when meta_names is non-empty) meta; return them in that order.
        dsets = [
            create_window_dataset(h5_file, "X", len(feature_names)),
            h5_file.create_dataset(
                "y",
                shape=(0,),
                maxshape=(None,),
                dtype='float32',
                compression="gzip",
                chunks=(chunk_size,)
            ),
        ]
        if meta_names:
            dsets.append(create_window_dataset(h5_file, "meta", len(meta_names)))

        # Store feature/metadata columns as attributes for reference
        # (metadata_columns is stored empty when no meta dataset exists).
        h5_file.attrs["feature_columns"] = np.array(feature_names, dtype="S")
        h5_file.attrs["metadata_columns"] = np.array(meta_names, dtype="S")
        return dsets

    def flush(dsets, buffers):
        # Append the buffered arrays to their datasets; return windows written.
        if not buffers[0]:
            return 0
        n_new = 0
        for dset, parts in zip(dsets, buffers):
            block = np.concatenate(parts, axis=0)
            start = dset.shape[0]
            dset.resize(start + block.shape[0], axis=0)
            dset[start:] = block
            parts.clear()
            n_new = block.shape[0]
        return n_new

    def append_split(X_raw, y_raw, dsets, id_idxs, feature_cols, stored_meta_idxs):
        # Window one split (train or test) of one file and append it to dsets.
        if X_raw.shape[0] == 0:
            return 0
        y_flat = np.atleast_1d(np.squeeze(y_raw))
        ids = pd.DataFrame(X_raw[:, id_idxs], columns=id_cols)
        X_feat = X_raw[:, feature_cols]
        X_meta = X_raw[:, stored_meta_idxs] if stored_meta_idxs else None

        buffers = [[] for _ in dsets]
        pending = 0
        written = 0
        # sort=False keeps series in order of first appearance.
        for _, group in ids.groupby(id_cols, sort=False):
            rows = group.index.to_numpy()
            Xw, yw = create_time_windows(X_feat[rows], y_flat[rows])
            if Xw.shape[0] == 0:
                continue
            buffers[0].append(Xw)
            buffers[1].append(yw)
            if X_meta is not None:
                # Same rows and window parameters, so meta lines up with X.
                Mw, _unused = create_time_windows(X_meta[rows])
                buffers[2].append(Mw)
            pending += Xw.shape[0]
            # Write in batches: per-series writes would rewrite the same
            # compressed chunk many times.
            if pending >= chunk_size:
                written += flush(dsets, buffers)
                pending = 0
        written += flush(dsets, buffers)
        return written

    train_count = 0  # number of train windows appended so far
    test_count = 0   # number of test windows appended so far

    # Context managers close both files even on errors or a KeyboardInterrupt.
    with h5py.File(train_file, "w") as h5_train, h5py.File(test_file, "w") as h5_test:
        for i, filename in enumerate(files):
            print(f"Processing {filename} ({i+1}/{len(files)})...")

            data_dict = extract_data(filename, get_cols=True)
            X_train_raw, X_test_raw, y_train_raw, y_test_raw = data_dict["data"]
            file_columns = list(data_dict["columns"])

            if i == 0:
                columns = file_columns
                print("Columns:", columns)

                # Identify which columns go in X, which go in metadata
                id_idxs = [columns.index(col) for col in id_cols]
                target_idx = columns.index("RUL") if "RUL" in columns else None
                meta_idxs = [columns.index(col) for col in metadata_cols if col in columns]

                # The X feature set excludes id_cols, target_col, and metadata_cols
                feature_cols = [
                    c for c in range(len(columns))
                    if (c not in id_idxs) and (c != target_idx) and (c not in meta_idxs)
                ]
                kept_feature_colnames = [columns[c] for c in feature_cols]
                print(kept_feature_colnames)

                # Metadata is only written when requested and available.
                stored_meta_idxs = meta_idxs if keep_metadata else []
                kept_meta_colnames = [columns[c] for c in stored_meta_idxs]

                train_dsets = create_outputs(h5_train, kept_feature_colnames, kept_meta_colnames)
                test_dsets = create_outputs(h5_test, kept_feature_colnames, kept_meta_colnames)
            elif file_columns != columns:
                # Feature positions come from the first file; a different
                # layout would silently mix up features.
                raise ValueError(f"Columns of {filename} differ from those of {files[0]}")

            train_count += append_split(
                X_train_raw, y_train_raw, train_dsets, id_idxs, feature_cols, stored_meta_idxs
            )
            test_count += append_split(
                X_test_raw, y_test_raw, test_dsets, id_idxs, feature_cols, stored_meta_idxs
            )

            print(f"  ...done. Train windows so far: {train_count}, Test windows so far: {test_count}")

    print(f"✅ Train windows file saved: {train_file}  (total windows: {train_count})")
    print(f"✅ Test windows file saved: {test_file}   (total windows: {test_count})")


In [5]:
merge_files_timewindows(
    files=files,
    output_file="engine",         # -> produces "engine_train_windows.h5" and "engine_test_windows.h5"
    id_cols=["unit", "cycle"],
    metadata_cols=["unit", "hs"],
    window_size=50,
    overlap=5,                           # windows will have step_size=45
    chunk_size=1000,
    keep_metadata=False # no "meta" dataset is written; metadata_cols are still left out of X
)

Processing ../../data/N-CMAPSS_DS01-005.h5 (1/9)...
Columns: ['alt', 'Mach', 'TRA', 'T2', 'T24', 'T30', 'T48', 'T50', 'P15', 'P2', 'P21', 'P24', 'Ps30', 'P40', 'P50', 'Nf', 'Nc', 'Wf', 'T40', 'P30', 'P45', 'W21', 'W22', 'W25', 'W31', 'W32', 'W48', 'W50', 'SmFan', 'SmLPC', 'SmHPC', 'phi', 'fan_eff_mod', 'fan_flow_mod', 'LPC_eff_mod', 'LPC_flow_mod', 'HPC_eff_mod', 'HPC_flow_mod', 'HPT_eff_mod', 'HPT_flow_mod', 'LPT_eff_mod', 'LPT_flow_mod', 'unit', 'cycle', 'Fc', 'hs']
['alt', 'Mach', 'TRA', 'T2', 'T24', 'T30', 'T48', 'T50', 'P15', 'P2', 'P21', 'P24', 'Ps30', 'P40', 'P50', 'Nf', 'Nc', 'Wf', 'T40', 'P30', 'P45', 'W21', 'W22', 'W25', 'W31', 'W32', 'W48', 'W50', 'SmFan', 'SmLPC', 'SmHPC', 'phi', 'fan_eff_mod', 'fan_flow_mod', 'LPC_eff_mod', 'LPC_flow_mod', 'HPC_eff_mod', 'HPC_flow_mod', 'HPT_eff_mod', 'HPT_flow_mod', 'LPT_eff_mod', 'LPT_flow_mod', 'Fc']


KeyboardInterrupt: 

In [4]:
def random_sample_hdf5(
    input_file,
    output_file,
    sample_size=500000,
    seed=None,
    dataset_chunk=(1000, None, None),  # (rows, window_size, num_features)
    compression_opts=4
):
    """
    Write a random subsample of an HDF5 windows file to a new HDF5 file.

    The input must contain a 3-D dataset "X" (windows, window_size, features);
    "y" and "meta" are sampled with the same row indices when they exist.
    File-level and dataset-level attributes are copied over. Rows are drawn
    without replacement and written in ascending order of their original index.

    Parameters:
        input_file (str): Path of the HDF5 file to sample from.
        output_file (str): Path of the HDF5 file to create (overwritten).
        sample_size (int): Number of windows to keep; capped at the number of
            windows available in "X".
        seed (int | None): Seed for the row selection. When given, the same
            rows are selected as with ``np.random.seed(seed)`` followed by
            ``np.random.shuffle``, but the global NumPy RNG is left untouched.
            When None, the global NumPy RNG state is used.
        dataset_chunk (tuple): Requested HDF5 chunk shape for "X" as
            (rows, window_size, num_features). A None/0 entry falls back to
            10000 rows, the full window size, or the full feature count.
            Every entry is clamped to the dataset shape, because HDF5 rejects
            chunks larger than a fixed-size dataset.
        compression_opts (int): Accepted for backward compatibility but
            currently unused: all datasets are written with compression=None.

    Raises:
        ValueError: If there is nothing to sample (empty input or a
            non-positive ``sample_size``). Raised before the output file is
            created, so an existing output file is not truncated.
    """
    import numpy as np
    import h5py

    # A private RandomState produces the same stream as the seeded global RNG
    # without reseeding np.random for the rest of the notebook.
    rng = np.random.RandomState(seed) if seed is not None else np.random

    with h5py.File(input_file, "r") as f_in:
        X_in = f_in["X"]
        num_windows, window_size, num_features = X_in.shape

        # Adjust sample size to what is actually available
        requested_size = sample_size
        sample_size = min(sample_size, num_windows)
        if sample_size <= 0:
            raise ValueError(
                f"Nothing to sample: input has {num_windows} windows and "
                f"sample_size={requested_size}."
            )

        # Draw rows without replacement; keep them sorted because h5py
        # fancy indexing requires increasing indices.
        all_indices = np.arange(num_windows)
        rng.shuffle(all_indices)
        chosen_indices = np.sort(all_indices[:sample_size])

        # Prepare optional data
        y_in = f_in["y"] if "y" in f_in else None
        meta_in = f_in["meta"] if "meta" in f_in else None

        # --- Create output file ---
        with h5py.File(output_file, "w") as f_out:
            # Copy file-level attributes
            for attr_key, attr_val in f_in.attrs.items():
                f_out.attrs[attr_key] = attr_val

            # Resolve the chunk shape: fill in None entries, then clamp each
            # dimension so a small sample never gets a chunk larger than the
            # dataset itself (h5py raises ValueError in that case).
            row_chunk = min(dataset_chunk[0] or 10000, sample_size)
            win_chunk = min(dataset_chunk[1] or window_size, window_size)
            feat_chunk = min(dataset_chunk[2] or num_features, num_features)

            # Create X dataset (uncompressed)
            X_out = f_out.create_dataset(
                "X",
                shape=(sample_size, window_size, num_features),
                dtype=X_in.dtype,
                compression=None,
                chunks=(row_chunk, win_chunk, feat_chunk)
            )
            for ak, av in X_in.attrs.items():
                X_out.attrs[ak] = av

            # Create y dataset if present; trailing dimensions (if any) are
            # preserved so both (N,) and (N, k) targets are supported.
            if y_in is not None:
                y_out = f_out.create_dataset(
                    "y",
                    shape=(sample_size,) + y_in.shape[1:],
                    dtype=y_in.dtype,
                    compression=None,
                    chunks=(row_chunk,) + y_in.shape[1:]
                )
                for ak, av in y_in.attrs.items():
                    y_out.attrs[ak] = av
            else:
                y_out = None

            # Create meta dataset if present
            if meta_in is not None:
                meta_out = f_out.create_dataset(
                    "meta",
                    shape=(sample_size,) + meta_in.shape[1:],
                    dtype=meta_in.dtype,
                    compression=None,
                    chunks=(row_chunk,) + meta_in.shape[1:]
                )
                for ak, av in meta_in.attrs.items():
                    meta_out.attrs[ak] = av
            else:
                meta_out = None

            # --- Write in small batches to bound memory use ---
            read_chunk_size = 20000
            for start in range(0, sample_size, read_chunk_size):
                end = min(start + read_chunk_size, sample_size)
                idx_chunk = chosen_indices[start:end]

                X_out[start:end] = X_in[idx_chunk]

                if y_out is not None:
                    y_out[start:end] = y_in[idx_chunk]

                if meta_out is not None:
                    meta_out[start:end] = meta_in[idx_chunk]

        print(f"Sampled {sample_size} rows out of {num_windows} total.")
        print(f"✅ Random sample saved to {output_file}")

In [8]:
# Draw a reproducible 500k-window subsample of the training windows file.
random_sample_hdf5(
    input_file="../../engine_train_windows.h5",
    output_file="../../engine_train_windows_sample.h5",
    sample_size=500000,
    seed=42,  # fixed seed so the same windows are selected on every run
    compression_opts=1  # NOTE: has no effect; random_sample_hdf5 creates its datasets with compression=None
)

Sampled 500000 rows out of 980793 total.
✅ Random sample saved to ../../engine_train_windows_sample.h5


In [9]:
import numpy as np
import h5py

def min_max_scale_hdf5(
    input_file,
    output_file,
    chunk_size=1000
):
    """
    Reads an HDF5 file with datasets "X", "y", and optionally "meta",
    computes per-feature min/max for X, then writes out a new file
    where X is min-max scaled to [0,1]. Other datasets (y, meta) are copied.

    The feature-wise min/max arrays are stored in the output file attributes
    as "X_min" and "X_max". Features with zero range (max == min) are divided
    by a tiny epsilon instead of zero, so they scale to 0.

    Parameters:
        input_file  (str): Path to the existing HDF5 file.
        output_file (str): Path to the new HDF5 file to create (overwritten).
        chunk_size  (int): Number of samples (windows) to process at a time
                           when computing min and max, and when writing data.
                           Also used as the row extent of the output "X"
                           HDF5 chunks, capped at the number of windows.

    Raises:
        ValueError: If ``chunk_size`` is not positive or "X" contains no
                    windows. Both are checked before the output file is
                    created, so an existing output file is not truncated.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    with h5py.File(input_file, "r") as f_in:
        X_in = f_in["X"]
        num_windows, window_size, num_features = X_in.shape

        if num_windows == 0:
            raise ValueError(f'Dataset "X" in {input_file} contains no windows to scale.')

        # Check for optional datasets
        y_in = f_in["y"] if "y" in f_in else None
        meta_in = f_in["meta"] if "meta" in f_in else None

        # --- 1) Compute global min and max across the entire "X" ---
        # Initialize with opposite infinities
        X_min = np.full((num_features,), np.inf, dtype=np.float32)
        X_max = np.full((num_features,), -np.inf, dtype=np.float32)

        # Pass A: find min and max feature-wise
        for start_idx in range(0, num_windows, chunk_size):
            end_idx = min(start_idx + chunk_size, num_windows)
            chunk = X_in[start_idx:end_idx]  # shape: (<=chunk_size, window_size, num_features)
            # Flatten to (rows*window_size) x num_features so the reduction
            # runs over every time step of every window.
            chunk_2d = chunk.reshape(-1, num_features)

            X_min = np.minimum(X_min, chunk_2d.min(axis=0))
            X_max = np.maximum(X_max, chunk_2d.max(axis=0))

        # Protect against zero range in scaling (when max == min):
        # clamp the denominator to a small epsilon so there is no divide-by-zero.
        epsilon = 1e-12
        X_range = np.maximum(X_max - X_min, epsilon)

        # HDF5 rejects a chunk larger than a fixed-size dataset, so the row
        # extent of the output chunks must not exceed the number of windows.
        chunk_rows = min(chunk_size, num_windows)

        # --- 2) Create output file and datasets ---
        with h5py.File(output_file, "w") as f_out:
            # Copy the file-level attributes from the input file
            for attr_key, attr_val in f_in.attrs.items():
                f_out.attrs[attr_key] = attr_val

            # Also store the computed min/max as attributes (written after the
            # copy so they override any same-named input attributes)
            f_out.attrs["X_min"] = X_min
            f_out.attrs["X_max"] = X_max

            # Create scaled X dataset
            X_out = f_out.create_dataset(
                "X",
                shape=(num_windows, window_size, num_features),
                dtype=np.float32,
                compression="gzip",
                chunks=(chunk_rows, window_size, num_features)
            )

            # If "y" exists, copy it
            if y_in is not None:
                y_out = f_out.create_dataset(
                    "y",
                    data=y_in[:],
                    dtype=y_in.dtype,
                    compression="gzip",
                    chunks=True
                )
                # Replicate y_in.attrs if any
                for ak, av in y_in.attrs.items():
                    y_out.attrs[ak] = av

            # If "meta" exists, copy it
            if meta_in is not None:
                meta_out = f_out.create_dataset(
                    "meta",
                    data=meta_in[:],
                    dtype=meta_in.dtype,
                    compression="gzip",
                    chunks=True
                )
                # Replicate meta_in.attrs if any
                for ak, av in meta_in.attrs.items():
                    meta_out.attrs[ak] = av

            # Copy dataset-level attributes for "X" (like feature_columns)
            for ak, av in X_in.attrs.items():
                X_out.attrs[ak] = av

            # --- 3) Pass B: read + scale + write ---
            for start_idx in range(0, num_windows, chunk_size):
                end_idx = min(start_idx + chunk_size, num_windows)
                chunk = X_in[start_idx:end_idx]

                # Scale feature-wise via broadcasting over the last axis:
                # scaled = (value - X_min) / max(X_max - X_min, epsilon)
                X_out[start_idx:end_idx] = (chunk - X_min) / X_range

        print(f"✅ Finished min-max scaling. Output saved to {output_file}")


In [10]:
# Min-max scale the sampled training windows and store the result in a new file.
min_max_scale_hdf5(
    input_file="../../engine_train_windows_sample.h5",
    output_file="../../engine_windows_sample_scaled.h5",
    chunk_size=20000  # windows per read/write batch; also passed as the row count of the output "X" HDF5 chunks
)

✅ Finished min-max scaling. Output saved to ../../engine_windows_sample_scaled.h5
