In [3]:
import os
import h5py
import time
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
from matplotlib import gridspec
%matplotlib inline

In [8]:
files = ["../../N-CMAPSS_DS01-005.h5", "../N-CMAPSS_DS02-006.h5", "../N-CMAPSS_DS03-012.h5", "../N-CMAPSS_DS04.h5", "../N-CMAPSS_DS05.h5", "../N-CMAPSS_DS06.h5", "../N-CMAPSS_DS07.h5", "../N-CMAPSS_DS08a-009.h5", "../N-CMAPSS_DS08c-008.h5"]

In [5]:
def extract_columns(filename):
    with h5py.File(filename, 'r') as infile:
        # get columns from file
        W_var = np.array(infile.get('W_var'))
        X_s_var = np.array(infile.get('X_s_var'))  
        X_v_var = np.array(infile.get('X_v_var')) 
        T_var = np.array(infile.get('T_var'))
        A_var = np.array(infile.get('A_var'))

    # from np.array to list dtype U4/U5
    W_var = list(np.array(W_var, dtype='U20'))
    X_s_var = list(np.array(X_s_var, dtype='U20'))  
    X_v_var = list(np.array(X_v_var, dtype='U20')) 
    T_var = list(np.array(T_var, dtype='U20'))
    A_var = list(np.array(A_var, dtype='U20'))

    return W_var + X_s_var + X_v_var + T_var + A_var 

def extract_data(filename, get_cols=False):
    with h5py.File(filename, 'r') as infile:
        # train data
        W_dev = np.array(infile.get('W_dev'))          
        X_s_dev = np.array(infile.get('X_s_dev'))       
        X_v_dev = np.array(infile.get('X_v_dev'))      
        T_dev = np.array(infile.get('T_dev'))       
        y_train = np.array(infile.get('Y_dev'))           
        A_dev = np.array(infile.get('A_dev')) 
        x_train = np.concatenate((W_dev, X_s_dev, X_v_dev, T_dev, A_dev), axis=1)

        # test data
        W_test = np.array(infile.get('W_test'))        
        X_s_test = np.array(infile.get('X_s_test'))     
        X_v_test = np.array(infile.get('X_v_test'))      
        T_test = np.array(infile.get('T_test'))         
        y_test = np.array(infile.get('Y_test'))         
        A_test = np.array(infile.get('A_test')) 
        x_test = np.concatenate((W_test, X_s_test, X_v_test, T_test, A_test), axis=1)

    # concat data together for EDA
    #X = np.concatenate((x_train, x_test), axis=0)
    #y = np.concatenate((y_train, y_test), axis=0)
    
    # possibly extract columns
    cols = None
    if get_cols:
        cols = extract_columns(filename)

    return {"data": (x_train, x_test, y_train, y_test), "columns": cols}

def read_data(files):
    data = extract_data(files[0], get_cols=True)
    columns = data["columns"]
    X, y = data["data"]
    for filename in files[1:]:
        X_temp, y_temp = extract_data(filename)["data"]
        X, y = np.concatenate((X, X_temp), axis=0), np.concatenate((y, y_temp), axis=0)
    return X, y, columns

In [4]:
def merge_files(files, output_file, id_cols, chunk_size=10000):
    train_file = output_file + "_train.h5"
    test_file = output_file + "_test.h5"
    h5_train = h5py.File(train_file, "w")
    h5_test = h5py.File(test_file, "w")

    train_size, test_size = 0, 0

    for i, filename in enumerate(files):
        print(f"Processing {filename} ({i+1}/{len(files)})...")

        data = extract_data(filename, get_cols=i==0)  # Read one file at a time
        X_train, X_test, y_train, y_test = data["data"]
        y_train, y_test = y_train.squeeze(), y_test.squeeze()

        if i == 0:
            columns = data["columns"]
            id_idxs = [columns.index(col) for col in id_cols]
            # make starting datasets
            h5_train.create_dataset("X", shape=(0, X_train.shape[1]), maxshape=(None, X_train.shape[1]), dtype='float32', compression="gzip", chunks=(chunk_size, X_train.shape[1]))
            h5_train.create_dataset("y", shape=(0,), maxshape=(None,), dtype='float32', compression="gzip", chunks=(chunk_size,))

            h5_test.create_dataset("X", shape=(0, X_test.shape[1]), maxshape=(None, X_test.shape[1]), dtype='float32', compression="gzip", chunks=(chunk_size, X_test.shape[1]))
            h5_test.create_dataset("y", shape=(0,), maxshape=(None,), dtype='float32', compression="gzip", chunks=(chunk_size,))

            h5_train.attrs["columns"] = np.array(columns, dtype="S")
            h5_test.attrs["columns"] = np.array(columns, dtype="S")



        # append data incrementally to avoid memory issues
        # fix shapes
        h5_train["X"].resize((train_size + X_train.shape[0]), axis=0)
        h5_train["X"][train_size:] = X_train
        h5_train["y"].resize((train_size + y_train.shape[0]), axis=0)
        h5_train["y"][train_size:] = y_train

        h5_test["X"].resize((test_size + X_test.shape[0]), axis=0)
        h5_test["X"][test_size:] = X_test
        h5_test["y"].resize((test_size + y_test.shape[0]), axis=0)
        h5_test["y"][test_size:] = y_test

        train_size += X_train.shape[0]
        test_size += X_test.shape[0]

    # cleanup
    h5_train.close()
    h5_test.close()


In [5]:
merge_files(files, "engine_data")

Processing N-CMAPSS_DS01-005.h5 (1/9)...
Processing N-CMAPSS_DS02-006.h5 (2/9)...
Processing N-CMAPSS_DS03-012.h5 (3/9)...
Processing N-CMAPSS_DS04.h5 (4/9)...
Processing N-CMAPSS_DS05.h5 (5/9)...
Processing N-CMAPSS_DS06.h5 (6/9)...
Processing N-CMAPSS_DS07.h5 (7/9)...
Processing N-CMAPSS_DS08a-009.h5 (8/9)...
Processing N-CMAPSS_DS08c-008.h5 (9/9)...


In [6]:
def merge_files(files, output_file, id_cols, chunk_size=10000):
    """
    Merges multiple HDF5 files into a single train/test HDF5 dataset and creates an index.csv file.

    Parameters:
        files (list): List of HDF5 file paths to merge.
        output_file (str): Output HDF5 file path (without extension).
        id_cols (list): List of column names identifying a time series (e.g., ["unit", "cycle"]).
        chunk_size (int): Number of rows to process in each batch (to prevent memory issues).
    """
    train_file = output_file + "_train.h5"
    test_file = output_file + "_test.h5"
    h5_train = h5py.File(train_file, "w")
    h5_test = h5py.File(test_file, "w")

    train_size, test_size = 0, 0
    index_data = []  # Store (unit, cycle, start, stop, dataset_type)

    for i, filename in enumerate(files):
        print(f"Processing {filename} ({i+1}/{len(files)})...")

        data = extract_data(filename, get_cols=(i == 0))  # Read one file at a time
        X_train, X_test, y_train, y_test = data["data"]
        y_train, y_test = y_train.squeeze(), y_test.squeeze()

        if i == 0:
            columns = data["columns"]

            # Create datasets
            h5_train.create_dataset("X", shape=(0, X_train.shape[1]), maxshape=(None, X_train.shape[1]), 
                                    dtype='float32', compression="gzip", chunks=(chunk_size, X_train.shape[1]))
            h5_train.create_dataset("y", shape=(0,), maxshape=(None,), dtype='float32', compression="gzip", 
                                    chunks=(chunk_size,))

            h5_test.create_dataset("X", shape=(0, X_test.shape[1]), maxshape=(None, X_test.shape[1]), 
                                   dtype='float32', compression="gzip", chunks=(chunk_size, X_test.shape[1]))
            h5_test.create_dataset("y", shape=(0,), maxshape=(None,), dtype='float32', compression="gzip", 
                                   chunks=(chunk_size,))

            h5_train.attrs["columns"] = np.array(columns, dtype="S")
            h5_test.attrs["columns"] = np.array(columns, dtype="S")

        # ** Extract indices of ID columns **
        id_idxs = [columns.index(col) for col in id_cols]

        # ** Extract unique (unit, cycle) time series for train and test sets **
        for dataset_type, X_data, base_index in [("train", X_train, train_size), ("test", X_test, test_size)]:
            if X_data.shape[0] == 0:
                continue  # Skip if there's no data

            # Load only unit and cycle columns
            id_values = X_data[:, id_idxs]
            id_df = pd.DataFrame(id_values, columns=id_cols)

            # Get start and stop indices for each unique (unit, cycle)
            grouped = id_df.groupby(id_cols).apply(lambda df: (df.index.min(), df.index.max()))
            for (unit, cycle), (start, stop) in grouped.items():
                index_data.append([unit, cycle, start + base_index, stop + base_index, dataset_type])

        # ** Append data incrementally to avoid memory issues **
        h5_train["X"].resize((train_size + X_train.shape[0]), axis=0)
        h5_train["X"][train_size:] = X_train
        h5_train["y"].resize((train_size + y_train.shape[0]), axis=0)
        h5_train["y"][train_size:] = y_train

        h5_test["X"].resize((test_size + X_test.shape[0]), axis=0)
        h5_test["X"][test_size:] = X_test
        h5_test["y"].resize((test_size + y_test.shape[0]), axis=0)
        h5_test["y"][test_size:] = y_test

        train_size += X_train.shape[0]
        test_size += X_test.shape[0]

    # ** Save the index file as CSV **
    index_df = pd.DataFrame(index_data, columns=["unit", "cycle", "start_idx", "stop_idx", "dataset"])
    index_df.to_csv(output_file + "_index.csv", index=False)
    print(f"✅ Index file saved: {output_file}_index.csv")

    # Cleanup
    h5_train.close()
    h5_test.close()

In [7]:
merge_files(files, "engine", id_cols=["unit", "cycle"])

Processing N-CMAPSS_DS01-005.h5 (1/9)...
Processing N-CMAPSS_DS02-006.h5 (2/9)...
Processing N-CMAPSS_DS03-012.h5 (3/9)...
Processing N-CMAPSS_DS04.h5 (4/9)...
Processing N-CMAPSS_DS05.h5 (5/9)...
Processing N-CMAPSS_DS06.h5 (6/9)...
Processing N-CMAPSS_DS07.h5 (7/9)...
Processing N-CMAPSS_DS08a-009.h5 (8/9)...
Processing N-CMAPSS_DS08c-008.h5 (9/9)...
✅ Index file saved: engine_index.csv


In [6]:
import numpy as np
import pandas as pd
import h5py

def merge_files_timewindows(
    files,
    output_file,
    id_cols,
    window_size=50,
    overlap=10,
    chunk_size=10000
):
    """
    Merges multiple HDF5 files into train/test windowed HDF5 datasets.
    The output files contain shape (num_windows, window_size, num_features).

    Parameters:
        files (list of str): List of HDF5 file paths to merge.
        output_file (str): Basename for output HDF5 files (no ".h5" extension).
                           This will produce "{output_file}_train_windows.h5" and
                           "{output_file}_test_windows.h5".
        id_cols (list of str): Column names identifying each time series, e.g. ["unit", "cycle"].
        window_size (int): Size of each time window (# of observations).
        overlap (int): Number of overlapping rows between consecutive windows.
        chunk_size (int): Used when creating chunked HDF5 datasets. Does NOT
                          control reading chunk size from the input files,
                          but sets chunk shape in the output HDF5.
    """
    # ----------------------------------------------------------------------
    # 1. Prepare output HDF5 files for train and test windows
    # ----------------------------------------------------------------------
    train_file = output_file + "_train_windows.h5"
    test_file = output_file + "_test_windows.h5"
    h5_train = h5py.File(train_file, "w")
    h5_test = h5py.File(test_file, "w")

    # We'll create empty placeholders; we don't know the final # of windows yet.
    X_train_dset = None
    y_train_dset = None
    X_test_dset = None
    y_test_dset = None

    train_count = 0  # How many train windows we've appended so far
    test_count = 0   # How many test windows we've appended so far

    # Step size for time windows (controls overlap)
    step_size = window_size - overlap

    # ----------------------------------------------------------------------
    # 2. Helper function for time-window generation on a single ID group
    # ----------------------------------------------------------------------
    def create_time_windows(X_sub, y_sub):
        """
        Given the X and y arrays for a single ID group (time-series),
        generate all windows with shape (num_windows, window_size, num_features).
        For the label, we take y at the *end* of each window (common RUL approach).
        """
        windows_list = []
        labels_list = []

        n_samples = X_sub.shape[0]
        # Go up to n_samples - window_size + 1
        for start_idx in range(0, n_samples - window_size + 1, step_size):
            end_idx = start_idx + window_size
            windows_list.append(X_sub[start_idx:end_idx])
            # Label often taken from the last row in the window
            labels_list.append(y_sub[end_idx - 1])

        if len(windows_list) == 0:
            return np.array([]), np.array([])

        # Stack them into final arrays
        Xw = np.stack(windows_list, axis=0)  # shape: (num_windows, window_size, num_features)
        yw = np.array(labels_list, dtype=Xw.dtype)  # shape: (num_windows,)
        return Xw, yw

    # ----------------------------------------------------------------------
    # 3. Process each file in 'files'
    #    We assume you have a function `extract_data(h5_path, get_cols=False)`
    #    that returns { "data": (X_train, X_test, y_train, y_test), "columns": [...] }
    # ----------------------------------------------------------------------
    for i, filename in enumerate(files):
        print(f"Processing {filename} ({i+1}/{len(files)})...")

        # ----------------------------------------------------------------------
        # 3a. Extract the data from the source file
        # ----------------------------------------------------------------------
        # You need to implement or already have `extract_data`; it typically does:
        #   X_train, X_test, y_train, y_test = ...
        #   columns = [...]
        #   return { "data": (X_train, X_test, y_train, y_test), "columns": columns }
        data_dict = extract_data(filename, get_cols=True)
        X_train_raw, X_test_raw, y_train_raw, y_test_raw = data_dict["data"]
        y_train_raw = y_train_raw.squeeze()
        y_test_raw = y_test_raw.squeeze()
        columns = data_dict["columns"]
        print(columns)

        # If this is the first file, create the output datasets
        if i == 0:
            # We remove the ID columns + target from the "feature" set
            id_idxs = [columns.index(col) for col in id_cols]
            target_idx = columns.index("RUL") if "RUL" in columns else None

            feature_cols = [
                c for c in range(len(columns))
                if (c not in id_idxs) and (c != target_idx)
            ]
            num_features = len(feature_cols)

            # Create placeholders for train/test window sets
            X_train_dset = h5_train.create_dataset(
                "X", shape=(0, window_size, num_features),
                maxshape=(None, window_size, num_features),
                dtype='float32',
                compression="gzip",
                chunks=(chunk_size, window_size, num_features)
            )
            y_train_dset = h5_train.create_dataset(
                "y", shape=(0,),
                maxshape=(None,),
                dtype='float32',
                compression="gzip",
                chunks=(chunk_size,)
            )

            X_test_dset = h5_test.create_dataset(
                "X", shape=(0, window_size, num_features),
                maxshape=(None, window_size, num_features),
                dtype='float32',
                compression="gzip",
                chunks=(chunk_size, window_size, num_features)
            )
            y_test_dset = h5_test.create_dataset(
                "y", shape=(0,),
                maxshape=(None,),
                dtype='float32',
                compression="gzip",
                chunks=(chunk_size,)
            )

            # Store attribute with feature column names if you want
            # (excluding ID + target columns)
            kept_column_names = [columns[c] for c in feature_cols]
            h5_train.attrs["columns"] = np.array(kept_column_names, dtype="S")
            h5_test.attrs["columns"] = np.array(kept_column_names, dtype="S")

        # ----------------------------------------------------------------------
        # 3b. Group the train set by ID columns, generate windows, append them
        # ----------------------------------------------------------------------
        if X_train_raw.shape[0] > 0:
            # Convert to DataFrame for grouping
            df_train = pd.DataFrame(X_train_raw, columns=columns)
            df_train["RUL"] = y_train_raw

            # For each unique ID, build windows
            group_df_train = df_train.groupby(id_cols, sort=False)
            for _, sub_df in group_df_train:
                # Separate features vs. label
                # - We'll keep only the relevant feature columns
                # - We'll keep label from the 'RUL' column
                sub_X = sub_df.iloc[:, feature_cols].values  # shape (n, num_features)
                sub_y = sub_df["RUL"].values

                # Generate windows
                Xw, yw = create_time_windows(sub_X, sub_y)
                if Xw.shape[0] == 0:
                    continue

                # Append to HDF5
                n_new = Xw.shape[0]
                X_train_dset.resize(X_train_dset.shape[0] + n_new, axis=0)
                y_train_dset.resize(y_train_dset.shape[0] + n_new, axis=0)

                X_train_dset[-n_new:] = Xw
                y_train_dset[-n_new:] = yw
                train_count += n_new

        # ----------------------------------------------------------------------
        # 3c. Same logic for test set
        # ----------------------------------------------------------------------
        if X_test_raw.shape[0] > 0:
            # Convert to DataFrame for grouping
            df_test = pd.DataFrame(X_test_raw, columns=columns)
            df_test["RUL"] = y_test_raw

            group_df_test = df_test.groupby(id_cols, sort=False)
            for _, sub_df in group_df_test:
                sub_X = sub_df.iloc[:, feature_cols].values
                sub_y = sub_df["RUL"].values

                Xw, yw = create_time_windows(sub_X, sub_y)
                if Xw.shape[0] == 0:
                    continue

                n_new = Xw.shape[0]
                X_test_dset.resize(X_test_dset.shape[0] + n_new, axis=0)
                y_test_dset.resize(y_test_dset.shape[0] + n_new, axis=0)

                X_test_dset[-n_new:] = Xw
                y_test_dset[-n_new:] = yw
                test_count += n_new

        print(f"  ...done. Train windows so far: {train_count}, Test windows so far: {test_count}")

    # ----------------------------------------------------------------------
    # 4. Cleanup and Close
    # ----------------------------------------------------------------------
    h5_train.close()
    h5_test.close()
    print(f"✅ Train windows file saved: {train_file}  (total windows: {train_count})")
    print(f"✅ Test windows file saved: {test_file}   (total windows: {test_count})")


In [9]:
merge_files_timewindows(
    files=files,
    output_file="engine",         # -> produces "merged_engine_train_windows.h5" and "merged_engine_test_windows.h5"
    id_cols=["unit", "cycle"],
    window_size=50,
    overlap=5,                           # windows will have step_size=45
    chunk_size=1000                      # chunk shape for output dataset
)

OSError: Unable to create file (unable to truncate a file which is already open)

In [8]:
def load_from_hdf5(filename="data.h5"):
    with h5py.File(filename, "r") as f:
        X = f["X"][:]
        y = f["y"][:]
    return X, y

In [9]:
X, y = load_from_hdf5(filename="engine_train_windows.h5")

In [10]:
X.shape

(980793, 50, 44)