In [23]:
import os
import random
import time

import h5py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [24]:
# Set-up - Define file location
# Only the path is defined here. The file is opened inside a `with` block where
# the data is read, so no HDF5 handle is left open (and locked) for the rest of
# the session.
filename = "E:\\Turbofan Engine Degradation Simulation Data\\N-CMAPSS_DS03-012.h5"

In [None]:
# Time tracking: wall-clock timer, so time spent waiting on disk I/O is counted
# (time.process_time() only counts CPU time and under-reports a read-heavy cell)
t = time.perf_counter()

# Load data
# Datasets are indexed with hdf[name] so that a missing dataset raises KeyError
# here instead of hdf.get() returning None and producing an empty object array.
with h5py.File(filename, 'r') as hdf:
    # Development set
    W_dev = np.array(hdf['W_dev'])             # W
    X_s_dev = np.array(hdf['X_s_dev'])         # X_s
    X_v_dev = np.array(hdf['X_v_dev'])         # X_v
    T_dev = np.array(hdf['T_dev'])             # T
    Y_dev = np.array(hdf['Y_dev'])             # RUL
    A_dev = np.array(hdf['A_dev'])             # Auxiliary

    # Test set
    W_test = np.array(hdf['W_test'])           # W
    X_s_test = np.array(hdf['X_s_test'])       # X_s
    X_v_test = np.array(hdf['X_v_test'])       # X_v
    T_test = np.array(hdf['T_test'])           # T
    Y_test = np.array(hdf['Y_test'])           # RUL
    A_test = np.array(hdf['A_test'])           # Auxiliary

    # Variable names
    W_var = np.array(hdf['W_var'])
    X_s_var = np.array(hdf['X_s_var'])
    X_v_var = np.array(hdf['X_v_var'])
    T_var = np.array(hdf['T_var'])
    A_var = np.array(hdf['A_var'])

    # From np.array to a list of unicode strings (dtype U20)
    W_var = list(np.array(W_var, dtype='U20'))
    X_s_var = list(np.array(X_s_var, dtype='U20'))
    X_v_var = list(np.array(X_v_var, dtype='U20'))
    T_var = list(np.array(T_var, dtype='U20'))
    A_var = list(np.array(A_var, dtype='U20'))

# Stack the development and test splits row-wise into one array per group
W = np.concatenate((W_dev, W_test), axis=0)
X_s = np.concatenate((X_s_dev, X_s_test), axis=0)
X_v = np.concatenate((X_v_dev, X_v_test), axis=0)
T = np.concatenate((T_dev, T_test), axis=0)
Y = np.concatenate((Y_dev, Y_test), axis=0)
A = np.concatenate((A_dev, A_test), axis=0)

print('')
print("Operation time (min): " , (time.perf_counter()-t)/60)
print('')
print ("W shape: " + str(W.shape))
print ("X_s shape: " + str(X_s.shape))
print ("X_v shape: " + str(X_v.shape))
print ("T shape: " + str(T.shape))
print ("A shape: " + str(A.shape))

In [4]:
# Wrap each stacked array in a DataFrame whose columns are labelled with the
# matching variable-name list; the Y array gets a single "RUL" column
Y_df = pd.DataFrame(Y, columns=["RUL"])
A_df = pd.DataFrame(A, columns=A_var)
X_s_df = pd.DataFrame(X_s, columns=X_s_var)
W_df = pd.DataFrame(W, columns=W_var)

# Join the frames side by side (W, X_s, A, then RUL) into the project DataFrame
df = pd.concat([W_df, X_s_df, A_df, Y_df], axis="columns")


In [5]:
def load_ncmapss_frame(path):
    """
    Reads one N-CMAPSS HDF5 file into a single DataFrame.

    The development and test splits of W, X_s, A and Y are stacked row-wise,
    then joined column-wise in the order W, X_s, A, RUL.

    Parameters:
    - path (str): Location of the HDF5 file.

    Returns:
    - pd.DataFrame: W, X_s and A columns (named from the file's `*_var`
      datasets) followed by an "RUL" column.
    """
    frames = []
    with h5py.File(path, 'r') as hdf:
        for group in ('W', 'X_s', 'A'):
            values = np.concatenate(
                (np.array(hdf[group + '_dev']), np.array(hdf[group + '_test'])), axis=0
            )
            names = list(np.array(np.array(hdf[group + '_var']), dtype='U20'))
            frames.append(pd.DataFrame(values, columns=names))
        rul = np.concatenate((np.array(hdf['Y_dev']), np.array(hdf['Y_test'])), axis=0)
        frames.append(pd.DataFrame(rul, columns=["RUL"]))
    return pd.concat(frames, axis=1)


# `df` holds the DS03-012 data, so selecting these unit ids from it would pull
# DS03 units 11, 14 and 15 (14 and 15 are held out as test units below) instead
# of the intended DS02-006 units. Load DS02-006 explicitly so a fresh
# Restart & Run All gives the intended training set.
# NOTE(review): assumes the DS02-006 file sits next to the DS03-012 one under
# the same naming scheme - TODO confirm path.
ds02_filename = filename.replace("DS03-012", "DS02-006")
ds02_df = load_ncmapss_frame(ds02_filename)
train_df2 = ds02_df[ds02_df['unit'].isin([16, 18, 20, 11, 14, 15])] #from DS02-006 dataset
del ds02_df

In [6]:
# Split the rows of df by engine unit: units 1-12 form the training frame,
# units 13, 14 and 15 are each kept as their own test frame
train_df1 = df.loc[df['unit'].isin(list(range(1, 13)))] #from DS03-012 dataset
test_unit_13 = df.loc[df['unit'].eq(13)]
test_unit_14 = df.loc[df['unit'].eq(14)]
test_unit_15 = df.loc[df['unit'].eq(15)]

In [10]:
# Stack the two training frames row-wise (train_df1 rows first, then train_df2)
train_df = pd.concat((train_df1, train_df2), axis="index")

In [11]:
# Free up memory
del hdf, df, W_df, X_s_df, A_df, Y_df, W_var, X_s_var, A_var, W, X_s, A, Y, train_df1, train_df2

# The per-split arrays are full copies of the data already stacked above, and
# X_v / T (with their variable names) are never used after loading; release
# them too, otherwise most of the loaded data stays resident.
del W_dev, X_s_dev, X_v_dev, T_dev, Y_dev, A_dev
del W_test, X_s_test, X_v_test, T_test, Y_test, A_test
del X_v, T, X_v_var, T_var

In [9]:
# Drop the flight class ('Fc') and 'cycle' columns from the training frame and
# from each of the three test-unit frames
train_df = train_df.drop(['Fc', 'cycle'], axis=1)
test_unit_13 = test_unit_13.drop(['Fc', 'cycle'], axis=1)
test_unit_14 = test_unit_14.drop(['Fc', 'cycle'], axis=1)
test_unit_15 = test_unit_15.drop(['Fc', 'cycle'], axis=1)


In [None]:
# Check shape: one (rows, columns) tuple per line, training frame first
print(
    train_df.shape,
    test_unit_13.shape,
    test_unit_14.shape,
    test_unit_15.shape,
    sep="\n",
)

In [14]:
def create_windows(df, window_size=30, step_size=10):
    """
    Slices the DataFrame into fixed-length sliding windows.

    Consecutive windows start `step_size` rows apart, so they overlap whenever
    step_size < window_size (as with the defaults: each window shares 20 rows
    with the next one). Trailing rows that do not fill a complete window are
    dropped, and a DataFrame shorter than `window_size` yields no windows.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - window_size (int): The number of rows per window.
    - step_size (int): The row offset between the starts of consecutive windows.

    Returns:
    - List[pd.DataFrame]: A list of DataFrame windows, each with a fresh 0..window_size-1 index.

    Raises:
    - ValueError: If window_size or step_size is smaller than 1.
    """
    # Reject sizes that would silently produce empty or zero-row windows
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")
    if step_size < 1:
        raise ValueError("step_size must be a positive integer")

    last_start = len(df) - window_size
    return [
        df.iloc[start:start + window_size].reset_index(drop=True)
        for start in range(0, last_start + 1, step_size)
    ]

# Window each frame with the default window and step sizes
train_windows, test_unit_13_windows, test_unit_14_windows, test_unit_15_windows = map(
    create_windows,
    (train_df, test_unit_13, test_unit_14, test_unit_15),
)

In [15]:
# Free up memory: drop the frame names now that the windows have been built
del train_df
del test_unit_13, test_unit_14, test_unit_15

In [None]:
def remove_non_homogeneous_windows(windows):
    """
    Keeps only the windows whose rows all belong to a single unit.

    A window is kept when its 'unit' column holds exactly one distinct value;
    the order of the kept windows is preserved.

    Parameters:
    - windows (List[pd.DataFrame]): A list of DataFrame windows.

    Returns:
    - List[pd.DataFrame]: The windows with a single unit.
    """
    return [frame for frame in windows if frame['unit'].nunique() == 1]

# Report the number of training windows before and after the unit filter
windows_before = len(train_windows)
print(windows_before)

train_windows = remove_non_homogeneous_windows(train_windows)

windows_after = len(train_windows)
print("After removing non-homogeneous windows: ", windows_after)

In [17]:
def remove_unit_column(windows):
    """
    Returns copies of the windows with the 'unit' column dropped.

    The input windows themselves are left unchanged.

    Parameters:
    - windows (List[pd.DataFrame]): A list of DataFrame windows.

    Returns:
    - List[pd.DataFrame]: A list of DataFrame windows without the 'unit' column.
    """
    stripped = []
    for frame in windows:
        stripped.append(frame.drop(columns=['unit']))
    return stripped

# Strip the 'unit' column from every window list. Each list is rebound right
# after its own call so the previous list can be released immediately.
test_unit_13_windows = remove_unit_column(test_unit_13_windows)
test_unit_14_windows = remove_unit_column(test_unit_14_windows)
test_unit_15_windows = remove_unit_column(test_unit_15_windows)
train_windows = remove_unit_column(train_windows)


In [18]:
def randomize_windows(windows, seed=None):
    """
    Returns the windows in random order without modifying the input.

    Parameters:
    - windows (Iterable[pd.DataFrame]): The DataFrame windows to shuffle.
    - seed (int, optional): When given, the shuffle uses its own generator
      seeded with this value, so the same seed always yields the same order.
      When None (default), the module-level `random` generator is used, so any
      prior `random.seed(...)` call is still honoured.

    Returns:
    - List[pd.DataFrame]: A new list holding the same windows in shuffled order.
    """
    # Copy first so the caller's sequence keeps its original order
    randomized_windows = list(windows)
    if seed is None:
        random.shuffle(randomized_windows)
    else:
        random.Random(seed).shuffle(randomized_windows)
    return randomized_windows

# Randomize training windows. Seed the global RNG first: the train/validation
# split below uses a fixed random_state, but which windows land in each part
# still depends on the order produced here, so an unseeded shuffle would make
# the split differ between runs.
random.seed(42)
train_windows = randomize_windows(train_windows)

In [11]:
# Split training windows into training (90%) and validation (10%) parts
# NOTE(review): with the default 30-row windows cut every 10 rows, neighbouring
# windows share rows, so a random split can place overlapping windows on both
# sides - confirm this is acceptable for validation.
train_windows, validation_windows = train_test_split(
    train_windows,
    test_size=0.1,
    random_state=42,
)


In [None]:
def separate_x_y(windows, label_columns=['RUL', 'hs']):
    """
    Splits each window into a feature block and a label vector.

    The features are all columns except the label columns; the labels are the
    label-column values taken from the first row of each window.

    Parameters:
    - windows (List[pd.DataFrame]): A list of DataFrame windows.
    - label_columns (List[str]): The names of the label columns.

    Returns:
    - X (np.ndarray): Stacked feature values, one entry per window.
    - y (np.ndarray): Labels with shape (num_windows, num_labels).
    """
    feature_blocks = []
    label_rows = []
    for frame in windows:
        feature_blocks.append(frame.drop(columns=label_columns).values)
        # The label is read from the first row of the window only
        label_rows.append([frame[name].iloc[0] for name in label_columns])
    return np.array(feature_blocks), np.array(label_rows)

# Split every window list into features (x) and labels (y), using the default
# label columns
(
    (train_x, train_y),
    (validation_x, validation_y),
    (test_unit_13_x, test_unit_13_y),
    (test_unit_14_x, test_unit_14_y),
    (test_unit_15_x, test_unit_15_y),
) = map(
    separate_x_y,
    (
        train_windows,
        validation_windows,
        test_unit_13_windows,
        test_unit_14_windows,
        test_unit_15_windows,
    ),
)


In [21]:
# Free up memory: the window lists are no longer needed once x / y are built
del train_windows, validation_windows
del test_unit_13_windows, test_unit_14_windows, test_unit_15_windows

In [None]:
# Check data shape and y labels: array shapes first, then the label rows of the
# first two training windows
train_x_shape = train_x.shape
train_y_shape = train_y.shape
print("Train x shape: ", train_x_shape)
print("Train y shape: ", train_y_shape)

first_label_row, second_label_row = train_y[0], train_y[1]
print("Train y[0]: ", first_label_row)
print("Train y[1]: ", second_label_row)


In [23]:
def save_xy(path, x, y):
    """
    Writes a feature array and a label array to a new HDF5 file.

    The file is created (or overwritten) at `path` with two gzip-compressed
    datasets named 'x' and 'y' (compression level 9).

    Parameters:
    - path (str): Destination file path.
    - x (np.ndarray): Feature array stored as dataset 'x'.
    - y (np.ndarray): Label array stored as dataset 'y'.
    """
    with h5py.File(path, 'w') as hdf:
        hdf.create_dataset('x', data=x, compression='gzip', compression_opts=9)
        hdf.create_dataset('y', data=y, compression='gzip', compression_opts=9)


# Build the paths with os.path.join rather than a hard-coded '\\' separator,
# and create the output folder up front so the first write cannot fail on a
# missing directory.
output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)

save_xy(os.path.join(output_dir, 'train_df_30.h5'), train_x, train_y)
save_xy(os.path.join(output_dir, 'validation_df_30.h5'), validation_x, validation_y)
save_xy(os.path.join(output_dir, 'test_unit_13_30.h5'), test_unit_13_x, test_unit_13_y)
save_xy(os.path.join(output_dir, 'test_unit_14_30.h5'), test_unit_14_x, test_unit_14_y)
save_xy(os.path.join(output_dir, 'test_unit_15_30.h5'), test_unit_15_x, test_unit_15_y)