In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [25]:
# Cell 1: Imports and global configuration

import numpy as np
import pandas as pd
import time

from xgboost import XGBRegressor

from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt

# Reproducibility
RANDOM_STATE = 42

# Sliding window config for baseline (change as in your paper)
WINDOW_SIZE = 30   # number of cycles in each window
STRIDE = 1         # step size for sliding window

# File paths - adjust if your folder structure is different
TRAIN_PATH = "C:/Users/hrast/OneDrive/Desktop/Minor Project/CMaps/train_FD001.txt"
TEST_PATH  = "C:/Users/hrast/OneDrive/Desktop/Minor Project/CMaps/test_FD001.txt"
RUL_PATH   = "C:/Users/hrast/OneDrive/Desktop/Minor Project/CMaps/RUL_FD001.txt"


In [14]:
# Cell 3: Compute RUL for each cycle in the training set

def add_rul_to_train(train_df):
    """
    For each engine (unit_nr), the training data runs to actual failure.
    If an engine fails at cycle T_max, then at cycle t the RUL is:

        RUL(t) = T_max - t

    This function computes that RUL label for every cycle.
    """
    # Maximum cycle per engine = failure cycle
    max_cycle = train_df.groupby("unit_nr")["time_cycles"].max().rename("max_cycle")

    # Attach max_cycle to every row
    df = train_df.merge(max_cycle, on="unit_nr", how="left")

    # Compute Remaining Useful Life
    df["RUL"] = df["max_cycle"] - df["time_cycles"]

    # Drop helper column
    df = df.drop(columns=["max_cycle"])

    return df


train_labeled = add_rul_to_train(train_raw)
train_labeled.head()

Unnamed: 0,unit_nr,time_cycles,os_1,os_2,os_3,s_1,s_2,s_3,s_4,s_5,...,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [15]:
# Cell 4: Get final-cycle rows in test set and attach true RUL

def get_test_final_cycles(test_df, rul_df):
    """
    For each engine in the test set, we only know the RUL at its last observed cycle
    (given by RUL_FD001.txt).

    This function:
    - Finds the last cycle per engine in test_df
    - Keeps only that row per engine
    - Adds the corresponding RUL from rul_df
    """
    # Find last observed cycle per engine
    last_cycles = test_df.groupby("unit_nr")["time_cycles"].max().rename("last_cycle")

    # Attach last_cycle
    test_last = test_df.merge(last_cycles, on="unit_nr", how="left")

    # Filter rows where time_cycles == last_cycle => final cycle per engine
    test_last = test_last[test_last["time_cycles"] == test_last["last_cycle"]].copy()

    # Clean up
    test_last.drop(columns=["last_cycle"], inplace=True)

    # Ensure sorted by unit_nr to match RUL_FD001 order
    test_last = test_last.sort_values("unit_nr").reset_index(drop=True)
    rul_df = rul_df.reset_index(drop=True)

    # Attach true RUL from file
    test_last["RUL"] = rul_df["RUL"]

    return test_last


test_last_labeled = get_test_final_cycles(test_raw, rul_test_final)
test_last_labeled.head()


Unnamed: 0,unit_nr,time_cycles,os_1,os_2,os_3,s_1,s_2,s_3,s_4,s_5,...,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21,RUL
0,1,31,-0.0006,0.0004,100.0,518.67,642.58,1581.22,1398.91,14.62,...,2388.06,8130.11,8.4024,0.03,393,2388,100.0,38.81,23.3552,112
1,2,49,0.0018,-0.0001,100.0,518.67,642.55,1586.59,1410.83,14.62,...,2388.09,8126.9,8.4505,0.03,391,2388,100.0,38.81,23.2618,98
2,3,126,-0.0016,0.0004,100.0,518.67,642.88,1589.75,1418.89,14.62,...,2388.14,8131.46,8.4119,0.03,395,2388,100.0,38.93,23.274,69
3,4,106,0.0012,0.0004,100.0,518.67,642.78,1594.53,1406.88,14.62,...,2388.11,8133.64,8.4634,0.03,395,2388,100.0,38.58,23.2581,82
4,5,98,-0.0013,-0.0004,100.0,518.67,642.27,1589.94,1419.36,14.62,...,2388.15,8125.74,8.4362,0.03,394,2388,100.0,38.75,23.4117,91


In [16]:
# Cell 5: Define which columns are sensors and operating settings

SENSOR_COLS  = [f"s_{i}" for i in range(1, 22)]  # 21 sensors
SETTING_COLS = ["os_1", "os_2", "os_3"]          # 3 operating settings

print("Number of sensors:", len(SENSOR_COLS))
print("Number of settings:", len(SETTING_COLS))


Number of sensors: 21
Number of settings: 3


In [26]:
# Cell 6: Sliding-window rolling-statistics features (baseline features)

def generate_rolling_features_for_unit(df_unit, window=WINDOW_SIZE, stride=STRIDE):
    """
    Generate rolling-window statistical features for ONE engine (one unit_nr).

    For each window (of length 'window' cycles):
    - Input: all sensor and setting values in that window
    - Output: one feature vector with:
        * unit_nr, time_cycles (at window end)
        * RUL at window end (for training)
        * For each column c in SENSOR_COLS + SETTING_COLS:
            - c_mean, c_std, c_min, c_max, c_skew, c_kurt

    The label is the RUL at the last cycle in the window.
    """
    # Make sure data is sorted in time
    df_unit = df_unit.sort_values("time_cycles").reset_index(drop=True)

    feature_rows = []
    n = len(df_unit)

    # end_idx starts at window-1 so that we always have 'window' points
    for end_idx in range(window - 1, n, stride):
        start_idx = end_idx - window + 1

        # Select the window slice
        window_slice = df_unit.iloc[start_idx:end_idx + 1]

        # Create a dict to store features for this window
        row_features = {
            "unit_nr": df_unit.loc[end_idx, "unit_nr"],
            "time_cycles": df_unit.loc[end_idx, "time_cycles"],
            # RUL label at window end (works for training data;
            # for test we will ignore this and override later)
            "RUL": df_unit.loc[end_idx, "RUL"],
        }

        # Compute statistics for each sensor + setting within this window
        for col in SENSOR_COLS + SETTING_COLS:
            values = window_slice[col].values.astype(float)

            # Basic statistics
            row_features[f"{col}_mean"] = values.mean()
            row_features[f"{col}_std"]  = values.std(ddof=1) if len(values) > 1 else 0.0
            row_features[f"{col}_min"]  = values.min()
            row_features[f"{col}_max"]  = values.max()

            # Higher-order statistics (handle short windows safely)
            if len(values) > 3:
                row_features[f"{col}_skew"] = skew(values, bias=False)
                row_features[f"{col}_kurt"] = kurtosis(values, fisher=True, bias=False)
            else:
                row_features[f"{col}_skew"] = 0.0
                row_features[f"{col}_kurt"] = 0.0

        feature_rows.append(row_features)

    return pd.DataFrame(feature_rows)


def generate_rolling_features(df, window=WINDOW_SIZE, stride=STRIDE):
    """
    Apply rolling-window feature extraction to ALL engines in a dataframe.

    We group by unit_nr and call generate_rolling_features_for_unit for each group,
    then concatenate all feature rows.
    """
    all_features = []

    for unit_id, df_unit in df.groupby("unit_nr"):
        feats_unit = generate_rolling_features_for_unit(df_unit, window, stride)
        all_features.append(feats_unit)

    features_df = pd.concat(all_features, axis=0).reset_index(drop=True)
    return features_df

# Generate baseline rolling features for the labeled training data
# and measure the time taken (Baseline Serial time for the paper)

start_time = time.time()

train_features = generate_rolling_features(train_labeled)

end_time = time.time()
baseline_train_time = end_time - start_time

print(f"Baseline (Serial) TRAIN feature extraction time: {baseline_train_time:.2f} seconds")
print("train_features shape:", train_features.shape)

# Optional: quick sanity check preview
train_features.head()

  row_features[f"{col}_skew"] = skew(values, bias=False)
  row_features[f"{col}_kurt"] = kurtosis(values, fisher=True, bias=False)


Baseline (Serial) TRAIN feature extraction time: 701.10 seconds
train_features shape: (17731, 147)


Unnamed: 0,unit_nr,time_cycles,RUL,s_1_mean,s_1_std,s_1_min,s_1_max,s_1_skew,s_1_kurt,s_2_mean,...,os_2_min,os_2_max,os_2_skew,os_2_kurt,os_3_mean,os_3_std,os_3_min,os_3_max,os_3_skew,os_3_kurt
0,1,30,162,518.67,0.0,518.67,518.67,,,642.328333,...,-0.0004,0.0005,0.083536,-0.926274,100.0,0.0,100.0,100.0,,
1,1,31,161,518.67,0.0,518.67,518.67,,,642.335,...,-0.0004,0.0005,0.06784,-0.917315,100.0,0.0,100.0,100.0,,
2,1,32,160,518.67,0.0,518.67,518.67,,,642.341,...,-0.0004,0.0005,0.06784,-0.917315,100.0,0.0,100.0,100.0,,
3,1,33,159,518.67,0.0,518.67,518.67,,,642.353,...,-0.0004,0.0005,0.159762,-0.906411,100.0,0.0,100.0,100.0,,
4,1,34,158,518.67,0.0,518.67,518.67,,,642.359333,...,-0.0004,0.0005,0.194326,-0.922053,100.0,0.0,100.0,100.0,,


In [27]:
# Cell 7: Prepare feature matrix (X), labels (y), and groups (engine IDs)

# Columns we do NOT want to use as input features
non_feature_cols = ["unit_nr", "time_cycles", "RUL"]

# All other columns in train_features are engineered features
feature_cols = [c for c in train_features.columns if c not in non_feature_cols]

# X = feature matrix (inputs to the model)
X = train_features[feature_cols].values

# y = target labels (RUL at the end of each window)
y = train_features["RUL"].values

# groups = engine IDs for GroupKFold, so windows from the same engine stay together
groups = train_features["unit_nr"].values

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Number of features:", len(feature_cols))
print("Example feature names:", feature_cols[:10])


X shape: (17731, 144)
y shape: (17731,)
Number of features: 144
Example feature names: ['s_1_mean', 's_1_std', 's_1_min', 's_1_max', 's_1_skew', 's_1_kurt', 's_2_mean', 's_2_std', 's_2_min', 's_2_max']


In [28]:
# Cell 8: GroupKFold cross-validation for baseline model

from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

def rmse(y_true, y_pred):
    """Root Mean Squared Error."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

# 5-fold GroupKFold: split by engine id (groups)
gkf = GroupKFold(n_splits=5)

cv_rmse_scores = []
cv_mae_scores  = []

fold_idx = 1

for train_idx, val_idx in gkf.split(X, y, groups):
    # Split features and labels according to indices
    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # Baseline XGBoost model (you can tune these later)
    model = XGBRegressor(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        objective="reg:squarederror",
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )

    # Train on training split
    model.fit(X_tr, y_tr)

    # Predict on validation split
    y_val_pred = model.predict(X_val)

    # Compute metrics
    fold_rmse = rmse(y_val, y_val_pred)
    fold_mae  = mean_absolute_error(y_val, y_val_pred)

    cv_rmse_scores.append(fold_rmse)
    cv_mae_scores.append(fold_mae)

    print(f"Fold {fold_idx}: RMSE = {fold_rmse:.3f}, MAE = {fold_mae:.3f}")
    fold_idx += 1

print("\n===== Baseline CV performance (rolling stats features) =====")
print("CV RMSE: mean = {:.3f}, std = {:.3f}".format(
    np.mean(cv_rmse_scores), np.std(cv_rmse_scores)
))
print("CV MAE : mean = {:.3f}, std = {:.3f}".format(
    np.mean(cv_mae_scores), np.std(cv_mae_scores)
))

Fold 1: RMSE = 45.090, MAE = 29.555
Fold 2: RMSE = 40.163, MAE = 27.074
Fold 3: RMSE = 37.428, MAE = 25.493
Fold 4: RMSE = 40.681, MAE = 29.015
Fold 5: RMSE = 36.154, MAE = 25.298

===== Baseline CV performance (rolling stats features) =====
CV RMSE: mean = 39.903, std = 3.091
CV MAE : mean = 27.287, std = 1.752


In [31]:
# Cell 9: Train final baseline model on ALL training data

baseline_model = XGBRegressor(
    n_estimators=400,          # more trees since we're training on full data
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="reg:squarederror",
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# Train model on ALL training windows
start = time.time()
baseline_model.fit(X, y)
end = time.time()

print("Final baseline model trained successfully.")
print(f"Baseline model training time: {end - start:.2f} seconds")

Final baseline model trained successfully.
Baseline model training time: 4.14 seconds


In [32]:
# Cell 10: Generate rolling features for the TEST set (baseline features)
#          and measure Baseline (Serial) TEST feature extraction time.

def add_dummy_rul(test_df):
    """
    For the TEST set, we do NOT know the RUL at every cycle.
    We only know the RUL at the LAST cycle of each engine (from RUL_FD001.txt).

    However, our rolling feature function expects a 'RUL' column to exist.
    For the TEST set we simply add a dummy RUL column (e.g., zeros).
    We will NOT use these dummy RUL values as labels; we only care about
    the engineered features.
    """
    df = test_df.copy()
    df["RUL"] = 0  # placeholder; real RUL labels will be attached later only for final windows
    return df


# Add dummy RUL so generate_rolling_features() can run on the test data
test_with_dummy_rul = add_dummy_rul(test_raw)

# Measure time for Baseline (Serial) TEST feature extraction
start_time = time.time()

# Generate rolling-window features for ALL test engines
test_features_full = generate_rolling_features(test_with_dummy_rul)

end_time = time.time()
baseline_test_time = end_time - start_time

print(f"Baseline (Serial) TEST feature extraction time: {baseline_test_time:.2f} seconds")
print("test_features_full shape:", test_features_full.shape)

# Optional: quick sanity check preview
test_features_full.head()

  row_features[f"{col}_skew"] = skew(values, bias=False)
  row_features[f"{col}_kurt"] = kurtosis(values, fisher=True, bias=False)


Baseline (Serial) TEST feature extraction time: 416.19 seconds
test_features_full shape: (10196, 147)


Unnamed: 0,unit_nr,time_cycles,RUL,s_1_mean,s_1_std,s_1_min,s_1_max,s_1_skew,s_1_kurt,s_2_mean,...,os_2_min,os_2_max,os_2_skew,os_2_kurt,os_3_mean,os_3_std,os_3_min,os_3_max,os_3_skew,os_3_kurt
0,1,30,0,518.67,0.0,518.67,518.67,,,642.310333,...,-0.0005,0.0005,-0.29213,-0.567482,100.0,0.0,100.0,100.0,,
1,1,31,0,518.67,0.0,518.67,518.67,,,642.295667,...,-0.0005,0.0005,-0.251029,-0.601997,100.0,0.0,100.0,100.0,,
2,2,30,0,518.67,0.0,518.67,518.67,,,642.537333,...,-0.0004,0.0005,-0.364717,-1.005124,100.0,0.0,100.0,100.0,,
3,2,31,0,518.67,0.0,518.67,518.67,,,642.541,...,-0.0004,0.0005,-0.369689,-0.93382,100.0,0.0,100.0,100.0,,
4,2,32,0,518.67,0.0,518.67,518.67,,,642.55,...,-0.0004,0.0005,-0.369689,-0.93382,100.0,0.0,100.0,100.0,,


In [33]:
# Cell 11: Final TEST evaluation for the baseline model

# 1) For each test engine, keep only the LAST window (closest to final cycle)
#    This ensures we make exactly ONE prediction per engine,
#    which we can compare to RUL_FD001.txt.

# Find, for each unit_nr, the index of the row with the largest time_cycles
idx_last_per_unit = test_features_full.groupby("unit_nr")["time_cycles"].idxmax()

# Select those rows and sort by unit_nr to align with rul_test_final
test_features_last = (
    test_features_full.loc[idx_last_per_unit]
    .sort_values("unit_nr")
    .reset_index(drop=True)
)

print("test_features_last shape (should be 100 rows for FD001):", test_features_last.shape)

# 2) Attach the true RUL from RUL_FD001.txt as the label for evaluation

# Ensure RUL file is aligned (already loaded as rul_test_final earlier)
rul_ordered = rul_test_final.reset_index(drop=True)
test_features_last["RUL"] = rul_ordered["RUL"].values

# 3) Build TEST feature matrix X_test using the SAME feature_cols as training

X_test = test_features_last[feature_cols].values
y_test = test_features_last["RUL"].values

print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# 4) Predict RUL on the test set using the trained baseline model

y_test_pred = baseline_model.predict(X_test)

# 5) Evaluate TEST performance (RMSE and MAE at final cycle per engine)

test_rmse = rmse(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print("\n===== Baseline TEST performance (final cycle per engine) =====")
print(f"Test RMSE: {test_rmse:.3f}")
print(f"Test MAE : {test_mae:.3f}")

test_features_last shape (should be 100 rows for FD001): (100, 147)
X_test shape: (100, 144)
y_test shape: (100,)

===== Baseline TEST performance (final cycle per engine) =====
Test RMSE: 29.009
Test MAE : 21.226
