In [1]:
import xgboost as xgb
from xgboost import XGBRegressor
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer, mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import numpy as np
from pathlib import Path
import torch

# Make the repository root importable so the `scripts.*` imports below resolve.
# The root is taken as two directory levels above the current working
# directory, so the result depends on where the kernel was started.
project_root = Path().resolve().parents[1]

# Guard the append: without it every re-run of this cell would push another
# duplicate entry onto sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from scripts.add_rul_column import add_rul_column
from scripts.get_last_cycle_per_unit import get_last_cycle_per_unit
from scripts.evaluate_regression import evaluate_regression

In [2]:
# Read the three raw FD001 files.
# The train/test files are parsed as single-space separated with no header
# row; the RUL file is read with no header into one column named "rul".
raw_read_kwargs = dict(sep=' ', header=None)

test_data = pd.read_csv('../../data/raw/test_FD001.txt', **raw_read_kwargs)
train_data = pd.read_csv('../../data/raw/train_FD001.txt', **raw_read_kwargs)
rul_data = pd.read_csv(
    "../../data/raw/rul_FD001.txt",
    names=["rul"],
    header=None,
)

In [3]:
# Remove columns that contain nothing but NaN
# (presumably artefacts of trailing separators in the raw file -- confirm).
train_data = train_data.dropna(axis='columns', how='all')

# Column layout: unit id, cycle counter, 3 operating settings, 21 sensors.
cols = ['unit', 'cycle']
cols.extend(f'op_setting_{i}' for i in range(1, 4))
cols.extend(f'sensor_{i}' for i in range(1, 22))
train_data.columns = cols

train_data.head()

Unnamed: 0,unit,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [4]:
# Columns excluded from the rest of the notebook: one operating setting and
# seven sensors (presumably uninformative for this subset -- confirm).
drop_columns_list = [
    'op_setting_3',
    'sensor_1', 'sensor_5', 'sensor_6', 'sensor_10',
    'sensor_16', 'sensor_18', 'sensor_19',
]

# Rebind instead of mutating in place; a missing column still raises KeyError.
train_data = train_data.drop(columns=drop_columns_list)

In [5]:
# Build the training target: cycles remaining until each unit's last recorded
# cycle, capped from above at `rul_max`.
rul_max = 125

# Per-unit last cycle, kept as its own frame (columns: 'unit', 'max_cycle').
rul_df = (
    train_data.groupby('unit')['cycle']
    .max()
    .reset_index()
    .rename(columns={'cycle': 'max_cycle'})
)

# Broadcast the per-unit maximum onto every row with `transform` rather than a
# merge. A repeated merge would create suffixed `max_cycle_x` / `max_cycle_y`
# columns and make the next line fail, so the cell could not be re-run;
# `assign` simply overwrites `max_cycle` and `rul` when they already exist.
last_cycle = train_data.groupby('unit')['cycle'].transform('max')
train_data = train_data.assign(
    max_cycle=last_cycle,
    rul=(last_cycle - train_data['cycle']).clip(upper=rul_max),
)

In [6]:
# Hold out whole engines rather than individual rows, so that no engine
# contributes cycles to both partitions.
engine_ids = train_data['unit'].unique()
train_ids, test_ids = train_test_split(
    engine_ids,
    test_size=0.2,
    random_state=42,
)

# Boolean row masks for each partition; .copy() gives independent frames so
# later edits do not touch `train_data`.
in_train = train_data['unit'].isin(train_ids)
in_holdout = train_data['unit'].isin(test_ids)
train_df = train_data.loc[in_train].copy()
test_df = train_data.loc[in_holdout].copy()

In [7]:
# Model inputs are every column except the engine id, the cycle counter, the
# helper `max_cycle` column and the `rul` target. Column order is preserved.
non_feature_cols = ('unit', 'cycle', 'max_cycle', 'rul')
feature_cols = list(
    filter(lambda col: col not in non_feature_cols, train_df.columns)
)

In [8]:
def create_sliding_windows(df, window_size, feature_cols):
    """Cut every engine's history into overlapping fixed-length windows.

    For each distinct value of ``df['unit']`` (in order of first appearance)
    the rows of that engine are taken in their existing order and a window of
    ``window_size`` consecutive rows is slid forward one row at a time. Each
    window is paired with the ``'rul'`` value of its last row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'unit'`` and ``'rul'`` plus everything
        listed in ``feature_cols``.
    window_size : int
        Number of consecutive rows per window; must be at least 1.
    feature_cols : list of str
        Columns copied into each window, in this order.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_windows, window_size, len(feature_cols))``.
    y : numpy.ndarray
        Shape ``(n_windows,)``; the ``'rul'`` value at the end of each window.

    Engines with fewer than ``window_size`` rows are skipped. If no engine is
    long enough, the returned arrays are empty but keep the shapes above
    (``n_windows == 0``) so downstream shape-dependent code still works.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(
            f"window_size must be a positive integer, got {window_size!r}"
        )

    sequences = []
    targets = []

    # sort=False keeps engines in order of first appearance, matching what
    # iterating over df['unit'].unique() would give.
    for _, engine_df in df.groupby('unit', sort=False):
        num_cycles = len(engine_df)
        if num_cycles < window_size:
            continue  # not enough history for even one window

        # Convert to NumPy once per engine; positional slices of these arrays
        # replace a label-based DataFrame lookup for every single window.
        features = engine_df.loc[:, feature_cols].to_numpy()
        rul = engine_df['rul'].to_numpy()

        num_windows = num_cycles - window_size + 1
        sequences.extend(
            features[start:start + window_size] for start in range(num_windows)
        )
        # The target of window k is the RUL at its last row, k + window_size - 1.
        targets.extend(rul[window_size - 1:])

    if not sequences:
        # np.array([]) would collapse to shape (0,); keep the documented rank.
        return np.empty((0, window_size, len(feature_cols))), np.empty((0,))

    X = np.array(sequences)
    y = np.array(targets)
    return X, y


In [9]:
# Every sample spans this many consecutive cycles of a single engine.
window_size = 30

# Window both partitions of the engine-level split with identical settings.
# `test_df` is the held-out part of the training file and serves as the
# validation set here.
X_train_seq, y_train_seq = create_sliding_windows(
    df=train_df,
    window_size=window_size,
    feature_cols=feature_cols,
)
X_val_seq, y_val_seq = create_sliding_windows(
    df=test_df,
    window_size=window_size,
    feature_cols=feature_cols,
)
