In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
import joblib
from sklearn.preprocessing import MinMaxScaler
from custom_functions import *

In [5]:
data_1 = import_data('re_train_data/train_FD001.txt')
data_3 = import_data('re_train_data/train_FD003.txt')

print(data_1.head())

   engine_id  cycle  op_setting_1  op_setting_2  op_setting_3  sensor_1  \
0          1      1       -0.0007       -0.0004         100.0    518.67   
1          1      2        0.0019       -0.0003         100.0    518.67   
2          1      3       -0.0043        0.0003         100.0    518.67   
3          1      4        0.0007        0.0000         100.0    518.67   
4          1      5       -0.0019       -0.0002         100.0    518.67   

   sensor_2  sensor_3  sensor_4  sensor_5  ...  sensor_11  sensor_12  \
0    641.82   1589.70   1400.60     14.62  ...      47.47     521.66   
1    642.15   1591.82   1403.14     14.62  ...      47.49     522.28   
2    642.35   1587.99   1404.20     14.62  ...      47.27     522.42   
3    642.35   1582.79   1401.87     14.62  ...      47.13     522.86   
4    642.37   1582.85   1406.22     14.62  ...      47.28     522.19   

   sensor_13  sensor_14  sensor_15  sensor_17  sensor_18  sensor_19  \
0    2388.02    8138.62     8.4195        392

In [6]:
print(data_1['engine_id'].nunique())
print(data_3['engine_id'].nunique())

100
100


In [7]:
data_3['engine_id'] = data_1['engine_id'].replace([i for i in range(1, 101)],[i for i in range(101, 101+100)])

data = pd.concat([data_1,data_3],ignore_index=True)

### 1.6 Data Types and Missing Values

The `data.info()` output provides a summary of the DataFrame, including the number of non-null entries for each column and their data types.

**Observations:**
* **No Missing Values:** All columns show `160359 non-null` entries, confirming there are no missing values in the combined training dataset, which simplifies preprocessing.
* **Data Types:** Most sensor readings and operational settings are `float64`, while `engine_id`, `cycle`, `sensor_17`, and `sensor_18` are `int64`. These data types are appropriate for numerical analysis.
* **Memory Usage:** The DataFrame occupies approximately 29.4 MB of memory.

This inspection confirms the data's integrity and readiness for further processing.

In [8]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45351 entries, 0 to 45350
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   engine_id     41262 non-null  float64
 1   cycle         45351 non-null  int64  
 2   op_setting_1  45351 non-null  float64
 3   op_setting_2  45351 non-null  float64
 4   op_setting_3  45351 non-null  float64
 5   sensor_1      45351 non-null  float64
 6   sensor_2      45351 non-null  float64
 7   sensor_3      45351 non-null  float64
 8   sensor_4      45351 non-null  float64
 9   sensor_5      45351 non-null  float64
 10  sensor_6      45351 non-null  float64
 11  sensor_7      45351 non-null  float64
 12  sensor_8      45351 non-null  float64
 13  sensor_9      45351 non-null  float64
 14  sensor_11     45351 non-null  float64
 15  sensor_12     45351 non-null  float64
 16  sensor_13     45351 non-null  float64
 17  sensor_14     45351 non-null  float64
 18  sensor_15     45351 non-nu

### 1.6 Data Types and Missing Values

The `data.info()` output provides a summary of the DataFrame, including the number of non-null entries for each column and their data types.

**Observations:**
* **No Missing Values:** All columns show `160359 non-null` entries, confirming there are no missing values in the combined training dataset, which simplifies preprocessing.
* **Data Types:** Most sensor readings and operational settings are `float64`, while `engine_id`, `cycle`, `sensor_17`, and `sensor_18` are `int64`. These data types are appropriate for numerical analysis.
* **Memory Usage:** The DataFrame occupies approximately 29.4 MB of memory.

This inspection confirms the data's integrity and readiness for further processing.

In [9]:
print(data.describe())

          engine_id         cycle  op_setting_1  op_setting_2  op_setting_3  \
count  41262.000000  45351.000000  45351.000000  45351.000000       45351.0   
mean     101.506568    125.307049     -0.000017      0.000004         100.0   
std       57.916271     87.813757      0.002191      0.000294           0.0   
min        1.000000      1.000000     -0.008700     -0.000600         100.0   
25%       52.000000     57.000000     -0.001500     -0.000200         100.0   
50%      100.500000    114.000000      0.000000      0.000000         100.0   
75%      152.000000    174.000000      0.001500      0.000300         100.0   
max      200.000000    525.000000      0.008700      0.000700         100.0   

       sensor_1      sensor_2      sensor_3      sensor_4      sensor_5  ...  \
count  45351.00  45351.000000  45351.000000  45351.000000  4.535100e+04  ...   
mean     518.67    642.559339   1589.190970   1406.501317  1.462000e+01  ...   
std        0.00      0.524596      6.622906     

## 2. Feature Engineering: Remaining Useful Life (RUL) Calculation

Predicting RUL requires a target variable that represents the time remaining until engine failure. For the C-MAPSS dataset, this is not directly provided but can be derived. The `max_cycles` function (from `custom_functions.py`) calculates the maximum `cycle` for each `engine_id` and then computes the RUL as `max_cycle_for_engine - current_cycle`. This creates a linearly decreasing target variable for each engine, going from its maximum operational cycle down to 0 at failure.

The `head()` output below for `engine_id`, `cycle`, and `RUL` demonstrates this calculation for the first engine. As the `cycle` increases, the `RUL` value correctly decreases, starting from 191 cycles remaining down to 0 (which will be seen at the tail of the data for each engine). This is a critical step in preparing the target variable for our predictive model.

In [10]:
data = max_cycles(data)

# Inspecting the RUL column
print(data[['engine_id', 'cycle', 'RUL']].head())

   engine_id  cycle    RUL
0        1.0      1  191.0
1        1.0      2  190.0
2        1.0      3  189.0
3        1.0      4  188.0
4        1.0      5  187.0


### 2.1 Verifying RUL at End-of-Life

By inspecting the `tail()` of the combined dataset, we can observe the behavior of the `RUL` column, particularly for engines nearing their failure point. The output above shows the last few cycles of `engine_id` 609. As expected, the `RUL` value decreases to `0` at the final recorded cycle (`cycle` 255 for `engine_id` 609), confirming the correct calculation of the Remaining Useful Life. This validates that our target variable is appropriately defined for the prediction task.

In [11]:
# Show df.tail() for an engine to verify RUL decreases to 0
print(data.tail())

       engine_id  cycle  op_setting_1  op_setting_2  op_setting_3  sensor_1  \
45346        NaN    148       -0.0016       -0.0003         100.0    518.67   
45347        NaN    149        0.0034       -0.0003         100.0    518.67   
45348        NaN    150       -0.0016        0.0004         100.0    518.67   
45349        NaN    151       -0.0023        0.0004         100.0    518.67   
45350        NaN    152        0.0000        0.0003         100.0    518.67   

       sensor_2  sensor_3  sensor_4  sensor_5  ...  sensor_12  sensor_13  \
45346    643.78   1596.01   1424.11     14.62  ...     519.66    2388.30   
45347    643.29   1596.38   1429.14     14.62  ...     519.91    2388.28   
45348    643.84   1604.53   1431.41     14.62  ...     519.44    2388.24   
45349    643.94   1597.56   1426.57     14.62  ...     520.01    2388.26   
45350    643.64   1599.04   1436.06     14.62  ...     519.48    2388.24   

       sensor_14  sensor_15  sensor_17  sensor_18  sensor_19  sensor

### 2.2 Final Columns After RUL Addition

This output shows all the columns currently present in our consolidated training DataFrame, including the newly added `RUL` column. This confirms that all relevant features and the target variable are ready for the next stages of preprocessing, such as feature scaling and sequence generation.

In [12]:
# Inspecting the columns of dataset
print(data.columns)

Index(['engine_id', 'cycle', 'op_setting_1', 'op_setting_2', 'op_setting_3',
       'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6',
       'sensor_7', 'sensor_8', 'sensor_9', 'sensor_11', 'sensor_12',
       'sensor_13', 'sensor_14', 'sensor_15', 'sensor_17', 'sensor_18',
       'sensor_19', 'sensor_20', 'sensor_21', 'RUL'],
      dtype='object')


## 3. Data Preprocessing: Scaling and Sequence Generation

Neural networks, especially LSTMs, perform best when input features are scaled to a common range. This prevents features with larger numerical values from dominating the learning process.

### 3.1 Feature Scaling

We use `MinMaxScaler` from `sklearn.preprocessing` to scale all operational settings and sensor measurements (`feature_cols`) to a range between 0 and 1. The `RUL` target variable is also scaled using a separate `MinMaxScaler` (`rul_scaler`). This is important because RUL values can be large, and scaling them helps the model converge faster and more stably.

* **`feature_scaler.pkl`**: The scaler fitted on `feature_cols` from the training data is saved to ensure that the same scaling transformation can be applied to new, unseen test data.
* **`rul_scaler.pkl`**: Similarly, the scaler for the `RUL` target variable is saved. This will be crucial for inverse-transforming the model's predicted RUL values back to their original scale for meaningful interpretation.

The `head()` output above shows the training DataFrame after all features and the RUL target have been scaled. Notice how all values are now within the [0, 1] range.

In [13]:
# Defining feature columns
feature_cols = [col for col in data.columns if col not in ['RUL',"engine_id"]]
print(feature_cols)

# Scaling the features that will be used to train and test
scaler = MinMaxScaler()
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Scaling the labels that will be used to train
rul_scaler = MinMaxScaler()
data['RUL'] = rul_scaler.fit_transform(data['RUL'].values.reshape(-1, 1))

# Checking the dataframe after scaling
print('The Training Dataframe after Scaling:')
print(data.head())

['cycle', 'op_setting_1', 'op_setting_2', 'op_setting_3', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7', 'sensor_8', 'sensor_9', 'sensor_11', 'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15', 'sensor_17', 'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21']
The Training Dataframe after Scaling:
   engine_id     cycle  op_setting_1  op_setting_2  op_setting_3  sensor_1  \
0        1.0  0.000000      0.459770      0.153846           0.0       0.0   
1        1.0  0.001908      0.609195      0.230769           0.0       0.0   
2        1.0  0.003817      0.252874      0.692308           0.0       0.0   
3        1.0  0.005725      0.540230      0.461538           0.0       0.0   
4        1.0  0.007634      0.390805      0.307692           0.0       0.0   

   sensor_2  sensor_3  sensor_4  sensor_5  ...  sensor_12  sensor_13  \
0  0.229508  0.482798  0.365358       0.0  ...   0.198166   0.648810   
1  0.306792  0.523094  0.404780       0.0  ...   0.22

### 3.2 Saving Scalers

It is crucial to save the `MinMaxScaler` objects (`feature_scaler.pkl` and `rul_scaler.pkl`) after fitting them to the training data. This ensures that:
1.  **Consistency:** The exact same scaling transformation (based on the training data's min/max values) can be applied to the test data or any future unseen data.
2.  **Inverse Transformation:** The `rul_scaler` can be used to convert the model's predicted, scaled RUL values back into their original, interpretable cycle counts.

This practice prevents data leakage from the test set and allows for consistent deployment of the preprocessing pipeline.

In [14]:
# It's good practice to save your scalers for later use on test data and for inverse transformation.
# Example: saving the scalers (though not runnable without actual saving mechanism)
import joblib
joblib.dump(scaler, 'feature_scaler.pkl')
joblib.dump(rul_scaler, 'rul_scaler.pkl')

['rul_scaler.pkl']

## 4. Feature Engineering: Creating Rolling Window Statistics

To capture the temporal dependencies and trends in the sensor data, we generate **rolling mean** and **rolling standard deviation** features for the selected sensor measurements. A `window_size` of 30 cycles is used. This means for each cycle, the rolling features are calculated based on the preceding 30 cycles (including the current one).

* **Rolling Mean:** Provides a smoothed trend of the sensor readings, indicating general deterioration.
* **Rolling Standard Deviation:** Captures the variability or volatility of sensor readings within the window, which can be an indicator of increasing instability as an engine degrades.

The output above demonstrates the `sensor_2` original values alongside its 30-cycle rolling mean and standard deviation for the first engine. Notice how the rolling mean starts from the current value and gradually smooths out as more data points fill the window. The rolling standard deviation provides insight into the local variability of the sensor. These features are vital for LSTMs to learn patterns over time.

In [15]:
# Defining the necessary variables for Forging Insights
window_size = 30
selected_sensors = [col for col in feature_cols if col not in ['cycle','op_setting_1','op_setting_2','op_setting_3']]

data = rolling_mean_std(data,window_size,feature_cols)

print("\n--- Displaying Rolling Features for Engine 1 (first 30 cycles) ---")
# Show how the rolling features look for engine 1
engine1_df = data[data['engine_id'] == 1]
print(engine1_df[['engine_id', 'cycle', 'sensor_2', f'sensor_2_rolling_mean_{window_size}', f'sensor_2_rolling_std_{window_size}']].head(15))


--- Displaying Rolling Features for Engine 1 (first 30 cycles) ---
    engine_id     cycle  sensor_2  sensor_2_rolling_mean_30  \
0         1.0  0.000000  0.229508                  0.229508   
1         1.0  0.001908  0.306792                  0.268150   
2         1.0  0.003817  0.353630                  0.296643   
3         1.0  0.005725  0.353630                  0.310890   
4         1.0  0.007634  0.358314                  0.320375   
5         1.0  0.009542  0.295082                  0.316159   
6         1.0  0.011450  0.384075                  0.325861   
7         1.0  0.013359  0.402810                  0.335480   
8         1.0  0.015267  0.299766                  0.331512   
9         1.0  0.017176  0.203747                  0.318735   
10        1.0  0.019084  0.337237                  0.320417   
11        1.0  0.020992  0.285714                  0.317525   
12        1.0  0.022901  0.522248                  0.333273   
13        1.0  0.024809  0.353630                 

### 4.1 Updated Feature Set for Model Training

After generating the rolling mean and standard deviation for all relevant sensor features, our feature set (`feature_cols`) has significantly expanded. The output below confirms the new list of features that will be used as input to the LSTM model. It now includes the original `cycle` and `op_setting` features, plus the original sensor readings, and their corresponding rolling mean and standard deviation features. This comprehensive set aims to provide the model with a rich representation of the engine's health over time.

In [16]:
# updating the feature_cols
feature_cols = [col for col in data.columns if col not in ['engine_id','RUL']]
print(feature_cols)
print(f'({len(feature_cols)})')

['cycle', 'op_setting_1', 'op_setting_2', 'op_setting_3', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7', 'sensor_8', 'sensor_9', 'sensor_11', 'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15', 'sensor_17', 'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21', 'sensor_1_rolling_mean_30', 'sensor_1_rolling_std_30', 'sensor_2_rolling_mean_30', 'sensor_2_rolling_std_30', 'sensor_3_rolling_mean_30', 'sensor_3_rolling_std_30', 'sensor_4_rolling_mean_30', 'sensor_4_rolling_std_30', 'sensor_5_rolling_mean_30', 'sensor_5_rolling_std_30', 'sensor_6_rolling_mean_30', 'sensor_6_rolling_std_30', 'sensor_7_rolling_mean_30', 'sensor_7_rolling_std_30', 'sensor_8_rolling_mean_30', 'sensor_8_rolling_std_30', 'sensor_9_rolling_mean_30', 'sensor_9_rolling_std_30', 'sensor_11_rolling_mean_30', 'sensor_11_rolling_std_30', 'sensor_12_rolling_mean_30', 'sensor_12_rolling_std_30', 'sensor_13_rolling_mean_30', 'sensor_13_rolling_std_30', 'sensor_14_rolling_mean_30', 'sensor_1

## 5. Sequence Generation for LSTM Model

LSTMs require input data to be in a sequence format (samples, timesteps, features). For our RUL prediction task, each sequence represents a fixed "look-back window" of an engine's operational history. The `create_sequences` function (from `custom_functions.py`) is used to transform our flattened DataFrame into this 3D format.

For each engine:
1.  It iterates through the engine's data, creating sequences of `sequence_length` (e.g., 30) cycles.
2.  Each sequence's target label is the RUL value at the *end* of that sequence.

This approach allows the LSTM to learn the temporal patterns leading up to a specific RUL value.

In [17]:
# Define the sequence length (hyperparameter)
sequence_length = 30 # Let's start with a look-back window of cycles

# Initialize empty lists to store all sequences and labels from all engines
X_train_sequences = []
y_train_labels = []

# Group the DataFrame by engine_id and iterate through each group (each engine)
# This is crucial to keep sequences from different engines separate
for engine_id, engine_df in data.groupby('engine_id'):
    # Generate sequences and labels for the current engine
    sequences_X, labels_y = create_sequences(engine_df, sequence_length, feature_cols)
    
    # Extend the main lists with the sequences and labels from this engine
    X_train_sequences.extend(sequences_X)
    y_train_labels.extend(labels_y)

### 5.2 Final Data Shapes for LSTM Input

After sequence generation, the data is converted into NumPy arrays, which is the required input format for Keras/TensorFlow models.

The output confirms the shapes of our processed arrays:
* **`X_train` (Training Sequences):** `(142698, 30, 61)`
    * **142698:** Number of training samples (sequences).
    * **30:** `sequence_length` (timesteps per sequence). This means each input to the LSTM will consist of 30 historical cycles.
    * **61:** Number of features per timestep (the `feature_cols`).
* **`y_train` (Training Labels):** `(142698, 1)`
    * **142698:** Number of corresponding RUL labels for each sequence.
    * **1:** Each label is a single RUL value.
* **`X_test` (Test Sequences):** Will have a shape like `(num_test_engines, 30, 61)` (though not explicitly printed here, it's inferred).

These reshaped arrays are now in the correct format to be fed into an LSTM neural network for training and prediction. The saving of `X_train.npy`, `X_test.npy`, and `y_train.npy` ensures that these preprocessed datasets can be easily loaded for model training without re-running the entire preprocessing pipeline.

In [18]:
# Convert the lists into NumPy arrays for Keras
# This is where we get our 3D array (samples, timesteps, features)
X_train = np.array(X_train_sequences)
y_train = np.array(y_train_labels)

# Saving The Processed Test set for late use
np.save('X_train.npy', X_train)

# Reshape y_train to be 2D for model fitting (if not already)
# This step is good practice to ensure compatibility with Keras
y_train = y_train.reshape(-1, 1)
np.save('y_train.npy', y_train)

# Checking all the Outputs of create_sequences
print("Shape of X_train (sequences):", X_train.shape)
print("Shape of y_train (RUL labels):", y_train.shape)
print("\nFirst sequence from X_train:")
print(X_train[0])

Shape of X_train (sequences): (35462, 30, 61)
Shape of y_train (RUL labels): (35462, 1)

First sequence from X_train:
[[0.         0.45977011 0.15384615 ... 0.02481076 0.50691159 0.00301762]
 [0.0019084  0.6091954  0.23076923 ... 0.02481076 0.50904537 0.00301762]
 [0.00381679 0.25287356 0.69230769 ... 0.03220801 0.48520271 0.04135178]
 ...
 [0.05152672 0.36206897 0.84615385 ... 0.05019797 0.47580282 0.05024965]
 [0.05343511 0.56896552 0.38461538 ... 0.04949831 0.47497849 0.04954346]
 [0.05534351 0.37356322 0.46153846 ... 0.04938171 0.47579553 0.04888702]]
