Part 1: Load and Structure the Dataset

In [1]:
import pandas as pd
import numpy as np

# Column layout of the raw file: engine unit id, cycle counter ("time"),
# three operating settings, then sensor1..sensor21 (26 columns in total).
col_names = ["unit", "time"]
col_names += [f"setting{idx}" for idx in range(1, 4)]
col_names += [f"sensor{idx}" for idx in range(1, 22)]

# The file is read without a header row and split on runs of whitespace,
# so the column names are supplied explicitly.
df = pd.read_csv(
    "../data/raw/train_FD001.txt",
    sep=r"\s+",
    header=None,
    names=col_names,
)

df.head()


Unnamed: 0,unit,time,setting1,setting2,setting3,sensor1,sensor2,sensor3,sensor4,sensor5,...,sensor12,sensor13,sensor14,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


Part 2: Calculate Remaining Useful Life (RUL)

In [2]:
# Last recorded cycle of every engine unit, as a two-column lookup table
# with columns (unit, max_time).
rul_df = df.groupby("unit")["time"].max().rename("max_time").reset_index()

# Attach each unit's last cycle to its rows, derive
# RUL = max_time - time (so the final row of a unit gets RUL == 0),
# then discard the helper column again.
df = (
    df.merge(rul_df, on="unit")
    .assign(RUL=lambda frame: frame["max_time"] - frame["time"])
    .drop(columns=["max_time"])
)

df.head()


Unnamed: 0,unit,time,setting1,setting2,setting3,sensor1,sensor2,sensor3,sensor4,sensor5,...,sensor13,sensor14,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


Part 3: Create Binary Failure Label

In [3]:
# Binary label: 1 if RUL <= failure_threshold, else 0.
failure_threshold = 20

# Vectorised comparison instead of a per-row Python lambda: the boolean mask
# is computed in one pass and cast to int64 so the column holds plain 0/1
# integers, exactly as the row-wise version produced.
df["fault"] = df["RUL"].le(failure_threshold).astype("int64")

# Preview the label next to the columns it is derived from.
df[["unit", "time", "RUL", "fault"]].head(10)


Unnamed: 0,unit,time,RUL,fault
0,1,1,191,0
1,1,2,190,0
2,1,3,189,0
3,1,4,188,0
4,1,5,187,0
5,1,6,186,0
6,1,7,185,0
7,1,8,184,0
8,1,9,183,0
9,1,10,182,0


Part 4: Drop Uninformative Sensors

In [4]:
# Sensor channels removed from the feature set before export.
# NOTE(review): in the previews above sensor1, sensor5, sensor16, sensor18 and
# sensor19 show a single repeated value in the first rows; sensor10 is hidden
# by the column ellipsis. Confirm they are constant over the whole dataset.
useless_sensors = ["sensor1", "sensor5", "sensor10", "sensor16", "sensor18", "sensor19"]

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell in the same kernel does not raise KeyError.
df = df.drop(columns=useless_sensors, errors="ignore")

Part 5: Export Preprocessed CSV

In [None]:
## Export cleaned dataset
# Single definition of the destination so the file that is written and the
# path reported in the message can never drift apart.
output_path = "../data/processed_fd001.csv"

# index=False keeps the DataFrame's row index out of the CSV.
df.to_csv(output_path, index=False)
print(f"Saved to {output_path}")

Saved to ../data/processed_fd001.csv
