In [2]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Paths
# The project root defaults to the original workstation location. Set the
# TURBOFAN_PROJECT_DIR environment variable to run the notebook against a
# checkout that lives somewhere else, without editing this cell.
PROJECT = Path(
    os.environ.get(
        "TURBOFAN_PROJECT_DIR",
        "C:/Users/Admin/Desktop/Projects/turbofan-health-explorer",
    )
)
PROC = PROJECT / "data" / "processed"
REPORTS = PROJECT / "reports"
# Create the reports folder on first run; a no-op when it already exists.
REPORTS.mkdir(exist_ok=True)

# Load labeled dataset
df = pd.read_parquet(PROC / "train_FD001_labeled.parquet")

# Print the (rows, columns) shape, then preview the first rows as the cell output.
print("Loaded:", df.shape)
df.head()


Loaded: (20631, 24)


Unnamed: 0,unit,cycle,op1,op2,op3,s2,s3,s4,s7,s8,...,s4_rollmean,s4_rollstd,s2_ema,s3_ema,s4_ema,s2_slope,s3_slope,s4_slope,RUL,health_stage
0,1,1,-0.0007,-0.0004,100.0,641.820007,1589.699951,1400.599976,554.359985,2388.060059,...,1400.599976,0.0,641.820007,1589.699951,1400.599976,,,,191,healthy
1,1,2,0.0019,-0.0003,100.0,642.150024,1591.819946,1403.140015,553.75,2388.040039,...,1401.869995,1.796079,641.851438,1589.901855,1400.841884,0.330017,2.119995,2.540039,190,healthy
2,1,3,-0.0043,0.0003,100.0,642.349976,1587.98999,1404.199951,554.26001,2388.080078,...,1402.646647,1.850004,641.898917,1589.719773,1401.1617,0.264984,-0.85498,1.799988,189,healthy
3,1,4,0.0007,0.0,100.0,642.349976,1582.790039,1401.869995,554.450012,2388.110107,...,1402.452484,1.559639,641.941875,1589.059798,1401.229157,0.178986,-2.455969,0.487,188,healthy
4,1,5,-0.0019,-0.0002,100.0,642.369995,1582.849976,1406.219971,554.0,2388.060059,...,1403.205981,2.159432,641.982649,1588.468387,1401.704472,0.129993,-2.272986,0.996997,187,healthy


In [3]:
# Distinct engine IDs found in the "unit" column, and how many there are.
units = df["unit"].unique()
n_units = units.shape[0]

print(
    f"Total unique engines: {n_units}",
)
print(
    "First 10 engine IDs:",
    units[:10],
)


Total unique engines: 100
First 10 engine IDs: [ 1  2  3  4  5  6  7  8  9 10]


In [4]:
from sklearn.model_selection import train_test_split

# Reproducible split, done on engine IDs rather than on rows:
# first hold out 20% of the units for test, then take 10% of the remaining
# units as validation. Both calls use the same fixed random_state.
trainval_units, test_units = train_test_split(units, random_state=42, test_size=0.2)
train_units, val_units = train_test_split(trainval_units, random_state=42, test_size=0.1)

print(
    f"Train: {len(train_units)} units",
    f"Val:   {len(val_units)} units",
    f"Test:  {len(test_units)} units",
    sep="\n",
)


Train: 72 units
Val:   8 units
Test:  20 units


In [5]:
def assign_split(unit):
    """Return the split label for a single engine unit ID.

    Membership is tested against the notebook-level ``train_units`` and
    ``val_units`` collections, in that order.

    Returns:
        "train" or "val" when the unit is found in the matching collection;
        "test" for every other unit (including any ID that is in no split).
    """
    for label, members in (("train", train_units), ("val", val_units)):
        if unit in members:
            return label
    # Fall-through: anything not in train or val is treated as test.
    return "test"

# Tag every row with the split of the engine it belongs to.
df["split"] = df["unit"].map(assign_split)

# Row counts per split (displayed as the cell output).
df["split"].value_counts()


split
train    14874
test      4070
val       1687
Name: count, dtype: int64

In [6]:
# Save unit ID lists for reproducibility: one CSV per split, written next to
# the processed data, without the pandas index column.
split_files = {
    "units_train.csv": train_units,
    "units_val.csv": val_units,
    "units_test.csv": test_units,
}
for filename, unit_ids in split_files.items():
    pd.Series(unit_ids).to_csv(PROC / filename, index=False)

print(
    "Saved split definitions:",
    " - units_train.csv",
    " - units_val.csv",
    " - units_test.csv",
    sep="\n",
)


Saved split definitions:
 - units_train.csv
 - units_val.csv
 - units_test.csv


In [7]:
# Sanity checks on the unit-level split.
print("Split Checks:")

# The splits must be pairwise disjoint. A three-way intersection
# (train & val & test) is empty as soon as any one pair is disjoint, so it
# would still report True for a unit shared by only two of the splits.
train_set, val_set, test_set = set(train_units), set(val_units), set(test_units)
print("- No duplicates across splits:",
      train_set.isdisjoint(val_set)
      and train_set.isdisjoint(test_set)
      and val_set.isdisjoint(test_set))

print("- Train units:", len(train_units))
print("- Validation units:", len(val_units))
print("- Test units:", len(test_units))

# Min / max / mean of the "cycle" column within each split, rounded to 1 decimal.
print("\nCycle range per split:")
display(df.groupby("split")["cycle"].agg(["min", "max", "mean"]).round(1))

# Share of rows (in percent, 2 decimals) per health_stage value within each split.
print("\nHealth stage distribution per split:")
display(df.groupby("split")["health_stage"].value_counts(normalize=True).mul(100).round(2))


Split Checks:
- No duplicates across splits: True
- Train units: 72
- Validation units: 8
- Test units: 20

Cycle range per split:


Unnamed: 0_level_0,min,max,mean
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test,1,269,106.5
train,1,362,108.7
val,1,341,115.3



Health stage distribution per split:


split  health_stage
test   healthy         40.54
       near_fail       25.06
train  healthy         41.43
       near_fail       24.69
val    healthy         42.62
       near_fail       24.18
Name: proportion, dtype: float64

In [8]:
# Write the frame (which now carries the "split" column) to the processed
# data folder, without the pandas index.
out_path = PROC.joinpath("train_FD001_split.parquet")
df.to_parquet(path=out_path, index=False)
print("Saved split dataset to:", out_path)


Saved split dataset to: C:\Users\Admin\Desktop\Projects\turbofan-health-explorer\data\processed\train_FD001_split.parquet


In [9]:
# One summary row per split: its display name, the number of engine units,
# and the first five unit IDs joined into a comma-separated string.
split_members = {
    "Train": train_units,
    "Validation": val_units,
    "Test": test_units,
}
summary = pd.DataFrame(
    [
        {
            "Split": split_name,
            "Num_Units": len(unit_ids),
            "Example_Units": ", ".join(str(unit_id) for unit_id in unit_ids[:5]),
        }
        for split_name, unit_ids in split_members.items()
    ]
)
display(summary)
# Keep a copy of the summary table in the reports folder.
summary.to_csv(REPORTS / "split_summary.csv", index=False)


Unnamed: 0,Split,Num_Units,Example_Units
0,Train,72,"70, 86, 98, 39, 3"
1,Validation,8,"25, 56, 8, 4, 36"
2,Test,20,"84, 54, 71, 46, 45"
