In [2]:
# Candidate model inputs: three operating settings, three raw sensors, and a
# rolling-mean and an EMA variant of each of those sensors.
FEATURES = (
    ["op1", "op2", "op3"]
    + ["s2", "s3", "s4"]
    + [f"{sensor}_rollmean" for sensor in ("s2", "s3", "s4")]
    + [f"{sensor}_ema" for sensor in ("s2", "s3", "s4")]
)

# NOTE(review): presumably the window/span used to build the *_rollmean and
# *_ema columns, and the seed used for the train/val/test split — none of them
# is read by the cells visible here, so confirm against the preprocessing step.
ROLL_WINDOW, EMA_SPAN = 20, 20
SPLIT_SEED = 42


In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Paths
# The project root defaults to the original workstation location, but can be
# redirected by setting the TURBOFAN_PROJECT_DIR environment variable so the
# notebook also runs on machines where that absolute path does not exist.
PROJECT = Path(
    os.environ.get(
        "TURBOFAN_PROJECT_DIR",
        "C:/Users/Admin/Desktop/Projects/turbofan-health-explorer",
    )
)
PROC = PROJECT / "data" / "processed"
REPORTS = PROJECT / "reports"
# parents=True creates any missing ancestor directories instead of raising
# FileNotFoundError; exist_ok=True keeps re-runs of this cell idempotent.
REPORTS.mkdir(parents=True, exist_ok=True)

# Load split dataset (later cells select rows through its "split" column)
df = pd.read_parquet(PROC / "train_FD001_split.parquet")

print("Loaded:", df.shape)
df.head()


Loaded: (20631, 25)


Unnamed: 0,unit,cycle,op1,op2,op3,s2,s3,s4,s7,s8,...,s4_rollstd,s2_ema,s3_ema,s4_ema,s2_slope,s3_slope,s4_slope,RUL,health_stage,split
0,1,1,-0.0007,-0.0004,100.0,641.820007,1589.699951,1400.599976,554.359985,2388.060059,...,0.0,641.820007,1589.699951,1400.599976,,,,191,healthy,test
1,1,2,0.0019,-0.0003,100.0,642.150024,1591.819946,1403.140015,553.75,2388.040039,...,1.796079,641.851438,1589.901855,1400.841884,0.330017,2.119995,2.540039,190,healthy,test
2,1,3,-0.0043,0.0003,100.0,642.349976,1587.98999,1404.199951,554.26001,2388.080078,...,1.850004,641.898917,1589.719773,1401.1617,0.264984,-0.85498,1.799988,189,healthy,test
3,1,4,0.0007,0.0,100.0,642.349976,1582.790039,1401.869995,554.450012,2388.110107,...,1.559639,641.941875,1589.059798,1401.229157,0.178986,-2.455969,0.487,188,healthy,test
4,1,5,-0.0019,-0.0002,100.0,642.369995,1582.849976,1406.219971,554.0,2388.060059,...,2.159432,641.982649,1588.468387,1401.704472,0.129993,-2.272986,0.996997,187,healthy,test


In [2]:
# Model inputs used by every baseline below.
features = [
    *("op1", "op2", "op3"),                                      # operating settings
    *("s2", "s3", "s4"),                                         # informative sensors
    *(f"{sensor}_rollmean" for sensor in ("s2", "s3", "s4")),    # rolling means of those sensors
]

# Column names of the regression and classification targets.
target_reg, target_cls = "RUL", "health_stage"

print("Feature count:", len(features))


Feature count: 9


In [3]:
# Materialise one independent copy of the frame per value of the "split"
# column, in the order train / val / test.
train_df, val_df, test_df = (
    df.loc[df["split"].eq(part)].copy() for part in ("train", "val", "test")
)

print(train_df.shape, val_df.shape, test_df.shape)


(14874, 25) (1687, 25) (4070, 25)


In [4]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to all three splits.
scaler = StandardScaler().fit(train_df[features])

X_train, X_val, X_test = (
    scaler.transform(part[features]) for part in (train_df, val_df, test_df)
)

# Regression targets, one per split.
y_train_reg, y_val_reg, y_test_reg = (
    part[target_reg] for part in (train_df, val_df, test_df)
)

# Classification targets, one per split.
y_train_cls, y_val_cls, y_test_cls = (
    part[target_cls] for part in (train_df, val_df, test_df)
)

print("Scaling complete — train mean ~0, std ~1")


Scaling complete — train mean ~0, std ~1


In [5]:
# Baseline regressor: ordinary least squares on the scaled training features.
reg = LinearRegression().fit(X_train, y_train_reg)

# Both scores below are computed on the held-out test split.
y_pred_reg = reg.predict(X_test)
mae, r2 = (
    metric(y_test_reg, y_pred_reg) for metric in (mean_absolute_error, r2_score)
)

print(f"Linear Regression RUL MAE: {mae:.2f}")
print(f"R²: {r2:.3f}")


Linear Regression RUL MAE: 31.55
R²: 0.616


In [6]:
# Encode labels
# The encoder is fitted on the training labels only; val/test labels are
# mapped with the same fitted encoder.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train_cls)
y_val_enc   = le.transform(y_val_cls)
y_test_enc  = le.transform(y_test_cls)

# Baseline classifier on the scaled features.
cls = LogisticRegression(max_iter=1000)
cls.fit(X_train, y_train_enc)

# Accuracy and the per-class report are computed on the test split.
y_pred_cls = cls.predict(X_test)
acc = accuracy_score(y_test_enc, y_pred_cls)

print(f"Logistic Regression Accuracy: {acc:.3f}")
# Passing `labels` pins the report rows to every encoded class. Without it,
# classification_report infers the classes from y_true/y_pred and raises when
# that count differs from len(target_names), e.g. if a class is absent from
# the evaluated split. zero_division=0 keeps the scores for such empty classes
# at 0 without emitting warnings.
print(
    "\nClass Report:\n",
    classification_report(
        y_test_enc,
        y_pred_cls,
        labels=np.arange(len(le.classes_)),
        target_names=le.classes_,
        zero_division=0,
    ),
)


Logistic Regression Accuracy: 0.676

Class Report:
               precision    recall  f1-score   support

     healthy       0.71      0.68      0.70      1650
   near_fail       0.81      0.86      0.83      1020

    accuracy                           0.68      4070
   macro avg       0.68      0.69      0.69      4070
weighted avg       0.67      0.68      0.68      4070



In [7]:
# One row per baseline model; values are stored as pre-formatted strings.
results = dict(
    model=["LinearRegression_RUL", "LogisticRegression_HealthStage"],
    metric=["MAE / R2", "Accuracy"],
    value=[f"{mae:.2f} / {r2:.3f}", f"{acc:.3f}"],
)

# Persist the summary table under the reports directory, then show it inline.
df_results = pd.DataFrame.from_dict(results)
df_results.to_csv(REPORTS / "baseline_metrics.csv", index=False)
display(df_results)


Unnamed: 0,model,metric,value
0,LinearRegression_RUL,MAE / R2,31.55 / 0.616
1,LogisticRegression_HealthStage,Accuracy,0.676


In [8]:
# Sanity checks on the baseline results.
# Uniform-guess accuracy derived from the number of encoded classes, so the
# threshold stays correct if the set of health stages changes.
chance_level = 1 / len(le.classes_)

print("Quality Checks")
# `mae` and `r2` were computed on the test split, so the label says "Test".
print("- Test MAE > 0 and R² > 0:", mae > 0 and r2 > 0)
print(f"- Accuracy > random guess (~{chance_level:.0%}):", acc > chance_level)
# Checks that the per-feature means of the scaled training matrix are close to 0.
print("- Feature scaling mean ≈ 0:", np.allclose(X_train.mean(axis=0), 0, atol=1e-1))


Quality Checks
- Train MAE > 0 and R² > 0: True
- Accuracy > random guess (~33%): True
- Feature scaling mean ≈ 0: True
