In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import joblib


In [18]:
# Update path if needed
DATA_PATH = "../data/gas_turbine_data.csv"

df = pd.read_csv(DATA_PATH, parse_dates=["timestamp"])
df = df.sort_values("timestamp").reset_index(drop=True)
np.random.seed(42)
df["compressor_risk"] = np.random.uniform(0, 1, len(df))
df.head()


Unnamed: 0,timestamp,asset_type,output_current,pump_voltage,bearing_vibration,exhaust_chemical_percentage,compressor_temperature,intake_air_temperature,bearing_risk,compressor_risk,pump_risk,exhaust_path_risk,cooling_or_lubrication_risk,shutdown_risk
0,2025-01-01 00:00:00,gas_turbine,124.967142,426.055909,0.041252,4.640714,525.565249,15.460962,0.412522,0.37454,0.174301,0.773452,0.706261,0.577214
1,2025-01-01 01:00:00,gas_turbine,118.617357,412.622677,0.035706,3.463604,543.536417,20.698075,0.357059,0.950714,0.342217,0.577267,0.67853,0.568994
2,2025-01-01 02:00:00,gas_turbine,126.476885,426.956847,0.041223,3.074956,529.066398,22.931972,0.41223,0.731994,0.163039,0.512493,0.706115,0.548749
3,2025-01-01 03:00:00,gas_turbine,135.230299,430.845103,0.045433,2.591771,534.566898,34.438438,0.45433,0.598658,0.114436,0.431962,0.727165,0.547815
4,2025-01-01 04:00:00,gas_turbine,117.658466,423.307479,0.040489,2.418116,573.469042,27.782766,0.404886,0.156019,0.208657,0.403019,0.702443,0.543988


In [19]:
required_columns = [
    "timestamp",
    "output_current",
    "pump_voltage",
    "bearing_vibration",
    "exhaust_chemical_percentage",
    "compressor_temperature",
    "intake_air_temperature",
    "pump_risk",
    "bearing_risk",
    "compressor_risk",
    "exhaust_path_risk",
    "cooling_or_lubrication_risk",
    "shutdown_risk"
]

missing = set(required_columns) - set(df.columns)
if missing:
    raise ValueError(f"Missing columns: {missing}")

# Forward fill missing values (time-series safe)
df = df.ffill()


In [20]:
WINDOW = 24      # rolling window (hours)
LAGS = [ 6, 24]
sensor_cols = [
    "output_current",
    "pump_voltage",
    "bearing_vibration",
    "exhaust_chemical_percentage",
    "compressor_temperature",
    "intake_air_temperature",
]

# Lag features
for col in sensor_cols:
    for lag in LAGS:
        df[f"{col}_lag_{lag}"] = df[col].shift(lag)

# Rolling mean features
for col in sensor_cols:
    df[f"{col}_roll_mean_{WINDOW}"] = df[col].rolling(WINDOW).mean()

# Derived physical features
df["temp_delta"] = df["compressor_temperature"] - df["intake_air_temperature"]
df["vibration_per_current"] = df["bearing_vibration"] / df["output_current"]


In [22]:
df["compressor_temp_delta"] = (
    df["compressor_temperature"] - df["intake_air_temperature"]
)
df["temp_per_current"] = (
    df["compressor_temperature"] / df["output_current"]
)

In [23]:
df = df.sort_values("timestamp").reset_index(drop=True)
df = df.tail(500).reset_index(drop=True)
df.shape

(500, 36)

In [24]:
df = df.dropna().reset_index(drop=True)

df.head()

Unnamed: 0,timestamp,asset_type,output_current,pump_voltage,bearing_vibration,exhaust_chemical_percentage,compressor_temperature,intake_air_temperature,bearing_risk,compressor_risk,...,output_current_roll_mean_24,pump_voltage_roll_mean_24,bearing_vibration_roll_mean_24,exhaust_chemical_percentage_roll_mean_24,compressor_temperature_roll_mean_24,intake_air_temperature_roll_mean_24,temp_delta,vibration_per_current,compressor_temp_delta,temp_per_current
0,2025-01-05 04:00:00,gas_turbine,105.846293,415.818216,0.033319,3.918173,551.41226,21.465537,0.333191,0.031429,...,118.296823,421.152046,0.039778,3.48214,541.73009,24.859453,529.946723,0.000315,529.946723,5.209557
1,2025-01-05 05:00:00,gas_turbine,115.793547,428.392074,0.04992,3.299708,559.363693,29.277778,0.499204,0.63641,...,118.24614,420.917626,0.040027,3.479025,541.32586,24.922477,530.085915,0.000431,530.085915,4.830698
2,2025-01-05 06:00:00,gas_turbine,116.572855,414.36525,0.03825,4.203875,533.373832,33.247404,0.382504,0.314356,...,118.065109,420.370952,0.039875,3.550439,542.165275,25.217364,500.126428,0.000328,500.126428,4.575455
3,2025-01-05 07:00:00,gas_turbine,111.977227,408.73231,0.032443,3.721785,527.755265,30.353053,0.324425,0.508571,...,118.55898,419.477613,0.039688,3.579827,541.077645,25.492185,497.402213,0.00029,497.402213,4.713059
4,2025-01-05 08:00:00,gas_turbine,118.387143,407.546967,0.045365,3.435619,518.296989,21.351988,0.453651,0.907566,...,118.583308,418.718179,0.039856,3.553023,540.678237,25.090073,496.945002,0.000383,496.945002,4.377984


In [25]:
target_cols = [
    "pump_risk",
    "bearing_risk",
    "compressor_risk",
    "exhaust_path_risk",
    "cooling_or_lubrication_risk",
    "shutdown_risk",
]

feature_cols = [
    c for c in df.columns
    if c not in target_cols + ["timestamp", "asset_type"]
]

X = df[feature_cols]
y = df[target_cols]


In [26]:
split_idx = int(len(df) * 0.8)

X_train, X_val = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_val = y.iloc[:split_idx], y.iloc[split_idx:]


In [27]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

model = MultiOutputRegressor(
    RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        n_jobs=-1
    )
)

model.fit(X_train, y_train)


0,1,2
,estimator,RandomForestR...ndom_state=42)
,n_jobs,

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [28]:
from sklearn.metrics import mean_absolute_error

y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred, multioutput="raw_values")

for name, score in zip(target_cols, mae):
    print(f"{name}: MAE = {score:.4f}")


pump_risk: MAE = 0.0023
bearing_risk: MAE = 0.0011
compressor_risk: MAE = 0.2630
exhaust_path_risk: MAE = 0.0015
cooling_or_lubrication_risk: MAE = 0.0006
shutdown_risk: MAE = 0.0081


In [29]:
import joblib

joblib.dump(
    {
        "model": model,
        "features": feature_cols,
        "targets": target_cols,
    },
    "turbine_risk_model.joblib"
)


['turbine_risk_model.joblib']