In [1]:
import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from sklearn.model_selection import KFold

from sklearn.linear_model import Ridge, ElasticNet
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [2]:
# Load data
# ======================
df = pd.read_csv(
    "continuous_factory_process.csv",
    parse_dates=["time_stamp"]
)

df = df.sort_values("time_stamp").reset_index(drop=True)
df

Unnamed: 0,time_stamp,AmbientConditions.AmbientHumidity.U.Actual,AmbientConditions.AmbientTemperature.U.Actual,Machine1.RawMaterial.Property1,Machine1.RawMaterial.Property2,Machine1.RawMaterial.Property3,Machine1.RawMaterial.Property4,Machine1.RawMaterialFeederParameter.U.Actual,Machine1.Zone1Temperature.C.Actual,Machine1.Zone2Temperature.C.Actual,...,Stage2.Output.Measurement10.U.Actual,Stage2.Output.Measurement10.U.Setpoint,Stage2.Output.Measurement11.U.Actual,Stage2.Output.Measurement11.U.Setpoint,Stage2.Output.Measurement12.U.Actual,Stage2.Output.Measurement12.U.Setpoint,Stage2.Output.Measurement13.U.Actual,Stage2.Output.Measurement13.U.Setpoint,Stage2.Output.Measurement14.U.Actual,Stage2.Output.Measurement14.U.Setpoint
0,2019-03-06 10:52:33,17.24,23.53,11.54,200,963.00,247,1241.26,72.0,72.3,...,0.00,7.93,0.00,5.65,0.00,1.85,0.00,2.89,0.00,11.71
1,2019-03-06 10:52:34,17.24,23.53,11.54,200,963.00,247,1246.09,72.0,72.3,...,0.00,7.93,0.00,5.65,0.00,1.85,0.00,2.89,0.00,11.71
2,2019-03-06 10:52:35,17.24,23.53,11.54,200,963.00,247,1246.29,72.0,72.3,...,0.00,7.93,0.00,5.65,0.00,1.85,0.00,2.89,0.00,11.71
3,2019-03-06 10:52:36,17.24,23.53,11.54,200,963.00,247,1247.59,72.0,72.3,...,0.00,7.93,0.00,5.65,0.00,1.85,0.00,2.89,0.00,11.71
4,2019-03-06 10:52:37,17.24,23.53,11.54,200,963.00,247,1252.83,72.1,72.4,...,0.00,7.93,0.00,5.65,0.00,1.85,0.00,2.89,0.00,11.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14083,2019-03-06 14:47:16,13.84,24.43,12.22,201,1048.06,252,1263.76,72.0,71.8,...,7.87,7.93,5.53,5.65,2.07,1.85,3.80,2.89,7.56,11.71
14084,2019-03-06 14:47:17,13.84,24.43,12.22,201,1048.06,252,1255.42,72.0,71.8,...,7.85,7.93,5.48,5.65,2.01,1.85,3.79,2.89,7.76,11.71
14085,2019-03-06 14:47:18,13.84,24.43,12.22,201,1048.06,252,1257.52,72.0,71.8,...,7.93,7.93,5.54,5.65,2.06,1.85,3.83,2.89,7.66,11.71
14086,2019-03-06 14:47:19,13.84,24.43,12.22,201,1048.06,252,1261.99,72.0,71.9,...,7.85,7.93,5.51,5.65,2.05,1.85,3.83,2.89,7.87,11.71


In [3]:
# Column groups
# ======================
stage1_targets = [
    c for c in df.columns
    if "Stage1.Output.Measurement" in c and "Actual" in c
]

stage2_targets = [
    c for c in df.columns
    if "Stage2.Output.Measurement" in c and "Actual" in c
]

non_feature_cols = (
    ["time_stamp"]
    + stage1_targets
    + stage2_targets
    + [c for c in df.columns if "Setpoint" in c]
)

X = df.drop(columns=non_feature_cols)
Y_stage1 = df[stage1_targets]
Y_stage2 = df[stage2_targets]
X

Unnamed: 0,AmbientConditions.AmbientHumidity.U.Actual,AmbientConditions.AmbientTemperature.U.Actual,Machine1.RawMaterial.Property1,Machine1.RawMaterial.Property2,Machine1.RawMaterial.Property3,Machine1.RawMaterial.Property4,Machine1.RawMaterialFeederParameter.U.Actual,Machine1.Zone1Temperature.C.Actual,Machine1.Zone2Temperature.C.Actual,Machine1.MotorAmperage.U.Actual,...,Machine4.Temperature4.C.Actual,Machine4.Temperature5.C.Actual,Machine4.ExitTemperature.U.Actual,Machine5.Temperature1.C.Actual,Machine5.Temperature2.C.Actual,Machine5.Temperature3.C.Actual,Machine5.Temperature4.C.Actual,Machine5.Temperature5.C.Actual,Machine5.Temperature6.C.Actual,Machine5.ExitTemperature.U.Actual
0,17.24,23.53,11.54,200,963.00,247,1241.26,72.0,72.3,48.03,...,21.0,260.0,35.0,309.8,289.9,263.9,238.6,245.0,66.1,50.0
1,17.24,23.53,11.54,200,963.00,247,1246.09,72.0,72.3,48.03,...,23.0,263.0,35.0,309.8,289.9,263.9,238.6,245.0,66.1,49.8
2,17.24,23.53,11.54,200,963.00,247,1246.29,72.0,72.3,48.16,...,22.0,266.0,35.0,309.8,289.9,263.9,238.6,245.0,66.1,49.1
3,17.24,23.53,11.54,200,963.00,247,1247.59,72.0,72.3,48.57,...,23.0,269.0,35.0,309.8,289.9,263.9,238.6,245.0,66.1,49.6
4,17.24,23.53,11.54,200,963.00,247,1252.83,72.1,72.4,48.57,...,24.0,272.0,35.0,309.8,289.9,263.9,238.6,245.0,66.1,49.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14083,13.84,24.43,12.22,201,1048.06,252,1263.76,72.0,71.8,73.46,...,18.0,311.0,195.0,310.0,289.9,270.0,244.7,245.0,63.7,155.4
14084,13.84,24.43,12.22,201,1048.06,252,1255.42,72.0,71.8,73.32,...,19.0,311.0,195.0,310.0,289.9,270.0,244.7,245.0,63.7,155.1
14085,13.84,24.43,12.22,201,1048.06,252,1257.52,72.0,71.8,73.59,...,17.0,311.0,195.0,310.0,289.9,270.0,244.7,245.0,63.7,155.1
14086,13.84,24.43,12.22,201,1048.06,252,1261.99,72.0,71.9,74.40,...,18.0,311.0,195.0,310.0,289.9,270.0,244.7,245.0,63.7,155.4


In [4]:
Y_stage1

Unnamed: 0,Stage1.Output.Measurement0.U.Actual,Stage1.Output.Measurement1.U.Actual,Stage1.Output.Measurement2.U.Actual,Stage1.Output.Measurement3.U.Actual,Stage1.Output.Measurement4.U.Actual,Stage1.Output.Measurement5.U.Actual,Stage1.Output.Measurement6.U.Actual,Stage1.Output.Measurement7.U.Actual,Stage1.Output.Measurement8.U.Actual,Stage1.Output.Measurement9.U.Actual,Stage1.Output.Measurement10.U.Actual,Stage1.Output.Measurement11.U.Actual,Stage1.Output.Measurement12.U.Actual,Stage1.Output.Measurement13.U.Actual,Stage1.Output.Measurement14.U.Actual
0,12.72,0.0,12.16,21.97,0.00,0.0,3.82,2.94,20.82,17.30,8.06,5.54,1.66,2.69,14.51
1,12.34,0.0,0.00,17.78,0.00,0.0,3.97,0.00,19.65,17.31,7.55,5.13,1.04,2.70,0.00
2,12.34,0.0,0.00,17.78,0.00,0.0,3.97,0.00,19.65,17.31,7.55,5.13,1.04,2.70,0.00
3,12.34,0.0,0.00,17.78,0.00,0.0,3.97,0.00,19.65,17.31,7.55,5.13,1.04,2.70,0.00
4,12.29,0.0,0.00,17.83,31.44,0.0,3.87,0.00,0.00,17.30,7.54,5.15,1.15,2.72,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14083,12.61,0.0,9.14,21.50,31.55,0.0,2.10,3.03,20.88,18.95,7.60,0.00,1.42,3.41,0.00
14084,12.61,0.0,9.14,21.50,31.55,0.0,2.10,3.03,20.88,18.95,7.60,0.00,1.42,3.41,0.00
14085,12.61,0.0,9.14,21.50,31.55,0.0,2.10,3.03,20.88,18.95,7.60,0.00,1.42,3.41,0.00
14086,12.58,0.0,8.99,21.84,31.55,0.0,2.05,2.91,20.91,18.84,7.66,0.00,1.40,3.30,0.00


In [5]:
Y_stage2

Unnamed: 0,Stage2.Output.Measurement0.U.Actual,Stage2.Output.Measurement1.U.Actual,Stage2.Output.Measurement2.U.Actual,Stage2.Output.Measurement3.U.Actual,Stage2.Output.Measurement4.U.Actual,Stage2.Output.Measurement5.U.Actual,Stage2.Output.Measurement6.U.Actual,Stage2.Output.Measurement7.U.Actual,Stage2.Output.Measurement8.U.Actual,Stage2.Output.Measurement9.U.Actual,Stage2.Output.Measurement10.U.Actual,Stage2.Output.Measurement11.U.Actual,Stage2.Output.Measurement12.U.Actual,Stage2.Output.Measurement13.U.Actual,Stage2.Output.Measurement14.U.Actual
0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00
1,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00
2,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00
3,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00
4,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14083,12.91,6.40,10.95,19.07,0.0,3.09,0.23,2.97,20.08,0.0,7.87,5.53,2.07,3.80,7.56
14084,12.90,6.47,10.89,19.11,0.0,2.94,0.21,2.97,19.57,0.0,7.85,5.48,2.01,3.79,7.76
14085,13.08,6.44,10.86,19.42,0.0,3.03,0.27,2.96,19.18,0.0,7.93,5.54,2.06,3.83,7.66
14086,13.11,6.41,10.79,19.45,0.0,2.96,0.46,2.92,19.30,0.0,7.85,5.51,2.05,3.83,7.87


In [6]:
# Time features (KEEP for Stage 1)
# ======================
time_features = ['hour', 'dayofweek', 'month', 'hour_sin', 'hour_cos']

X['hour'] = df['time_stamp'].dt.hour
X['dayofweek'] = df['time_stamp'].dt.dayofweek
X['month'] = df['time_stamp'].dt.month
X['hour_sin'] = np.sin(2*np.pi*X['hour']/24)
X['hour_cos'] = np.cos(2*np.pi*X['hour']/24)

In [7]:
# Lag / delta features for Stage 1 outputs
# ======================
for lag in [1, 2]:
    for col in stage1_targets:
        X[f"{col}_lag{lag}"] = df[col].shift(lag)
        if lag == 1:
            X[f"{col}_delta1"] = df[col] - df[col].shift(1)

X = X.bfill()

In [8]:
# Time-based split
# ======================
split_idx = int(len(X) * 0.8)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
Y1_train, Y1_test = Y_stage1.iloc[:split_idx], Y_stage1.iloc[split_idx:]
Y2_train, Y2_test = Y_stage2.iloc[:split_idx], Y_stage2.iloc[split_idx:]

In [9]:
# Models (UNCHANGED)
# ======================
models = {
    "Ridge": Pipeline([
        ("scaler", StandardScaler()),
        ("model", Ridge(alpha=15.0))
    ]),

    "ElasticNet": Pipeline([
        ("scaler", StandardScaler()),
        ("model", ElasticNet(alpha=0.05, l1_ratio=0.5, max_iter=5000))
    ]),

    "ElasticNetChain": RegressorChain(
        estimator=Pipeline([
            ("scaler", StandardScaler()),
            ("model", ElasticNet(alpha=0.05, l1_ratio=0.5, max_iter=5000))
        ]),
        order=None
    ),

    "PLS": Pipeline([
        ("scaler", StandardScaler()),
        ("model", PLSRegression(n_components=7, scale=False))
    ]),

    "RandomForest": RandomForestRegressor(
        n_estimators=500,
        min_samples_leaf=7,
        max_features='sqrt',
        n_jobs=-1,
        random_state=42
    ),

    "XGBoostMultiOutput": MultiOutputRegressor(
        XGBRegressor(
            n_estimators=120,
            max_depth=3,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            min_child_weight=5,
            tree_method='hist',
            n_jobs=-1,
            random_state=42,
            verbosity=0
        )
    )
}

In [10]:
# Metric
# ======================
def median_rmse(y_true, y_pred):
    rmse = np.sqrt(
        mean_squared_error(y_true, y_pred, multioutput="raw_values")
    )
    return np.median(rmse)

In [11]:
# Stage 1
# ======================
print("==== STAGE 1 RESULTS ====")
stage1_predictions = {}

for name, model in models.items():
    m = clone(model)
    m.fit(X_train, Y1_train)
    preds = m.predict(X_test)
    stage1_predictions[name] = preds
    print(f"{name:20s} | Median RMSE: {median_rmse(Y1_test, preds):.4f}")

==== STAGE 1 RESULTS ====
Ridge                | Median RMSE: 0.0135
ElasticNet           | Median RMSE: 0.0339
ElasticNetChain      | Median RMSE: 0.0333
PLS                  | Median RMSE: 0.6323
RandomForest         | Median RMSE: 0.4157
XGBoostMultiOutput   | Median RMSE: 0.0455


In [12]:
# Stage 2
# ======================
print("\n==== STAGE 2 RESULTS (CASCADED) ====")
kf = KFold(n_splits=5, shuffle=False)

for name, model in models.items():
    oof_preds = np.zeros(Y1_train.shape)

    for tr, val in kf.split(X_train):
        temp = clone(model)
        temp.fit(X_train.iloc[tr], Y1_train.iloc[tr])
        oof_preds[val] = temp.predict(X_train.iloc[val])

    X2_train = pd.concat(
        [X_train.drop(columns=time_features),
         pd.DataFrame(oof_preds, columns=stage1_targets, index=X_train.index)],
        axis=1
    )

    X2_test = pd.concat(
        [X_test.drop(columns=time_features),
         pd.DataFrame(stage1_predictions[name], columns=stage1_targets, index=X_test.index)],
        axis=1
    )

    X2_train = X2_train.bfill()
    X2_test = X2_test.bfill()

    m = clone(model)
    m.fit(X2_train, Y2_train)
    preds = m.predict(X2_test)

    print(f"{name:20s} | Median RMSE: {median_rmse(Y2_test, preds):.4f}")


==== STAGE 2 RESULTS (CASCADED) ====
Ridge                | Median RMSE: 0.8573
ElasticNet           | Median RMSE: 0.5966
ElasticNetChain      | Median RMSE: 0.5941
PLS                  | Median RMSE: 0.6423
RandomForest         | Median RMSE: 0.5603
XGBoostMultiOutput   | Median RMSE: 0.6423
