In [None]:
import os
import time
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score
from sklearn.base import clone
import joblib

In [4]:
# Optional gradient-boosting backends. Both flags default to False and are
# flipped to True only when the corresponding import completes.
HAS_XGB = HAS_LGBM = False

try:
    from xgboost import XGBRegressor
except Exception:
    # Any import-time failure means the backend is unusable; flag stays False.
    pass
else:
    HAS_XGB = True

try:
    from lightgbm import LGBMRegressor
except Exception:
    # Same best-effort handling as for xgboost; flag stays False.
    pass
else:
    HAS_LGBM = True

# --- Configuration ----------------------------------------------------------
DATA_PATH = "train.csv"   # change if your file is elsewhere
RANDOM_STATE = 42         # seed passed to the split (and reused by the estimators)
TEST_SIZE = 0.20          # fraction of rows held out for validation
MODEL_SAVE_DIR = "./models"

# --- Load the training data and decide which columns are targets -------------
df = pd.read_csv(DATA_PATH)
possible_targets = ['Y1', 'Y2']
if all(t in df.columns for t in possible_targets):
    target_names = possible_targets
else:
    # fallback: treat the last two columns of the file as the targets
    target_names = list(df.columns[-2:])
print("Target columns:", target_names)

# Every non-target column is used as a feature.
feature_cols = [c for c in df.columns if c not in target_names]
print("Feature columns---({}): {}".format(len(feature_cols), feature_cols))

# Coerce to numeric; cells that cannot be parsed become NaN.
X = df[feature_cols].copy().apply(pd.to_numeric, errors='coerce')
y = df[target_names].copy().apply(pd.to_numeric, errors='coerce')

# A NaN target (missing, or produced by the coercion above) cannot be used
# for supervised fitting: scikit-learn regressors reject NaN in y. Drop such
# rows here, and say so, instead of failing later inside .fit().
has_all_targets = y.notna().all(axis=1)
if not has_all_targets.all():
    print("Dropping {} row(s) with missing or non-numeric targets".format(
        int((~has_all_targets).sum())))
    X = X.loc[has_all_targets]
    y = y.loc[has_all_targets]

# Hold out TEST_SIZE of the rows for validation; seeded for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
# Preprocessing template: fill missing values with the column median, then
# scale with RobustScaler.
preprocessor = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])

# Base learners of the stack. The two scikit-learn models are always present.
base_estimators = [
    ("rf", RandomForestRegressor(n_estimators=150, random_state=RANDOM_STATE, n_jobs=-1)),
    ("hgb", HistGradientBoostingRegressor(max_iter=200, random_state=RANDOM_STATE))
]

# XGBoost and LightGBM are optional dependencies. Only reference their classes
# when the guarded import succeeded; otherwise the names are undefined and an
# unconditional append would raise NameError.
if HAS_XGB:
    base_estimators.append(("xgb", XGBRegressor(n_estimators=200, random_state=RANDOM_STATE, n_jobs=4, verbosity=0)))
if HAS_LGBM:
    base_estimators.append(("lgbm", LGBMRegressor(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1)))

# Meta-learner that combines the base learners' predictions.
final_estimator = RidgeCV(alphas=[0.1, 1.0, 10.0])


Target columns: ['Y1', 'Y2']
Feature columns---(15): ['time', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N']


In [None]:
# Train one stacking pipeline per target column and record its validation R2.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
models = {}      # target name -> fitted pipeline
r2_scores = {}   # target name -> R2 on the validation split
start_time = time.time()
for tgt in target_names:
    print(f"For {tgt}") 
    ytr = y_train[tgt]
    yva = y_val[tgt]
    stack = StackingRegressor(estimators=base_estimators, final_estimator=final_estimator, n_jobs=-1, passthrough=False)
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place, so every pipeline gets its own
        # unfitted copy of the preprocessing template. Reusing the same
        # instance would make all stored pipelines share one fitted
        # transformer, and refitting any of them would alter the others.
        ("preproc", clone(preprocessor)),
        ("stack", stack)
    ])
    t0 = time.time()
    pipeline.fit(X_train, ytr)
    elapsed = time.time() - t0
    preds_val = pipeline.predict(X_val)
    r2 = r2_score(yva, preds_val)
    models[tgt] = pipeline
    r2_scores[tgt] = r2
    print(f"  -> Trained in {elapsed:.1f}s, Validation R2: {r2:.4f}")

total_elapsed = time.time() - start_time
print(f"\nTotal training time taken -: {total_elapsed:.1f}s")

# Baseline for comparison: one HistGradientBoostingRegressor per target,
# wrapped in MultiOutputRegressor and fitted on both targets at once.
print("\nFitting MultiOutput baseline (HistGradientBoosting)...")
multi_pipeline = Pipeline([
    # Use a private copy of the preprocessing template: Pipeline.fit fits its
    # steps in place, so sharing the instance with other pipelines would make
    # them all reference the same fitted transformer.
    ("preproc", clone(preprocessor)),
    ("multi", MultiOutputRegressor(HistGradientBoostingRegressor(max_iter=200, random_state=RANDOM_STATE)))
])
t0 = time.time()
multi_pipeline.fit(X_train, y_train)
multi_elapsed = time.time() - t0
multi_preds = multi_pipeline.predict(X_val)
# Prediction columns follow the column order of y_train, i.e. target_names.
multi_r2_Y1 = r2_score(y_val[target_names[0]], multi_preds[:, 0])
multi_r2_Y2 = r2_score(y_val[target_names[1]], multi_preds[:, 1])
print(f"  -> MultiOutput trained in {multi_elapsed:.1f}s, R2s: {target_names[0]}={multi_r2_Y1:.4f}, {target_names[1]}={multi_r2_Y2:.4f}")


For Y1


In [None]:

# Write each per-target stacking pipeline to MODEL_SAVE_DIR.
for target, fitted_pipeline in models.items():
    out_file = os.path.join(MODEL_SAVE_DIR, f"stack_{target}.joblib")
    joblib.dump(fitted_pipeline, out_file)
    print("Saved", out_file)

# Write the multi-output baseline next to them.
multi_path = os.path.join(MODEL_SAVE_DIR, "multi_histgb.joblib")
joblib.dump(multi_pipeline, multi_path)
print("Saved", multi_path)

# Validation R2 summary: stacking models first, then the baseline.
print("\nFinal R2 scores (stacking per target):")
for target, score in r2_scores.items():
    print(f"  {target}: {score:.4f}")
print("\nMultiOutput baseline R2 scores:")
for target, score in ((target_names[0], multi_r2_Y1), (target_names[1], multi_r2_Y2)):
    print(f"  {target}: {score:.4f}")

# Side-by-side view of true vs. stacked predictions for the first 10
# validation rows, one (true, predicted) column pair per target.
comparison_columns = {}
for target in (target_names[0], target_names[1]):
    comparison_columns[f"y_true_{target}"] = y_val[target].iloc[:10].values
    comparison_columns[f"y_pred_stack_{target}"] = models[target].predict(X_val)[:10]
val_compare = pd.DataFrame(comparison_columns)
print("\nSample validation comparison for the 10 first rows:")
print(val_compare.to_string(index=False))

print("\nModels saved to:", os.path.abspath(MODEL_SAVE_DIR))


In [None]:
# Score test.csv with the saved per-target stacking models and write the
# submission file.
test_df = pd.read_csv("test.csv")
print("Loaded test.csv with shape:", test_df.shape)

# Fail fast with a clear message if the test file lacks a training feature.
# Silently skipping it would hand the models a frame with the wrong columns.
missing_features = [c for c in feature_cols if c not in test_df.columns]
if missing_features:
    raise ValueError(
        "test.csv is missing feature columns used for training: {}".format(missing_features)
    )

# Apply the same numeric coercion the training features received, so
# unparseable cells become NaN and are handled by the pipeline's imputer.
X_test = test_df[feature_cols].copy().apply(pd.to_numeric, errors='coerce')

preds = {}
for tgt in target_names:
    # NOTE: joblib.load unpickles arbitrary objects; only load model files
    # written by this notebook, never files from an untrusted source.
    model = joblib.load(os.path.join(MODEL_SAVE_DIR, f"stack_{tgt}.joblib"))
    preds[tgt] = model.predict(X_test)

if "id" in test_df.columns:
    ids = test_df["id"].values  # preserve exact IDs
else:
    # No id column in the file: fall back to 1-based row numbers.
    ids = np.arange(1, len(test_df) + 1)  
submission = pd.DataFrame({
    "id": ids,
    "Y1": preds[target_names[0]],
    "Y2": preds[target_names[1]],
})
submission.to_csv("predictions.csv", index=False)
print("Saved predictions.csv")

Loaded test.csv with shape: (15996, 16)
Saved predictions.csv
   id        Y1        Y2
0   1  0.492582 -0.151149
1   2 -0.269365 -0.341803
2   3 -0.331963 -0.080804
3   4 -0.487142  0.212209
4   5 -0.967007  0.122879
