In [1]:
import joblib
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
import numpy as np
import time
import pandas as pd


In [2]:
df=pd.read_csv("train_preprocessed.csv")
df


Unnamed: 0,Id,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions,Recovery Index
0,9255,0.006455,-1.183844,-0.989307,0.269888,0.134041,36
1,1562,-1.149747,-1.241506,1.010808,0.269888,0.483562,25
2,1671,-1.149747,0.661318,-0.989307,0.269888,-0.914520,59
3,6088,-1.149747,-1.356828,-0.989307,-0.318906,-1.264041,22
4,6670,1.162658,-1.299167,-0.989307,1.447477,-1.613561,40
...,...,...,...,...,...,...,...
7995,5735,1.162658,-1.126183,1.010808,-0.318906,0.483562,48
7996,5192,-0.378946,-0.088279,-0.989307,1.447477,-0.565000,51
7997,5391,1.548059,-1.241506,-0.989307,0.269888,0.483562,44
7998,861,-1.535148,-1.299167,-0.989307,1.447477,-1.613561,20


In [None]:
inputs=df.drop(["Recovery Index", "Id"], axis='columns')
target=df['Recovery Index']
X=inputs.values
y=target.values
print(f"Samples: {X.shape[0]}, Features: {X.shape[1]}")

print(f"Running RandomizedSearchCV on {X.shape[0]} samples, {X.shape[1]} features")


param_dist = {
    "n_estimators": [30, 50, 100, 200],
    "max_samples": [0.5, 0.6, 0.8, 1.0],
    "max_features": [0.5, 0.6, 0.8, 1.0],
    "estimator__max_depth": [None, 5, 7, 9],
    "bootstrap": [True],         
    "estimator__ccp_alpha": [0.0, 1e-5, 1e-4]
}


search_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

base_bag = BaggingRegressor(
    estimator=DecisionTreeRegressor(random_state=42),
    random_state=42,
    n_jobs=-1
)

rnd = RandomizedSearchCV(
    base_bag,
    param_distributions=param_dist,
    n_iter=40,                       
    scoring="neg_mean_squared_error",
    cv=search_cv,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

start = time.time()
rnd.fit(X, y)
print(f"RandomizedSearchCV done in {time.time() - start:.1f}s")
best_params = rnd.best_params_
best_cv_rmse = np.sqrt(-rnd.best_score_)
print("Best params:", best_params)
print(f"Best CV RMSE (search): {best_cv_rmse:.4f}")


oof_cv = KFold(n_splits=10, shuffle=True, random_state=42)
best_model = rnd.best_estimator_   
oof_preds = cross_val_predict(best_model, X, y, cv=oof_cv, n_jobs=-1)
mse_oof = mean_squared_error(y, oof_preds)
rmse_oof = np.sqrt(mse_oof)
mae_oof = mean_absolute_error(y, oof_preds)
print(f"OOF metrics -> RMSE: {rmse_oof:.4f}, MSE: {mse_oof:.4f}, MAE: {mae_oof:.4f}")


depth = best_params.get("estimator__max_depth", None)
n_estimators = int(best_params["n_estimators"])
max_samples = best_params["max_samples"]
max_features = best_params["max_features"]
bootstrap = best_params.get("bootstrap", True)

final_base = DecisionTreeRegressor(max_depth=depth, random_state=42,
                                  ccp_alpha=best_params.get("estimator__ccp_alpha", 0.0))

final_bag = BaggingRegressor(
    estimator=final_base,
    n_estimators=n_estimators,
    max_samples=max_samples,
    max_features=max_features,
    bootstrap=bootstrap,
    oob_score=True,     
    random_state=42,
    n_jobs=-1
)

final_bag.fit(X, y)
print("Final bagging model trained on full dataset")


if hasattr(final_bag, "oob_prediction_") and final_bag.oob_prediction_ is not None:
    oob_pred = final_bag.oob_prediction_
    oob_mse = mean_squared_error(y, oob_pred)
    oob_rmse = np.sqrt(oob_mse)
    print(f"OOB RMSE: {oob_rmse:.4f}, OOB R^2: {final_bag.oob_score_:.4f}")


preds = final_bag.predict(X)
results_df = pd.DataFrame({
    "Id": df["Id"] if "Id" in df.columns else np.arange(len(y)),
    "Actual_RI": y,
    "Predicted_RI": preds,
    "Abs_Error": np.abs(preds - y)
})
results_df.to_csv("bagging_tuned_predictions_vs_actual.csv", index=False)
print("Saved bagging_tuned_predictions_vs_actual.csv (first 5 rows):")
print(results_df.head(5))

mse_full = mean_squared_error(results_df["Actual_RI"], results_df["Predicted_RI"])
rmse_full = np.sqrt(mse_full)
mae_full = mean_absolute_error(results_df["Actual_RI"], results_df["Predicted_RI"])
print(f"Full-data metrics -> RMSE: {rmse_full:.4f}, MSE: {mse_full:.4f}, MAE: {mae_full:.4f}")

joblib.dump({"model": final_bag}, "bagging_method_tuned.joblib", compress=3)
print("Saved tuned bagging model to bagging_method_tuned.joblib")

Samples: 8000, Features: 5
Running RandomizedSearchCV on 8000 samples, 5 features
Fitting 30 folds for each of 40 candidates, totalling 1200 fits
RandomizedSearchCV done in 38.3s
Best params: {'n_estimators': 50, 'max_samples': 0.6, 'max_features': 1.0, 'estimator__max_depth': 9, 'estimator__ccp_alpha': 1e-05, 'bootstrap': True}
Best CV RMSE (search): 2.2198
OOF metrics -> RMSE: 2.2154, MSE: 4.9078, MAE: 1.7573
Final bagging model trained on full dataset
OOB RMSE: 2.2237, OOB R^2: 0.9866
Saved bagging_tuned_predictions_vs_actual.csv (first 5 rows):
     Id  Actual_RI  Predicted_RI  Abs_Error
0  9255         36     34.359159   1.640841
1  1562         25     25.556852   0.556852
2  1671         59     58.906334   0.093666
3  6088         22     21.248536   0.751464
4  6670         40     40.040523   0.040523
Full-data metrics -> RMSE: 1.8553, MSE: 3.4422, MAE: 1.4790
Saved tuned bagging model to bagging_method_tuned.joblib
