In [4]:
import time
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
df=pd.read_csv("train_preprocessed.csv")
df

Unnamed: 0,Id,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions,Recovery Index
0,9255,0.006455,-1.183844,-0.989307,0.269888,0.134041,36
1,1562,-1.149747,-1.241506,1.010808,0.269888,0.483562,25
2,1671,-1.149747,0.661318,-0.989307,0.269888,-0.914520,59
3,6088,-1.149747,-1.356828,-0.989307,-0.318906,-1.264041,22
4,6670,1.162658,-1.299167,-0.989307,1.447477,-1.613561,40
...,...,...,...,...,...,...,...
7995,5735,1.162658,-1.126183,1.010808,-0.318906,0.483562,48
7996,5192,-0.378946,-0.088279,-0.989307,1.447477,-0.565000,51
7997,5391,1.548059,-1.241506,-0.989307,0.269888,0.483562,44
7998,861,-1.535148,-1.299167,-0.989307,1.447477,-1.613561,20


In [3]:
inputs=df.drop(["Recovery Index", "Id"], axis='columns')
target=df['Recovery Index']
X=inputs.values
y=target.values
print(f"Samples: {X.shape[0]}, Features: {X.shape[1]}")

Samples: 8000, Features: 5


In [11]:
k_values=[5,10,18,20,25,30]
cv_summary={}
model=BaggingRegressor(estimator=DecisionTreeRegressor(max_depth=9,random_state=42),
                       n_estimators=50,random_state=42,n_jobs=-1)

for k in k_values:
    kf=KFold(n_splits=k,shuffle=True,random_state=42)
    start=time.time()
    scores=cross_val_score(model,X,y,scoring='neg_mean_squared_error',cv=kf,n_jobs=-1)
    elapsed=time.time()-start
    rmses=np.sqrt(-scores)
    cv_summary[k]={'mean_rmse':rmses.mean(), 'std_rmse':rmses.std(),'time_sec':elapsed}
    print(f"k={k}: RMSE mean={rmses.mean():.4f}, std={rmses.std():.4f}, time={elapsed:.1f}s")



k=5: RMSE mean=2.2424, std=0.0486, time=2.7s
k=10: RMSE mean=2.2358, std=0.0604, time=1.7s
k=18: RMSE mean=2.2317, std=0.0860, time=1.6s
k=20: RMSE mean=2.2275, std=0.0813, time=0.8s
k=25: RMSE mean=2.2299, std=0.0872, time=1.0s
k=30: RMSE mean=2.2317, std=0.1096, time=1.2s


In [12]:
best_k=min(cv_summary,key=lambda kk: cv_summary[kk]['mean_rmse'])
print(f"Selected k = {best_k}")

Selected k = 20


In [15]:
param_grid={
    'n_estimators':[30,50,100],
    'max_samples':[0.6,0.8,1.0],
    'max_features':[0.6,0.8,1.0],
    'estimator__max_depth':[5,7,9,None]
}
kf=KFold(n_splits=best_k,shuffle=True,random_state=42)
bag=BaggingRegressor(estimator=DecisionTreeRegressor(random_state=42),
                     random_state=42,n_jobs=-1)
gscv=GridSearchCV(bag,param_grid,scoring='neg_mean_squared_error',cv=kf,n_jobs=-1,verbose=1)
start=time.time()
gscv.fit(X,y)
print(f"GridSearchCV finished in {time.time() - start:.1f}s")



Fitting 20 folds for each of 108 candidates, totalling 2160 fits
GridSearchCV finished in 113.0s


In [16]:
best_params=gscv.best_params_
best_rmse=np.sqrt(-gscv.best_score_)
print(f"Best params: {best_params}, CV RMSE: {best_rmse:.4f}")


Best params: {'estimator__max_depth': 9, 'max_features': 1.0, 'max_samples': 0.6, 'n_estimators': 100}, CV RMSE: 2.2027


In [19]:
depth = best_params.get('estimator__max_depth', None)
n_estimators = best_params['n_estimators']
max_samples = best_params['max_samples']
max_features = best_params['max_features']
base_dt = DecisionTreeRegressor(max_depth=depth, random_state=42)
final_bag = BaggingRegressor(
    estimator=base_dt,
    n_estimators=n_estimators,
    max_samples=max_samples,
    max_features=max_features,
    random_state=42,
    n_jobs=-1
)



In [20]:

final_bag.fit(X,y)
print("Final bagging model trained on full dataset")

Final bagging model trained on full dataset


In [22]:
preds=final_bag.predict(X)
results_df=pd.DataFrame({
    'Id':df['Id'],
    'Actual RI':y,
    'Predicted RI':preds,
    'Absolute Error':abs(preds-y)
})
print("First 10 rows: actual vs predicted")
results_df.head(10)


First 10 rows: actual vs predicted


Unnamed: 0,Id,Actual RI,Predicted RI,Absolute Error
0,9255,36,34.396849,1.603151
1,1562,25,25.477669,0.477669
2,1671,59,58.872819,0.127181
3,6088,22,21.195034,0.804966
4,6670,40,39.864783,0.135217
5,5934,33,34.858745,1.858745
6,8830,30,29.861095,0.138905
7,7946,89,89.564933,0.564933
8,3509,46,43.724817,2.275183
9,2003,83,79.622876,3.377124


In [27]:
mse_full = mean_squared_error(results_df['Actual RI'], results_df['Predicted RI'])
rmse_full = np.sqrt(mse_full)
print(f"RMSE for the data ->{rmse_full}")
print(f"MSE for the data ->{mse_full}")


RMSE for the data ->1.849399143170418
MSE for the data ->3.4202771907594762


In [29]:
joblib.dump({'model':final_bag},"bagging_method.joblib",compress=3)
print("Saved model to bagging_method.joblib")

Saved model to bagging_method.joblib
