In [17]:
import time

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeRegressor

In [16]:
# Read the preprocessed training table from the working directory and show it
# as the cell's rich output.
df = pd.read_csv(filepath_or_buffer="train_preprocessed.csv")
df

Unnamed: 0,Id,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions,Recovery Index
0,9255,0.006455,-1.183844,-0.989307,0.269888,0.134041,36
1,1562,-1.149747,-1.241506,1.010808,0.269888,0.483562,25
2,1671,-1.149747,0.661318,-0.989307,0.269888,-0.914520,59
3,6088,-1.149747,-1.356828,-0.989307,-0.318906,-1.264041,22
4,6670,1.162658,-1.299167,-0.989307,1.447477,-1.613561,40
...,...,...,...,...,...,...,...
7995,5735,1.162658,-1.126183,1.010808,-0.318906,0.483562,48
7996,5192,-0.378946,-0.088279,-0.989307,1.447477,-0.565000,51
7997,5391,1.548059,-1.241506,-0.989307,0.269888,0.483562,44
7998,861,-1.535148,-1.299167,-0.989307,1.447477,-1.613561,20


In [18]:
# Separate the regression target from the predictors.
# "Id" is dropped together with the target, so only the remaining columns
# are passed to the model as features.
target = df['Recovery Index']
inputs = df.drop(columns=["Recovery Index", "Id"])

# Plain NumPy views of the two pieces; these are what the estimators receive.
y = target.values
X = inputs.values
print(f"Samples: {X.shape[0]}, Features: {X.shape[1]}")

Samples: 8000, Features: 5


In [19]:
# Compare several K-fold settings for a fixed-depth decision tree.
# For every k the loop stores, in cv_summary[k], the mean and standard deviation
# of the per-fold RMSE plus the elapsed time of the cross-validation call.
k_values = [5, 10, 20]
cv_summary = {}
for k in k_values:
    # Shuffled folds with a fixed seed so the split is repeatable across runs.
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    model = DecisionTreeRegressor(max_depth=7, random_state=42)

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() is wall-clock time and can jump if the
    # system clock is adjusted. Only the CV call itself is timed.
    # NOTE(review): with n_jobs=-1 the first iteration probably also pays for
    # worker start-up, so time_sec is not a clean per-k comparison -- confirm.
    start = time.perf_counter()
    scores = cross_val_score(
        model,
        X,
        y,
        scoring='neg_mean_squared_error',
        cv=kf,
        n_jobs=-1,
    )
    elapsed = time.perf_counter() - start

    # The scorer returns negated MSE per fold; flip the sign before the root.
    rmses = np.sqrt(-scores)
    cv_summary[k] = {
        'mean_rmse': rmses.mean(),
        'std_rmse': rmses.std(),
        'time_sec': elapsed,
    }
    print(f"k={k}: RMSE mean={rmses.mean():.4f}, std={rmses.std():.4f}, time={elapsed:.1f}s")

k=5: RMSE mean=2.7141, std=0.0571, time=1.6s
k=10: RMSE mean=2.7093, std=0.0676, time=1.0s
k=20: RMSE mean=2.7071, std=0.0915, time=0.8s


In [20]:
def _mean_rmse_for(fold_count):
    """Return the mean cross-validated RMSE recorded for ``fold_count`` folds."""
    return cv_summary[fold_count]['mean_rmse']


# Choose the fold count with the lowest mean RMSE; on ties the first key in
# cv_summary's insertion order wins.
# NOTE(review): in the recorded run the mean RMSEs differ by less than the
# fold-to-fold standard deviation, so this choice may be driven by noise.
best_k = min(cv_summary, key=_mean_rmse_for)
print(f"Selected k={best_k}")


Selected k=20


In [21]:
# Tune the tree depth with an exhaustive grid search, scored by negated MSE
# and cross-validated with the fold count selected earlier (best_k).
param_grid = {'max_depth': [3, 5, 7, 9, 12, None]}  # None = no depth limit
kf = KFold(n_splits=best_k, shuffle=True, random_state=42)
gscv = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=-1,
)

# perf_counter() is the monotonic clock meant for interval timing; time.time()
# is wall-clock time and can move if the system clock is adjusted.
start = time.perf_counter()
gscv.fit(X, y)
print(f"GridSearchCV finished in {time.perf_counter() - start:.1f}s")


GridSearchCV finished in 2.8s


In [22]:
# Read the winning hyper-parameters and their score off the fitted search.
best_params = gscv.best_params_

# The search was scored with negated MSE, so negate again before taking the
# root. This is the root of the mean MSE across folds, which is not the same
# quantity as the mean of per-fold RMSEs reported in the k comparison.
best_cv_mse = -gscv.best_score_
best_cv_rmse = np.sqrt(best_cv_mse)
print(f"Best params: {best_params}, CV RMSE: {best_cv_rmse:.4f}")


Best params: {'max_depth': 9}, CV RMSE: 2.4992


In [23]:
# Build a tree with the tuned hyper-parameters and train it on every available
# row; fit() returns the estimator itself, so construction and training chain.
final_model = DecisionTreeRegressor(random_state=42, **best_params).fit(X, y)
print("Final model trained on full dataset")


Final model trained on full dataset


In [24]:
# Predict on X, the same rows final_model was fitted on. These are in-sample
# predictions, so they are expected to look better than the cross-validated RMSE.
preds = final_model.predict(X)


In [25]:
# Line up each row's Id with its true target and the model's prediction.
# Starting from the one-column frame keeps df's index; assign() returns a new
# frame and leaves df untouched.
results_df = df[['Id']].assign(Actual_RI=y, Predicted_RI=preds)

print("First 10 rows: actual vs predicted")
display(results_df.head(10))


First 10 rows: actual vs predicted


Unnamed: 0,Id,Actual_RI,Predicted_RI
0,9255,36,34.72973
1,1562,25,25.758621
2,1671,59,58.5
3,6088,22,21.28
4,6670,40,37.5
5,5934,33,35.0
6,8830,30,28.833333
7,7946,89,92.391304
8,3509,46,43.518519
9,2003,83,82.266667


In [26]:
# Persist the fitted tree to disk with joblib (zlib compression level 3).
# joblib is imported in the notebook's import cell rather than here, so every
# dependency is visible at the top. The output path is named once so the dump
# call and the confirmation message cannot drift apart.
model_path = "decision_tree_final.joblib"
joblib.dump(final_model, model_path, compress=3)
print(f"Saved model to {model_path}")

Saved model to decision_tree_final.joblib
