In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
import time

In [3]:
# Read the preprocessed training data. Leaving the bare name as the cell's
# last expression makes the notebook render the (truncated) frame.
df = pd.read_csv("train_preprocessed.csv")
df

Unnamed: 0,Id,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions,Recovery Index
0,9255,0.006455,-1.183844,-0.989307,0.269888,0.134041,36
1,1562,-1.149747,-1.241506,1.010808,0.269888,0.483562,25
2,1671,-1.149747,0.661318,-0.989307,0.269888,-0.914520,59
3,6088,-1.149747,-1.356828,-0.989307,-0.318906,-1.264041,22
4,6670,1.162658,-1.299167,-0.989307,1.447477,-1.613561,40
...,...,...,...,...,...,...,...
7995,5735,1.162658,-1.126183,1.010808,-0.318906,0.483562,48
7996,5192,-0.378946,-0.088279,-0.989307,1.447477,-0.565000,51
7997,5391,1.548059,-1.241506,-0.989307,0.269888,0.483562,44
7998,861,-1.535148,-1.299167,-0.989307,1.447477,-1.613561,20


In [4]:
# Split the frame into the regression target and the predictor columns.
# Both the target and the "Id" column are excluded from the predictors.
target = df["Recovery Index"]
inputs = df.drop(columns=["Recovery Index", "Id"])

# Downstream cells work on the underlying arrays rather than the pandas objects.
X, y = inputs.values, target.values
print(f"Samples: {X.shape[0]}, Features: {X.shape[1]}")

Samples: 8000, Features: 5


In [None]:
import joblib
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
import numpy as np


print(f"Running RandomizedSearchCV on {X.shape[0]} samples, {X.shape[1]} features")

# 1) Search space: tree-size limits, per-split feature subsampling and the
#    cost-complexity pruning strength (ccp_alpha).
param_dist = {
    "max_depth": [None, 3, 5, 7, 9, 12],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": [None, "sqrt", 0.8],
    "ccp_alpha": [0.0, 1e-5, 1e-4, 1e-3, 1e-2]
}

# 2) Repeated 10-fold CV (3 repeats -> 30 fits per candidate) so that the
#    ranking of candidates depends less on one particular split.
search_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

rnd = RandomizedSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=40,
    scoring="neg_mean_squared_error",
    cv=search_cv,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the search is running.
start = time.perf_counter()
rnd.fit(X, y)
print(f"RandomizedSearchCV done in {time.perf_counter() - start:.1f}s")
best_params = rnd.best_params_
# The scorer is *negated* MSE, so flip the sign before taking the square root.
best_cv_rmse = np.sqrt(-rnd.best_score_)
print("Best params:", best_params)
print(f"Best CV RMSE (search): {best_cv_rmse:.4f}")

# 3) Out-of-fold (OOF) predictions from a single shuffled 10-fold split.
#    cross_val_predict requires every sample to land in a test fold exactly
#    once, which RepeatedKFold does not provide -> use plain KFold.
#    NOTE: the hyperparameters were selected on these same rows, so the OOF
#    metrics remain optimistically biased. An untouched hold-out set or nested
#    CV is required for an unbiased generalisation estimate.
oof_cv = KFold(n_splits=10, shuffle=True, random_state=42)
best_model = rnd.best_estimator_

# cross_val_predict fits clones of the estimator, so best_model itself is
# left untouched by this call.
oof_preds = cross_val_predict(best_model, X, y, cv=oof_cv, n_jobs=-1)

mse_oof = mean_squared_error(y, oof_preds)
rmse_oof = np.sqrt(mse_oof)
mae_oof = mean_absolute_error(y, oof_preds)
print(f"OOF metrics -> RMSE: {rmse_oof:.4f}, MSE: {mse_oof:.4f}, MAE: {mae_oof:.4f}")

# 4) Final model on the full data. The search runs with the default
#    refit=True, so best_estimator_ is already the winning configuration
#    (same base random_state=42) trained on all of (X, y). Reuse it instead of
#    fitting an identical tree a second time; this also avoids a duplicate
#    keyword error should random_state ever be added to the search space.
final_dt = best_model
joblib.dump({"model": final_dt}, "decision_tree_final_tuned.joblib", compress=3)
print("Saved tuned final model to decision_tree_final_tuned.joblib")

# 5) Optional: CSV of predictions vs actual. These are *in-sample* predictions
#    (the model has seen every row), so Abs_Error reflects training fit only,
#    not generalisation error.
preds_full = final_dt.predict(X)
results_df = pd.DataFrame({
    "Id": df["Id"] if "Id" in df.columns else np.arange(len(y)),
    "Actual_RI": y,
    "Predicted_RI": preds_full,
    "Abs_Error": np.abs(preds_full - y)
})
results_df.to_csv("dt_tuned_predictions_vs_actual.csv", index=False)
print("Saved dt_tuned_predictions_vs_actual.csv (first 5 rows):")
print(results_df.head(5))

Running RandomizedSearchCV on 8000 samples, 5 features
Fitting 30 folds for each of 40 candidates, totalling 1200 fits
RandomizedSearchCV done in 3.7s
Best params: {'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': None, 'max_depth': None, 'ccp_alpha': 0.0001}
Best CV RMSE (search): 2.4341
OOF metrics -> RMSE: 2.4361, MSE: 5.9344, MAE: 1.9350
Saved tuned final model to decision_tree_final_tuned.joblib
Saved dt_tuned_predictions_vs_actual.csv (first 5 rows):
     Id  Actual_RI  Predicted_RI  Abs_Error
0  9255         36     35.125000   0.875000
1  1562         25     26.095238   1.095238
2  1671         59     59.000000   0.000000
3  6088         22     20.400000   1.600000
4  6670         40     39.909091   0.090909
