In [12]:
import joblib
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import time
import pandas as pd


In [13]:
# Load the training table from train_preprocessed.csv; the path is relative,
# so it is resolved against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer="train_preprocessed.csv")


In [14]:
# The regression target is the "Recovery Index" column.
target = df["Recovery Index"]

# Everything except the target and the "Id" column is used as a predictor.
inputs = df.drop(columns=["Recovery Index", "Id"])

# Hand the estimators plain arrays instead of pandas objects.
X, y = inputs.to_numpy(), target.to_numpy()
print(f"Samples: {X.shape[0]}, Features: {X.shape[1]}")


Samples: 8000, Features: 5


In [None]:
print(f"Running RandomizedSearchCV on {X.shape[0]} samples, {X.shape[1]} features")

# 1) Randomized search space for the random forest. "bootstrap" has a single
#    candidate value, so only the other five hyperparameters actually vary.
param_dist = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [None, 10, 20, 30],
    "max_features": ["sqrt", 0.6, 0.8],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True],
}

# 2) Repeated K-fold (10 splits x 3 repeats = 30 fits per candidate) gives a
#    steadier score per candidate than a single 10-fold pass.
search_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

# Keep each forest single-threaded while searching: the search below already
# spreads candidate/fold fits over every core (n_jobs=-1), and nesting a second
# n_jobs=-1 inside each worker can oversubscribe the CPU. random_state is
# unchanged, so the same seed still drives tree construction.
rfr_base = RandomForestRegressor(random_state=42, n_jobs=1)

rnd = RandomizedSearchCV(
    rfr_base,
    param_distributions=param_dist,
    n_iter=40,  # number of sampled hyperparameter combinations
    scoring="neg_mean_squared_error",
    cv=search_cv,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# perf_counter is a monotonic clock, so the elapsed time cannot be skewed by
# wall-clock adjustments during the (multi-minute) search.
start = time.perf_counter()
rnd.fit(X, y)
print(f"RandomizedSearchCV done in {time.perf_counter() - start:.1f}s")

best_params = rnd.best_params_
# The scorer is negated MSE, so flip the sign before taking the root. This is
# the root of the mean fold MSE, not the mean of the per-fold RMSEs.
best_cv_rmse = np.sqrt(-rnd.best_score_)
print("Best params:", best_params)
print(f"Best CV RMSE (search): {best_cv_rmse:.4f}")

# 3) Out-of-fold (OOF) predictions from one shuffled 10-fold split.
#    Each row is predicted by a model fitted without that row. The
#    hyperparameters were tuned on this same X/y, though, so read these
#    numbers as a sanity check on the search score rather than as a strictly
#    unbiased estimate of generalization error.
best_model = rnd.best_estimator_
oof_cv = KFold(n_splits=10, shuffle=True, random_state=42)
oof_preds = cross_val_predict(estimator=best_model, X=X, y=y, cv=oof_cv, n_jobs=-1)

mse_oof = mean_squared_error(y_true=y, y_pred=oof_preds)
mae_oof = mean_absolute_error(y_true=y, y_pred=oof_preds)
rmse_oof = np.sqrt(mse_oof)
print(f"OOF metrics -> RMSE: {rmse_oof:.4f}, MSE: {mse_oof:.4f}, MAE: {mae_oof:.4f}")

# 4) Fit the final RandomForest on the full data with the best hyperparameters.
#    Out-of-bag scoring requires bootstrap resampling (scikit-learn raises a
#    ValueError otherwise), so enable it only when the selected configuration
#    bootstraps. A missing "bootstrap" key means the estimator default, True.
final_params = best_params.copy()
use_oob = bool(final_params.get("bootstrap", True))
final_params.update({"random_state": 42, "n_jobs": -1, "oob_score": use_oob})

final_rf = RandomForestRegressor(**final_params)
final_rf.fit(X, y)
print("Final RandomForest trained on full dataset")

# oob_score_ is only set when the fit ran with oob_score enabled.
if hasattr(final_rf, "oob_score_"):
    print(f"OOB R^2: {final_rf.oob_score_:.4f}")

# 5) Predict on the full dataset, then save the results table and the model.
#    NOTE: final_rf was fitted on these exact rows, so everything below is an
#    in-sample (training) fit. Expect it to look better than the
#    cross-validated / OOF scores printed earlier; it is not an estimate of
#    performance on unseen data.
preds = final_rf.predict(X)
results_df = pd.DataFrame({
    "Id": df["Id"] if "Id" in df.columns else np.arange(len(y)),
    "Actual_Recovery_Index": y,
    "Predicted_Recovery_Index": preds,
})
# Signed error is prediction minus actual: positive means over-prediction.
results_df["Error"] = results_df["Predicted_Recovery_Index"] - results_df["Actual_Recovery_Index"]
results_df["Absolute_Error"] = results_df["Error"].abs()
results_df.to_csv("rf_tuned_predictions_vs_actual.csv", index=False)
print("Saved rf_tuned_predictions_vs_actual.csv (first 5 rows):")
print(results_df.head(5))

mse_full = mean_squared_error(results_df["Actual_Recovery_Index"], results_df["Predicted_Recovery_Index"])
rmse_full = np.sqrt(mse_full)
mae_full = mean_absolute_error(results_df["Actual_Recovery_Index"], results_df["Predicted_Recovery_Index"])
print(f"Full-data metrics -> RMSE: {rmse_full:.4f}, MSE: {mse_full:.4f}, MAE: {mae_full:.4f}")

# The forest was trained on a bare ndarray, so it carries no column names.
# Store the training column order next to the model so that whoever loads the
# artifact can line features up correctly; the "model" key is unchanged.
joblib.dump(
    {"model": final_rf, "feature_names": list(inputs.columns)},
    "random_forest_tuned.joblib",
    compress=3,
)
print("Saved tuned random forest to random_forest_tuned.joblib")


Running RandomizedSearchCV on 8000 samples, 5 features
Fitting 30 folds for each of 40 candidates, totalling 1200 fits
RandomizedSearchCV done in 217.3s
Best params: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 0.6, 'max_depth': 30, 'bootstrap': True}
Best CV RMSE (search): 2.1899
OOF metrics -> RMSE: 2.1889, MSE: 4.7913, MAE: 1.7371
Final RandomForest trained on full dataset
OOB R^2: 0.9870
Saved rf_tuned_predictions_vs_actual.csv (first 5 rows):
     Id  Actual_Recovery_Index  Predicted_Recovery_Index     Error  \
0  9255                     36                 34.737555 -1.262445   
1  1562                     25                 25.347348  0.347348   
2  1671                     59                 58.679904 -0.320096   
3  6088                     22                 21.425181 -0.574819   
4  6670                     40                 40.257802  0.257802   

   Absolute_Error  
0        1.262445  
1        0.347348  
2        0.320096  
3      