In [1]:
import time
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib

In [2]:
df = pd.read_csv("train_preprocessed.csv")

In [3]:
inputs = df.drop(["Recovery Index", "Id"], axis=1)
target = df["Recovery Index"]
X = inputs.values
y = target.values
print(f"Samples: {X.shape[0]}, Features: {X.shape[1]}")

Samples: 8000, Features: 5


In [4]:
k_values = [5,10,15,18,20,25]
cv_summary = {}
base = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
for k in k_values:
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    start = time.time()
    scores = cross_val_score(base, X, y, scoring="neg_mean_squared_error", cv=kf, n_jobs=-1)
    elapsed = time.time() - start
    rmses = np.sqrt(-scores)
    cv_summary[k] = {"mean_rmse": rmses.mean(), "std_rmse": rmses.std(), "time_sec": elapsed}
    print(f"k={k}: RMSE mean={rmses.mean():.4f}, std={rmses.std():.4f}, time={elapsed:.1f}s")

k=5: RMSE mean=2.3458, std=0.0443, time=2.6s
k=10: RMSE mean=2.3547, std=0.0541, time=2.7s
k=15: RMSE mean=2.3461, std=0.0944, time=3.6s
k=18: RMSE mean=2.3558, std=0.0923, time=2.4s
k=20: RMSE mean=2.3542, std=0.0844, time=2.7s
k=25: RMSE mean=2.3528, std=0.0934, time=3.3s


In [5]:
best_k = min(cv_summary, key=lambda kk: cv_summary[kk]["mean_rmse"])
print(f"Selected k = {best_k}")

Selected k = 5


In [6]:
param_grid = {
    "n_estimators": [50, 100, 150, 200],
    "max_depth": [None,9, 10, 20],
    "max_features": ["sqrt", 0.8],
    "min_samples_leaf": [1, 2, 4]
}
kf = KFold(n_splits=best_k, shuffle=True, random_state=42)
rfr = RandomForestRegressor(random_state=42, n_jobs=-1)
gscv = GridSearchCV(rfr, param_grid, scoring="neg_mean_squared_error", cv=kf, n_jobs=-1, verbose=1)
start = time.time()
gscv.fit(X, y)
print(f"GridSearchCV finished in {time.time() - start:.1f}s")


Fitting 5 folds for each of 96 candidates, totalling 480 fits


GridSearchCV finished in 34.6s


In [7]:
best_params = gscv.best_params_
best_cv_rmse = np.sqrt(-gscv.best_score_)
print(f"Best params: {best_params}, CV RMSE: {best_cv_rmse:.4f}")

Best params: {'max_depth': 10, 'max_features': 0.8, 'min_samples_leaf': 4, 'n_estimators': 200}, CV RMSE: 2.2078


In [8]:
final_rf = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
final_rf.fit(X, y)
print("Final RandomForest trained on full dataset")

Final RandomForest trained on full dataset


In [9]:
preds=final_rf.predict(X)
results_df=pd.DataFrame({
    'Id':df['Id'],
    'Actual RI':y,
    'Predicted RI':preds,
    'Absolute Error':abs(preds-y)
})
print("First 10 rows: actual vs predicted")
results_df.head(10)


First 10 rows: actual vs predicted


Unnamed: 0,Id,Actual RI,Predicted RI,Absolute Error
0,9255,36,34.379082,1.620918
1,1562,25,25.310262,0.310262
2,1671,59,58.718747,0.281253
3,6088,22,21.173292,0.826708
4,6670,40,39.719329,0.280671
5,5934,33,34.993699,1.993699
6,8830,30,29.556383,0.443617
7,7946,89,89.722107,0.722107
8,3509,46,44.013512,1.986488
9,2003,83,79.975595,3.024405


In [10]:
mse_full = mean_squared_error(results_df['Actual RI'], results_df['Predicted RI'])
rmse_full = np.sqrt(mse_full)
print(f"RMSE for the data ->{rmse_full}")
print(f"MSE for the data ->{mse_full}")

RMSE for the data ->1.7826636684768726
MSE for the data ->3.177889754907421


In [11]:
joblib.dump({"model": final_rf}, "random_forest_final.joblib", compress=3)
print("Saved random_forest_final.joblib")

Saved random_forest_final.joblib
