In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer, mean_squared_error

# Seed passed as random_state to the random forest estimators below so that
# repeated runs build the same forests.
RANDOM_STATE = 42

In [2]:
# Read the preprocessed training set from disk.
df_train = pd.read_csv('../data/train_processed.csv')

# Response column, followed by the five predictor columns fed to the model.
target = 'Recovery Index'
features = [
    'Therapy Hours',
    'Initial Health Score',
    'Lifestyle Activities',
    'Average Sleep Hours',
    'Follow-Up Sessions',
]

# Split the frame into the design matrix (X) and the response vector (y).
X, y = df_train[features], df_train[target]

# Quick dimension check before any model fitting.
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

Features (X) shape: (8000, 5)
Target (y) shape: (8000,)


In [None]:
# --- Model 6: Random Forest Regressor ---
#
# Tunes a random forest with ONE 10-fold grid search that records both RMSE
# and R-squared for every candidate. The candidate with the lowest mean
# cross-validated RMSE is selected and refit on all of (X, y); the R-squared
# reported below belongs to that same candidate.

print("Starting Model 6: Random Forest Tuning...")
print("This may take several minutes...")

# Initialize the model.
# n_jobs=1 on the estimator: GridSearchCV below already spreads the individual
# fits over all cores, so the forest itself does not also need an all-core pool.
# The fitted trees do not depend on n_jobs, only the wall-clock time does.
model_rf = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=1)

# Define the parameters to search.
# n_estimators: Number of trees in the forest.
# max_depth: Max depth of each tree (None = grow until the leaf limit is hit).
# min_samples_leaf: Min patients required in a final leaf.
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [10, 20, None],
    'min_samples_leaf': [5, 10]
}

# Both metrics are evaluated on the same folds in a single pass.
# 'neg_root_mean_squared_error' scores each fold by its RMSE, so the mean over
# folds really is the average 10-fold RMSE (the square root of the mean fold
# MSE is a different quantity).
scoring = {
    'rmse': 'neg_root_mean_squared_error',
    'r2': 'r2',
}

# Set up GridSearchCV.
# NOTE: We are using cv=10 as required, but for RF, this is slow.
# verbose=1 will print updates so you know it's not stuck.
search_rmse = GridSearchCV(
    model_rf,
    param_grid,
    cv=10,
    scoring=scoring,
    refit='rmse',  # choose the winner (and refit it on all data) by RMSE
    n_jobs=-1,
    verbose=1
)

search_rmse.fit(X, y)

# With refit='rmse', best_score_ is the mean test 'rmse' score of the winner.
# scikit-learn negates error metrics so that higher is better; flip the sign
# back to get a positive RMSE.
best_rmse = -search_rmse.best_score_
# R-squared of the very same candidate, read from the shared results table.
best_r2 = search_rmse.cv_results_['mean_test_r2'][search_rmse.best_index_]

print("\n--- Random Forest Results (RMSE) ---")
print(f"Best Parameters found: {search_rmse.best_params_}")
print(f"Average 10-Fold RMSE (from best model): {best_rmse:.3f}")

print("\n--- Random Forest Results (R-squared) ---")
print(f"Average 10-Fold R-squared (from best model): {best_r2:.3f}")

Starting Model 5: Random Forest Tuning...
This may take several minutes...
Fitting 10 folds for each of 12 candidates, totalling 120 fits

--- Random Forest Results (RMSE) ---
Best Parameters found: {'max_depth': None, 'min_samples_leaf': 5, 'n_estimators': 150}
Average 10-Fold RMSE (from best model): 2.213

--- Random Forest Results (R-squared) ---
Average 10-Fold R-squared (from best model): 0.987


In [None]:
# --- Random Forest Submission Cell ---
#
# Refits a random forest on the full training data using the hyperparameters
# chosen by the grid search, predicts the test set, and writes the rounded
# predictions next to their Ids as a CSV submission file.

print("Loading test data...")
X_test = pd.read_csv('../data/test_processed.csv')
test_ids = pd.read_csv('../data/test_ids.csv')
print("Test data and IDs loaded.")

# Fail early (before the refit) if the test file cannot be lined up with the
# training features or with the Id file.
missing_cols = [col for col in features if col not in X_test.columns]
if missing_cols:
    raise ValueError(f"Test data is missing feature columns: {missing_cols}")
if len(X_test) != len(test_ids):
    raise ValueError(
        f"Row count mismatch: {len(X_test)} test rows vs {len(test_ids)} test ids"
    )

# Get the best parameters from the search
best_params = search_rmse.best_params_
print(f"\nTraining a new model with best params: {best_params}")

# Same seed as during tuning, with the tuned hyperparameters applied on top.
final_rf_model = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1)
final_rf_model.set_params(**best_params)

final_rf_model.fit(X, y)
print("Model trained successfully.")

print("\nGenerating predictions...")
# Select exactly the training columns, in training order, so that any extra
# column in the CSV (e.g. a saved index or an Id) never reaches the model.
predictions = final_rf_model.predict(X_test[features])

print("Creating submission file...")
submission_df = pd.DataFrame({'Id': test_ids['Id'], 'Recovery Index': predictions})
# The submission stores the index as whole numbers.
submission_df['Recovery Index'] = submission_df['Recovery Index'].round().astype(int)

submission_path = '../submission/submission_model6.csv'
submission_df.to_csv(submission_path, index=False)

print(f"\nRandom Forest submission file saved to: {submission_path}")
submission_df.head()

Loading test data...
Test data and IDs loaded.

Training a new model with best params: {'max_depth': None, 'min_samples_leaf': 5, 'n_estimators': 150}
Model trained successfully.

Generating predictions...
Creating submission file...

Random Forest submission file saved to: ../submission/submission_random_forest.csv


Unnamed: 0,Id,Recovery Index
0,6253,56
1,4685,23
2,1732,48
3,4743,29
4,4522,43
