In [6]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor #"base" model

# Seed handed to the AdaBoost regressors as `random_state` to keep runs repeatable.
RANDOM_STATE = 42

In [7]:
# Read the processed training set and split it into model inputs and the regression target.
target = 'Recovery Index'
features = [
    'Therapy Hours',
    'Initial Health Score',
    'Lifestyle Activities',
    'Average Sleep Hours',
    'Follow-Up Sessions',
]

df_train = pd.read_csv('../data/train_processed.csv')
X, y = df_train[features], df_train[target]

# Quick dimension check before any modelling.
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

Features (X) shape: (8000, 5)
Target (y) shape: (8000,)


In [8]:
# --- Model 7: AdaBoost Regressor ---
# Grid-search AdaBoost over n_estimators / learning_rate with 10-fold CV,
# selecting and reporting on RMSE.

print("Starting Model 7: AdaBoost Tuning...")
print("This may take a few minutes...")

# AdaBoost needs a base model. We'll use a simple Decision Tree, as this is standard practice.
base_estimator = DecisionTreeRegressor(max_depth=5)

model_ada = AdaBoostRegressor(
    estimator=base_estimator,
    random_state=RANDOM_STATE
)

# Define the parameters to search
# n_estimators: Maximum number of trees fitted sequentially (boosting rounds).
# learning_rate: Weight applied to each tree's contribution at every boosting
#                step; it trades off against n_estimators.
param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1, 1.0]
}

# Set up GridSearchCV for RMSE.
# Score every fold with RMSE itself: taking the square root of the *averaged*
# MSE afterwards is not the same number as the average of the per-fold RMSEs,
# so the MSE scorer would make the "Average 10-Fold RMSE" label below wrong.
search_rmse = GridSearchCV(
    model_ada,
    param_grid,
    cv=10,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1 # This will print updates
)

# Run the tuning
search_rmse.fit(X, y)
print("\n--- AdaBoost Results (RMSE) ---")
print(f"Best Parameters found: {search_rmse.best_params_}")
# The scorer is the negated RMSE (scikit-learn maximises scores), so flip the sign back.
best_rmse = -search_rmse.best_score_
print(f"Average 10-Fold RMSE (from best model): {best_rmse:.3f}")

Starting Model 7: AdaBoost Tuning...
This may take a few minutes...
Fitting 10 folds for each of 6 candidates, totalling 60 fits

--- AdaBoost Results (RMSE) ---
Best Parameters found: {'learning_rate': 1.0, 'n_estimators': 100}
Average 10-Fold RMSE (from best model): 2.380


In [9]:
# --- AdaBoost Submission Cell ---
# Refit AdaBoost on the full training set with the tuned hyper-parameters,
# predict the test set, and write the submission CSV.

print("Loading test data...")
X_test = pd.read_csv('../data/test_processed.csv')
test_ids = pd.read_csv('../data/test_ids.csv')
print("Test data and IDs loaded.")

# Predictions are paired with IDs by position, so both files must have the same length.
assert len(test_ids) == len(X_test), "test_ids.csv and test_processed.csv row counts differ"

best_params = search_rmse.best_params_
print(f"\nTraining a new model with best params: {best_params}")

# Reuse the base tree that was configured for the grid search so the final
# model cannot silently drift from the configuration that was actually tuned.
final_ada_model = AdaBoostRegressor(
    estimator=base_estimator,
    random_state=RANDOM_STATE
)

final_ada_model.set_params(**best_params)

final_ada_model.fit(X, y)
print("Model trained successfully.")

print("\nGenerating predictions...")
# Select the training columns explicitly (same names, same order) so extra or
# reordered columns in the test file cannot break or misalign the prediction.
predictions = final_ada_model.predict(X_test[features])

print("Creating submission file...")
submission_df = pd.DataFrame({'Id': test_ids['Id'], 'Recovery Index': predictions})

submission_path = '../submission/submission_model7.csv'
# Create the output directory if needed so a fresh checkout can run this cell.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=False)

print(f"\nAdaBoost submission file saved to: {submission_path}")
submission_df.head()

Loading test data...
Test data and IDs loaded.

Training a new model with best params: {'learning_rate': 1.0, 'n_estimators': 100}
Model trained successfully.

Generating predictions...
Creating submission file...

AdaBoost submission file saved to: ../submission/submission_model7.csv


Unnamed: 0,Id,Recovery Index
0,6253,54.800664
1,4685,22.443936
2,1732,45.953061
3,4743,32.02584
4,4522,44.765625
