In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

  import pkg_resources


In [2]:
# Data: load seaborn's `healthexp` example dataset and dummy-encode it in one step
data = pd.get_dummies(sns.load_dataset('healthexp'))

# Regression target vs. feature columns
y = data['Life_Expectancy']
X = data.drop(columns='Life_Expectancy')

# Keep 20% of the rows aside for the final evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [3]:
# Search space
# (low, high) bounds for every tuned hyperparameter; each one is sampled with
# hp.quniform using a step of 1 and a label equal to its dictionary key.
_bounds = {
    'n_estimators': (100, 1000),
    'max_depth': (5, 50),
    'min_samples_split': (2, 32),
    'min_samples_leaf': (1, 32),
}
space = {
    name: hp.quniform(name, low, high, 1)
    for name, (low, high) in _bounds.items()
}

def objective(params):
    """Evaluate one hyperparameter sample with 3-fold cross-validation.

    Args:
        params: Mapping of hyperparameter name to sampled value. Every value
            is cast to ``int`` before use; the keys 'n_estimators',
            'max_depth', 'min_samples_split' and 'min_samples_leaf' are
            forwarded to the random forest.

    Returns:
        dict: ``'loss'`` is the square root of the mean cross-validated MSE
        measured on ``X_train`` / ``y_train``; ``'status'`` is ``STATUS_OK``.
    """
    params_int = {name: int(value) for name, value in params.items()}
    forest_kwargs = {
        name: params_int[name]
        for name in ('n_estimators', 'max_depth',
                     'min_samples_split', 'min_samples_leaf')
    }
    forest = RandomForestRegressor(random_state=42, n_jobs=-1, **forest_kwargs)

    # The scorer yields *negative* MSE per fold, so the mean is negated
    # before taking the square root.
    neg_mse_per_fold = cross_val_score(
        forest, X_train, y_train,
        scoring='neg_mean_squared_error',
        cv=3, n_jobs=-1,
    )
    mean_mse = -neg_mse_per_fold.mean()
    return {'loss': np.sqrt(mean_mse), 'status': STATUS_OK}

In [4]:
# Hyperparameter optimization
# `trials` is handed to fmin so the per-evaluation results stay available
# afterwards (they are read back through `trials.results` later on).
trials = Trials()

# Run 50 evaluations of `objective` over `space` with the TPE suggester;
# the seeded generator keeps the sequence of suggestions repeatable.
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42),
)

100%|██████████| 50/50 [01:09<00:00,  1.38s/trial, best loss: 0.5185422809265616]


In [5]:
# Cast the values returned by fmin to int (the same conversion objective()
# applies) so they are valid RandomForestRegressor arguments.
best_int = {k: int(v) for k, v in best.items()}

# Refit on the full training split with the selected hyperparameters and
# predict the held-out test split.
final_model = RandomForestRegressor(**best_int, random_state=42, n_jobs=-1)
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

# Hold-out metrics; MSE is computed once and reused to derive RMSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print('Best params:', best_int)
print('MAE:', mae)
print('RMSE:', rmse)
print('R2:', r2)
print('MSE:', mse)

# Optional: inspect the five lowest (i.e. best) trial losses
print('Top 5 trial losses:', sorted(r['loss'] for r in trials.results)[:5])

Best params: {'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 727}
MAE: 0.323977639342008
RMSE: 0.392879881939725
R2: 0.9873568769958934
Top 5 trial losses: [0.5185422809265616, 0.6626430461084436, 0.6628788978782828, 0.7196034147673698, 0.7434909288895242]
