# Optimizing our Models

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

## Load and Process the Data
This uses the same procedure as in housing_prices.ipynb. It would make sense to extract this into its own distinct process so that we don't have to duplicate the code.

In [2]:
# Read the raw housing data and discard every row that has a missing value.
df = pd.read_csv('./data/housing.csv').dropna()

In [3]:
# Separate the regression target from the predictor columns.
y = df['median_house_value']
X = df.drop(columns='median_house_value')

In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [5]:
# Category order used for the ordinal codes: position in the list becomes the
# encoded value (INLAND -> 0.0, <1H OCEAN -> 1.0, ...).
categories = [['INLAND', '<1H OCEAN', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND']]
encoder = OrdinalEncoder(categories=categories)

# Fit on the training column, then replace it with its encoded values.
encoder.fit(X_train[['ocean_proximity']])
X_train['ocean_proximity'] = encoder.transform(X_train[['ocean_proximity']])

In [6]:
# Preview the first training rows; ocean_proximity now holds the ordinal codes.
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
19566,-120.96,37.61,23.0,3497.0,887.0,2467.0,816.0,1.9444,0.0
7292,-118.22,33.98,34.0,2225.0,753.0,2980.0,736.0,1.6685,1.0
17618,-121.94,37.28,27.0,2859.0,464.0,1144.0,430.0,5.0822,1.0
17518,-121.91,37.34,35.0,2189.0,607.0,1193.0,562.0,2.8042,1.0
5172,-118.28,33.95,41.0,835.0,208.0,707.0,192.0,1.4103,1.0


In [7]:
# Preview the first training targets (median_house_value) for the same rows.
y_train.head()

19566     93400.0
7292     128800.0
17618    327500.0
17518    240900.0
5172      86200.0
Name: median_house_value, dtype: float64

In [8]:
# Encode the test set with the encoder that was already fitted on the training
# data. Using transform (not fit_transform) keeps the test set out of the
# fitting step, so it is encoded with exactly the same mapping as X_train.
X_test['ocean_proximity'] = encoder.transform(X_test[['ocean_proximity']])

## Optimize our Models with Grid Search
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

The function below performs a grid search. To use it, we will define the parameters (including the type of model we wish to optimize).

In [10]:
def perform_grid_search(parameters, X=None, y=None, cv=3,
                        scoring='neg_mean_squared_error', n_jobs=-1):
    """Run an exhaustive grid search for every configured model.

    Parameters
    ----------
    parameters : dict
        Maps a display name to a dict with two keys: 'model' (the estimator
        to tune) and 'params' (the parameter grid handed to GridSearchCV).
    X, y : optional
        Features and target to search on. When omitted, the notebook-level
        X_train and y_train are used, which matches the original behaviour.
    cv : default 3
        Cross-validation setting forwarded to GridSearchCV.
    scoring : default 'neg_mean_squared_error'
        Scoring forwarded to GridSearchCV. With the default, scores are
        negated MSE values, so a score closer to zero is better.
    n_jobs : default -1
        Parallelism setting forwarded to GridSearchCV.

    Returns
    -------
    dict
        One entry per name in ``parameters`` holding 'best_score' and
        'best_params' as reported by the fitted GridSearchCV.
    """
    # Nothing to tune: return early without touching the training data.
    if not parameters:
        return {}

    # Fall back to the notebook's training split only when no data is passed.
    features = X_train if X is None else X
    target = y_train if y is None else y

    results = {}
    for name, setup in parameters.items():
        grid_search = GridSearchCV(
            setup['model'], setup['params'],
            cv=cv, scoring=scoring, n_jobs=n_jobs)
        grid_search.fit(features, target)
        results[name] = {
            'best_score': grid_search.best_score_,
            'best_params': grid_search.best_params_
        }
    return results

### Decision Tree

In [None]:
# Search space for a single decision tree.
# The grid below has 5184 combinations, so expect this search to take a while.
# Note: 'splitter': ['best', 'random'] is currently excluded from the grid.
decision_tree_params = {
    'DecisionTree': dict(
        model=DecisionTreeRegressor(random_state=42),
        params=dict(
            criterion=['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            max_depth=[None, 10],
            min_samples_split=[2, 3],
            min_samples_leaf=[1, 14, 20],
            min_weight_fraction_leaf=[0.0, 0.01],
            max_features=[None, 'sqrt', 'log2'],
            max_leaf_nodes=[None, 100, 150],
            min_impurity_decrease=[0.0, 0.01],
            ccp_alpha=[0.0, 0.01, 0.02],
        ),
    ),
}

In [11]:
# Tune the decision tree; the reported best_score is a negated MSE (closer to zero is better).
perform_grid_search(decision_tree_params)

{'DecisionTree': {'best_score': -3427705884.6115246,
  'best_params': {'ccp_alpha': 0.0,
   'criterion': 'poisson',
   'max_depth': None,
   'max_features': None,
   'max_leaf_nodes': None,
   'min_impurity_decrease': 0.01,
   'min_samples_leaf': 14,
   'min_samples_split': 2,
   'min_weight_fraction_leaf': 0.0}}}

### Random Forest

In [12]:
# Search space for the random forest: 3 forest sizes x 3 feature-sampling
# strategies = 9 combinations.
random_forest_params = {
    'RandomForest': dict(
        model=RandomForestRegressor(random_state=42),
        params=dict(
            n_estimators=[10, 100, 400],
            max_features=[None, 'sqrt', 'log2'],
        ),
    ),
}

In [13]:
# Tune the random forest; the reported best_score is a negated MSE (closer to zero is better).
perform_grid_search(random_forest_params)

{'RandomForest': {'best_score': -2573219444.0531,
  'best_params': {'max_features': 'sqrt', 'n_estimators': 400}}}

### Gradient Boosting

In [14]:
# Search space for histogram-based gradient boosting: 3 iteration counts x
# 3 learning rates = 9 combinations.
gradient_boosting_params = {
    'HistGradientBoosting': dict(
        model=HistGradientBoostingRegressor(random_state=42),
        params=dict(
            max_iter=[50, 100, 150],
            learning_rate=[0.01, 0.1, 0.2],
        ),
    ),
}

In [15]:
# Tune histogram gradient boosting; the reported best_score is a negated MSE (closer to zero is better).
perform_grid_search(gradient_boosting_params)

{'HistGradientBoosting': {'best_score': -2300070904.601216,
  'best_params': {'learning_rate': 0.2, 'max_iter': 150}}}

### XGBoost

In [16]:
# Search space for XGBoost: five hyperparameters with three values each,
# i.e. 243 combinations.
xgboost_params = {
    'XGBoost': dict(
        model=xgb.XGBRegressor(random_state=42),
        params=dict(
            n_estimators=[100, 200, 300],
            learning_rate=[0.01, 0.1, 0.2],
            max_depth=[3, 6, 9],
            subsample=[0.5, 0.7, 1.0],
            colsample_bytree=[0.5, 0.7, 1.0],
        ),
    ),
}

In [17]:
# Tune XGBoost; the reported best_score is a negated MSE (closer to zero is better).
perform_grid_search(xgboost_params)

{'XGBoost': {'best_score': -2265082406.696934,
  'best_params': {'colsample_bytree': 1.0,
   'learning_rate': 0.1,
   'max_depth': 6,
   'n_estimators': 300,
   'subsample': 0.7}}}

# Activities

## Perform a Grid Search
Compare the results of using grid search to tune hyperparameters with the results of the corresponding models in the housing_prices.ipynb notebook. Did we improve on the results for all regressors?

Where we got worse results, this suggests that we didn't configure our search to explore thoroughly enough. Examine the hyperparameters for each model type and use those to prepare a more thorough hyperparameter optimization.
 - https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
 - https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
 - https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html
 - https://xgboost.readthedocs.io/en/stable/python/python_api.html
 - https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html

Experiment with hyperparameter optimization and compare your results with the default regressors we used in the ensemble methods session (housing_prices.ipynb).

Note that the scores returned by GridSearchCV are negated MSE values obtained by cross-validation on the training data only. To properly evaluate the regressors, we will still want to create instances of these regressors using the optimized hyperparameters and evaluate them with the test data.

__CAUTION__: The goal is to get familiar with hyperparameter tuning, not to use electricity for hours and hours training models. It is easy to get carried away!

## Activity: Examine Other Solutions
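Examine the solutions that other people have developed for the California Housing Prices dataset with a view to understanding how they have tried to improve their models. For example, user OMARAYMANATIA achieves an error of 48911 at https://www.kaggle.com/code/omaraymanatia/california-housing-prices-prediction. A value of this size is on the scale of a root mean squared error (RMSE) rather than an MSE, so take the square root of the MSE values in this notebook before comparing.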

Kaggle conveniently shows you the rating of users (OMARAYMANATIA is a rated Kaggle Expert because of his notebooks) so we can focus on the work of the more experienced Kaggle users.

From your explorations, compile a list of questions and ideas to try. Sort these by priority, taking into account how much work is involved (effort) and estimated likely payoff (value). Low-effort high-value actions are a great place to continue building your knowledge and skills. What list did you come up with?



## Optional Activity: Experiment with Other Optimizers 
Scikit-learn has other hyperparameter optimizers and there are additional approaches provided by other modules. You may find that you get better results more quickly with those. For example, the Optuna code below found a set of hyperparameter values in just over 30 seconds that were as good as the ones that took GridSearch over 12 minutes to find. See https://optuna.org/ for more information.

In [18]:
import optuna  # Optuna needs to be installed first
from sklearn.model_selection import cross_val_score

In [None]:
def objective(trial):
    """Score one Optuna trial of a decision tree and return its mean MSE.

    Hyperparameters are drawn from ``trial``, a DecisionTreeRegressor is
    built from them, and it is scored with 3-fold cross-validation on the
    notebook-level X_train / y_train. The returned value is the positive
    mean squared error averaged over the folds, so lower is better.
    """
    # The search space is kept close to the decision-tree grid used with
    # GridSearchCV so that both optimizers can be compared; feel free to
    # experiment further.
    pick = trial.suggest_categorical
    hyperparams = dict(
        criterion=pick('criterion', ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']),
        max_depth=pick('max_depth', [None, 10]),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 3),
        min_samples_leaf=pick('min_samples_leaf', [1, 14, 20]),
        min_weight_fraction_leaf=trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.01),
        max_features=pick('max_features', [None, 'sqrt', 'log2']),
        max_leaf_nodes=pick('max_leaf_nodes', [None, 100, 150]),
        min_impurity_decrease=trial.suggest_float('min_impurity_decrease', 0.0, 0.01),
        ccp_alpha=trial.suggest_float('ccp_alpha', 0.0, 0.02),
    )

    # Build the candidate tree and score it with cross-validation.
    tree = DecisionTreeRegressor(random_state=42, **hyperparams)
    fold_scores = cross_val_score(
        tree, X_train, y_train,
        cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

    # The scorer reports negated MSE; flip the sign to return a positive MSE.
    return -fold_scores.mean()


# Create a study object that will find the hyperparameters that minimize the objective.
# The sampler is seeded so that the sequence of suggested trials is repeatable;
# without a seed every run explores different hyperparameters and the reported
# best score changes from run to run. TPESampler is the sampler Optuna uses by
# default for single-objective studies, so only the seeding is new here.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42))
# You can adjust the number of trials here
study.optimize(objective, n_trials=100)

# Fetch the best parameters and the best score achieved
best_params = study.best_params
best_score = study.best_value

print("Best score:", best_score)
print("Best parameters:", best_params)

[I 2024-11-27 21:32:31,867] A new study created in memory with name: no-name-33f96b3a-efac-4a27-a3ef-2667e1428b4a
[I 2024-11-27 21:32:31,935] Trial 0 finished with value: 4203536204.5373363 and parameters: {'criterion': 'friedman_mse', 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.008674371674545113, 'max_features': None, 'max_leaf_nodes': 150, 'min_impurity_decrease': 0.005411745683945409, 'ccp_alpha': 0.0015513679472351116}. Best is trial 0 with value: 4203536204.5373363.
[I 2024-11-27 21:32:31,982] Trial 1 finished with value: 4627136453.0803995 and parameters: {'criterion': 'poisson', 'max_depth': None, 'min_samples_split': 3, 'min_samples_leaf': 14, 'min_weight_fraction_leaf': 0.0022337914006896097, 'max_features': 'sqrt', 'max_leaf_nodes': 100, 'min_impurity_decrease': 0.006135551720895618, 'ccp_alpha': 0.0014644888517110876}. Best is trial 0 with value: 4203536204.5373363.
[I 2024-11-27 21:32:32,050] Trial 2 finished with value: 41

Best score: 3427705884.6115246
Best parameters: {'criterion': 'poisson', 'max_depth': None, 'min_samples_split': 3, 'min_samples_leaf': 14, 'min_weight_fraction_leaf': 0.0005950955895181408, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.004417709694515676, 'ccp_alpha': 0.010074886099655324}
