In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Global plotting defaults: apply seaborn's 'darkgrid' style, then set the
# default matplotlib figure size to 24 x 16 for every figure created later.
sns.set_style('darkgrid')
mpl.rcParams.update({'figure.figsize': [24, 16]})

### Why tune your model? 

#### Untuned model example

In [3]:
import xgboost as xgb

# Read the processed Ames housing table. Every column except the last is used
# as a feature; the last column is the regression target.
housing_data = pd.read_csv('data/ames_housing_trimmed_processed.csv')
feature_names = housing_data.columns.tolist()[:-1]
target_name = housing_data.columns.tolist()[-1]
X = housing_data[feature_names]
y = housing_data[target_name]

# Wrap features and target in a DMatrix, the input type passed to xgb.cv below.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

In [4]:
# Baseline: 4-fold cross-validation with only the objective specified, so all
# other booster parameters and the number of boosting rounds stay at the
# library defaults.
untuned_params={'objective':'reg:squarederror'}
untuned_cv_results_rmse = xgb.cv(dtrain=housing_dmatrix,
                                params=untuned_params,
                                nfold=4, metrics="rmse",
                                as_pandas=True, seed=123)

# Extract the last round's mean test RMSE as a scalar with .iloc[-1].
# Formatting the one-row Series from .tail(1) with %f relies on the implicit
# float(Series) conversion, which pandas has deprecated.
print('Untuned rmse: %f' % untuned_cv_results_rmse["test-rmse-mean"].iloc[-1])

Untuned rmse: 34624.229980


In [5]:
# Hand-picked settings to compare against the untuned baseline: column
# subsampling per tree, an explicit learning rate, and a depth limit.
tuned_params = {"objective":"reg:squarederror", "colsample_bytree": 0.3,
               "learning_rate":0.1, "max_depth":5}

# 4-fold cross-validation over 200 boosting rounds with the same seed as the
# baseline run.
tuned_cv_results_rmse = xgb.cv(dtrain=housing_dmatrix,
                              params=tuned_params, nfold=4,
                              num_boost_round=200, metrics="rmse",
                              as_pandas=True, seed=123)

# Extract the last round's mean test RMSE as a scalar with .iloc[-1].
# Formatting the one-row Series from .tail(1) with %f relies on the implicit
# float(Series) conversion, which pandas has deprecated.
print("Tuned rmse: %f" % tuned_cv_results_rmse['test-rmse-mean'].iloc[-1])

Tuned rmse: 31111.041992


In [6]:
# Build the DMatrix consumed by xgb.cv
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Booster settings shared by every run in this experiment
params = {'objective':'reg:squarederror', 'max_depth':3}

# Candidate values for the number of boosting rounds
num_rounds = [5, 10, 15]

# Collects the last-round mean test RMSE of each cross-validated model
final_rmse_per_round = []

# Run one 3-fold cross-validation per candidate number of boosting rounds
for curr_num_rounds in num_rounds:
    cv_results = xgb.cv(
        dtrain=housing_dmatrix,
        params=params,
        nfold=3,
        num_boost_round=curr_num_rounds,
        metrics='rmse',
        as_pandas=True,
        seed=123,
    )

    # Keep only the final row's mean test RMSE for this candidate
    test_rmse_means = cv_results['test-rmse-mean']
    final_rmse_per_round.append(test_rmse_means.values[-1])

# Pair each candidate with its RMSE and print the comparison table
num_rounds_rmses = list(zip(num_rounds, final_rmse_per_round))
print(pd.DataFrame(num_rounds_rmses, columns=['num_boosting_rounds','rmse']))

   num_boosting_rounds          rmse
0                    5  50903.300781
1                   10  34774.192708
2                   15  32895.098307


In [7]:
# Build the DMatrix consumed by xgb.cv
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Booster settings for this run
params = {"objective":"reg:squarederror", "max_depth":4}

# 3-fold cross-validation for at most 50 boosting rounds, with
# early_stopping_rounds=10 so the run can end before round 50.
cv_results = xgb.cv(
    dtrain=housing_dmatrix,
    params=params,
    metrics='rmse',
    nfold=3,
    early_stopping_rounds=10,
    num_boost_round=50,
    as_pandas=True,
    seed=123,
)

# Show only the last three rows of the results table
print(cv_results.tail(3))

    train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
47     11071.315430      604.089695    30732.664062    1966.998275
48     10950.778646      574.862348    30712.240885    1957.751118
49     10824.865560      576.666458    30720.854818    1950.511520


### Common tree tunable parameters
- **learning rate:** learning rate/eta
- **gamma:** min loss reduction to create new tree split
- **lambda:** L2 (ridge) regularization on leaf weights
- **alpha:** L1 (lasso) regularization on leaf weights
- **max_depth:** max depth per tree
- **subsample:** % samples used per tree
- **colsample_bytree:** % features used per tree

### Linear tunable parameters
- **lambda:** L2 reg on weights
- **alpha:** L1 reg on weights
- **lambda_bias:** L2 reg term on bias
- You can also tune the number of estimators used for both base model types.

In [8]:
# Build the DMatrix consumed by xgb.cv
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Base booster settings; 'eta' is filled in on each pass of the loop below
params = {"objective":"reg:squarederror", "max_depth":3}

# Learning rates to compare, plus a list for each run's last-row test RMSE
eta_vals = [0.001, 0.01, 0.1]
best_rmse = []

# Systematically vary eta, cross-validating once per value
for curr_val in eta_vals:

    params.update(eta=curr_val)

    # 3-fold CV, at most 10 boosting rounds, early_stopping_rounds=5
    cv_results = xgb.cv(
        dtrain=housing_dmatrix,
        params=params,
        metrics='rmse',
        nfold=3,
        early_stopping_rounds=5,
        num_boost_round=10,
        as_pandas=True,
        seed=123,
    )

    # Record the mean test RMSE from the last row of the returned history
    best_rmse.append(cv_results['test-rmse-mean'].values[-1])

# Print one row per eta value with its RMSE
eta_rmse_pairs = list(zip(eta_vals, best_rmse))
print(pd.DataFrame(eta_rmse_pairs, columns=['eta','best_rmse']))

     eta      best_rmse
0  0.001  195736.401042
1  0.010  179932.177083
2  0.100   79759.414063


In [9]:
# Build the DMatrix consumed by xgb.cv
housing_dmatrix = xgb.DMatrix(data=X,label=y)

# Base booster settings; 'max_depth' is filled in on each pass of the loop
params = {"objective":"reg:squarederror"}

# Tree depths to compare, plus a list for each run's last-row test RMSE
max_depths = [2,5,10,20]
best_rmse = []

# Systematically vary max_depth, cross-validating once per value
for curr_value in max_depths:

    params.update(max_depth=curr_value)

    # 3-fold CV, at most 10 boosting rounds, early_stopping_rounds=5
    cv_results = xgb.cv(
        dtrain=housing_dmatrix,
        params=params,
        metrics='rmse',
        nfold=3,
        early_stopping_rounds=5,
        num_boost_round=10,
        as_pandas=True,
        seed=123,
    )

    # Record the mean test RMSE from the last row of the returned history
    best_rmse.append(cv_results['test-rmse-mean'].values[-1])

# Print one row per max_depth value with its RMSE
depth_rmse_pairs = list(zip(max_depths, best_rmse))
print(pd.DataFrame(depth_rmse_pairs, columns=['max_depth', 'best_rmse']))

   max_depth     best_rmse
0          2  37044.029948
1          5  33210.039063
2         10  34503.430990
3         20  34847.684896


In [10]:
# Build the DMatrix consumed by xgb.cv
housing_dmatrix = xgb.DMatrix(data=X,label=y)

# Base booster settings; 'colsample_bytree' is filled in inside the loop
params={"objective":"reg:squarederror","max_depth":3}

# Column-sampling ratios to compare, plus a list for each run's test RMSE
colsample_bytree_vals = [0.1, 0.5, 0.8, 1]
best_rmse = []

# Systematically vary colsample_bytree, cross-validating once per value
for curr_val in colsample_bytree_vals:

    params.update(colsample_bytree=curr_val)

    # 2-fold CV, at most 10 boosting rounds, early_stopping_rounds=5
    cv_results = xgb.cv(
        dtrain=housing_dmatrix,
        params=params,
        nfold=2,
        num_boost_round=10,
        early_stopping_rounds=5,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )

    # Record the mean test RMSE from the last row of the returned history
    best_rmse.append(cv_results["test-rmse-mean"].values[-1])

# Print one row per colsample_bytree value with its RMSE
colsample_rmse_pairs = list(zip(colsample_bytree_vals, best_rmse))
print(pd.DataFrame(colsample_rmse_pairs, columns=["colsample_bytree","best_rmse"]))

   colsample_bytree     best_rmse
0               0.1  48193.453125
1               0.5  36013.541015
2               0.8  35932.962891
3               1.0  35836.042969


### Review of grid search and random search

#### Grid search: review
- Search exhaustively over a given set of hyperparameters, once per set of hyperparameters
- Number of models = number of distinct values per hyperparameter multiplied across each hyperparameter
- Pick final model hyperparameter values that give best cross-validated evaluation metric value

In [11]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import GridSearchCV

# Reload the housing table; the last column is the target, the rest are features
housing_data = pd.read_csv('data/ames_housing_trimmed_processed.csv')
column_names = housing_data.columns.tolist()
X = housing_data[column_names[:-1]]
y = housing_data[column_names[-1]]
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# 4 learning rates x 1 ensemble size x 3 subsample ratios = 12 candidates
gbm_param_grid = {
    'learning_rate': [0.01, 0.1, 0.5, 0.9],
    'n_estimators': [200],
    'subsample': [0.3, 0.5, 0.9],
}

# Exhaustive search over the grid, scored by negated MSE with 4-fold CV
gbm = xgb.XGBRegressor(objective='reg:squarederror')
grid_mse = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    scoring='neg_mean_squared_error',
    cv=4,
    verbose=1,
)
grid_mse.fit(X,y)

# The score is a negated MSE, so take its magnitude before the square root
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ",np.sqrt(np.abs(grid_mse.best_score_)))

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:   27.9s finished


Best parameters found:  {'learning_rate': 0.1, 'n_estimators': 200, 'subsample': 0.5}
Lowest RMSE found:  28410.039476552454


#### Random search: review
- Create a (possibly infinite) range of hyperparameter values per hyperparameter that you would like to search over
- Set the number of iterations you would like for the random search to continue 
- During each iteration, randomly draw a value in the range of specified values for each hyperparameter searched over and train/evaluate a model with those hyperparameters
- After you've reached the maximum number of iterations, select the hyperparameter configuration with the best evaluated score

In [12]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values: learning_rate and subsample each take the 20 values
# 0.05, 0.10, ..., 1.00; n_estimators is fixed at 200.
gbm_param_grid = {'learning_rate': np.arange(0.05,1.05,0.05),
                 'n_estimators': [200],
                 'subsample': np.arange(0.05,1.05,0.05)}

gbm = xgb.XGBRegressor(objective='reg:squarederror')

# Draw 25 random candidates from the grid and score each by negated MSE with
# 4-fold CV. random_state pins which candidates are drawn; without it the
# sampled configurations, and so the reported best parameters and RMSE,
# change on every run. 123 matches the seed used by the xgb.cv calls.
randomized_mse = RandomizedSearchCV(estimator=gbm, param_distributions=gbm_param_grid,
                                   n_iter=25, scoring='neg_mean_squared_error',
                                   cv=4, verbose=1, random_state=123)

randomized_mse.fit(X,y)

# The score is a negated MSE, so take its magnitude before the square root
print('Best parameters found: ', randomized_mse.best_params_)
print('Lowest RMSE found: ', np.sqrt(np.abs(randomized_mse.best_score_)))

Fitting 4 folds for each of 25 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   57.6s finished


Best parameters found:  {'subsample': 0.35000000000000003, 'n_estimators': 200, 'learning_rate': 0.15000000000000002}
Lowest RMSE found:  27959.816747233395


In [14]:
# Search space: 2 column-sampling ratios x 1 ensemble size x 2 tree depths
gbm_param_grid = dict(
    colsample_bytree=[0.3, 0.7],
    n_estimators=[50],
    max_depth=[2, 5],
)

# Base regressor that the grid search configures for each candidate
gbm = xgb.XGBRegressor(objective='reg:squarederror')

# Exhaustive search over the grid, scored by negated MSE with 4-fold CV
grid_mse = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    scoring='neg_mean_squared_error',
    cv=4,
    verbose=1,
)

# Run the search on the full feature matrix and target
grid_mse.fit(X,y)

# The score is a negated MSE, so take its magnitude before the square root
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

Fitting 4 folds for each of 4 candidates, totalling 16 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    2.0s finished


Best parameters found:  {'colsample_bytree': 0.7, 'max_depth': 5, 'n_estimators': 50}
Lowest RMSE found:  29916.562522854438
