# Grid search notebook
author: Gonzalo Miranda Cabrera

objective: perform a grid search around the hyperparameters found in the randomized search to find an even better combination.

summary:
1. read the data and split it into train and test sets.
2. define the pipelines to be trained.
3. prepare the grid search with the hyperparameters found in the randomized search.
4. train the pipelines.
5. show best scores in training data.
6. save results to file in grid_search_results folder.
7. save models.

In [1]:
import pandas as pd
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

from joblib import load, dump
import sys


## Split into train and test sets

In [2]:
# Load the dataset, using the order_id column as the index.
data = pd.read_csv("data.csv", index_col="order_id")

# Detach the target column first so that only feature columns remain in `data`.
y = data.pop("total_minutes").to_numpy()
X = data.to_numpy()

# 50/50 hold-out split; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

# Cell output: the shape of every split, in (x_train, x_test, y_train, y_test) order.
tuple(split.shape for split in (x_train, x_test, y_train, y_test))


((3911, 37), (3912, 37), (3911,), (3912,))

## Define pipelines

### Pipeline composition
1. Standard scaler -> standardizes the data with mean and std
2. PCA -> Principal component analysis (change of basis on the input data)
3. ML Model

In [3]:
def build_pipeline(model_name, model):
    """Return a scaler -> PCA -> model pipeline.

    The final step is registered under ``model_name``, which is the prefix
    used to address the model's hyperparameters in a parameter grid
    (e.g. ``"svm__C"``).
    """
    return Pipeline([("scaler", StandardScaler()), ("pca", PCA()), (model_name, model)])


# All three pipelines share the same preprocessing and differ only in the model.
svm_pipeline = build_pipeline("svm", SVR())
rfr_pipeline = build_pipeline("rfr", RandomForestRegressor())
xgb_pipeline = build_pipeline("xgb", XGBRegressor())


## Grid search

### SVM

In [4]:
# Restore the three randomized-search objects saved with joblib.
# NOTE(review): joblib.load unpickles its input, so only load files you trust.
svm_random_search, rfr_random_search, xgb_random_search = map(
    load,
    (
        "rand_search_results/svm_random_search",
        "rand_search_results/rfr_random_search",
        "rand_search_results/xgb_random_search",
    ),
)


In [5]:
# Best hyperparameters found by the randomized search for the SVM pipeline.
svm_random_search.best_params_

{'svm__shrinking': False,
 'svm__kernel': 'linear',
 'svm__gamma': 'auto',
 'svm__epsilon': 0.2,
 'svm__C': 17,
 'pca__n_components': 37}

In [6]:
# Small grid around the randomized-search optimum shown above
# (C=17, epsilon=0.2, n_components=37); the categorical options are all kept.
svm_params = dict(
    pca__n_components=[36, 37],
    svm__C=[16, 17, 18],
    svm__epsilon=[0.15, 0.2, 0.25],
    svm__gamma=["scale", "auto"],
    svm__kernel=["linear", "sigmoid", "rbf"],
    svm__shrinking=[True, False],
)

# Exhaustive 3-fold search scored by negated MAE; the best candidate is refit
# so that best_estimator_ is available afterwards.
svm_grid_search = GridSearchCV(
    svm_pipeline,
    svm_params,
    scoring="neg_mean_absolute_error",
    refit="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
    verbose=1,
)


### Random forest regressor

In [7]:
# Best hyperparameters found by the randomized search for the random forest pipeline.
rfr_random_search.best_params_

{'rfr__n_estimators': 1000,
 'rfr__min_samples_split': 7,
 'rfr__min_samples_leaf': 9,
 'rfr__max_features': 'auto',
 'rfr__max_depth': 10,
 'rfr__bootstrap': True,
 'pca__n_components': 30}

In [8]:
# Grid around the randomized-search optimum shown above
# (n_estimators=1000, max_depth=10, min_samples_split=7, min_samples_leaf=9,
# n_components=30).
rfr_params = {
    "pca__n_components": [29, 30, 31],
    "rfr__n_estimators": [975, 1000, 1025],
    # 1.0 replaces the former "auto" option: for regressors "auto" meant
    # "use all features", which is exactly what the fraction 1.0 requests.
    # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3, so keeping
    # it makes the search fail on current versions; 1.0 is accepted by both
    # old and new releases.
    "rfr__max_features": [1.0, "sqrt", "log2"],
    "rfr__max_depth": [5, 7, 9, 10],
    "rfr__min_samples_split": [6, 7, 8],
    "rfr__min_samples_leaf": [9, 10, 11],
    "rfr__bootstrap": [True, False],
}

# Exhaustive 3-fold search scored by negated MAE; the best candidate is refit
# so that best_estimator_ is available afterwards.
rfr_grid_search = GridSearchCV(
    estimator=rfr_pipeline,
    param_grid=rfr_params,
    scoring="neg_mean_absolute_error",
    refit="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
    verbose=1,
)


### XGB Regressor

In [9]:
# Best hyperparameters found by the randomized search for the XGBoost pipeline.
xgb_random_search.best_params_

{'xgb__subsample': 0.3,
 'xgb__objective': 'reg:squarederror',
 'xgb__n_jobs': 2,
 'xgb__n_estimators': 600,
 'xgb__min_child_weight': 9,
 'xgb__max_depth': 6,
 'xgb__learning_rate': 0.005,
 'xgb__colsample_bytree': 0.8,
 'pca__n_components': 25}

In [10]:
# Grid around the randomized-search optimum shown above; each numeric
# hyperparameter is bracketed by one value below and one above the best value.
xgb_params = dict(
    pca__n_components=[24, 25, 26],
    xgb__objective=["reg:squarederror", "reg:squaredlogerror"],
    xgb__learning_rate=[0.0025, 0.005, 0.0075],
    xgb__max_depth=[5, 6, 7],
    xgb__min_child_weight=[8, 9, 10],
    xgb__subsample=[0.25, 0.3, 0.35],
    xgb__colsample_bytree=[0.75, 0.8, 0.85],
    xgb__n_estimators=[575, 600, 625],
    xgb__n_jobs=[2],
)

# Exhaustive 3-fold search scored by negated MAE. The search runs with
# n_jobs=6 while every XGBoost model is itself configured with n_jobs=2.
xgb_grid_search = GridSearchCV(
    xgb_pipeline,
    xgb_params,
    scoring="neg_mean_absolute_error",
    refit="neg_mean_absolute_error",
    cv=3,
    n_jobs=6,
    verbose=1,
)


### Training

In [11]:
# Fit the three grid searches one after another on the training split.
# A marker line is printed before each fit so the verbose output of the
# searches can be told apart in the cell output.
print("svm started grid search")
svm_grid_search.fit(x_train, y_train)
print("random forest started grid search")
rfr_grid_search.fit(x_train, y_train)
print("xgb regressor started grid search")
# Last expression of the cell: its return value is what the notebook displays.
xgb_grid_search.fit(x_train, y_train)


svm started grid search
Fitting 3 folds for each of 216 candidates, totalling 648 fits
random forest started grid search
Fitting 3 folds for each of 1944 candidates, totalling 5832 fits
xgb regressor started grid search
Fitting 3 folds for each of 4374 candidates, totalling 13122 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', PCA()),
                                       ('xgb',
                                        XGBRegressor(base_score=None,
                                                     booster=None,
                                                     colsample_bylevel=None,
                                                     colsample_bynode=None,
                                                     colsample_bytree=None,
                                                     gamma=None, gpu_id=None,
                                                     importance_type='gain',
                                                     interaction_constraints=None,
                                                     learning_rate=None,
                                                     max_delta_step=None,
                                                     max_de

In [12]:
# Best cross-validated score of each search on the training split.
# The searches use scoring="neg_mean_absolute_error", so these values are
# negated MAE: the closer to zero, the better.
print(
    f"svm: {svm_grid_search.best_score_:.3f}",
    f"rfr: {rfr_grid_search.best_score_:.3f}",
    f"xgb: {xgb_grid_search.best_score_:.3f}",
    sep="\n",
)


svm: -18.484
rfr: -19.196
xgb: -18.649


## Results

In [13]:
# MAE and MSE on the test data, written to grid_search_results/results.txt.

# Predict once per model and reuse the arrays for every metric and for the
# ensemble, instead of calling predict() again for each reported line.
svm_test_pred = svm_grid_search.best_estimator_.predict(x_test)
rfr_test_pred = rfr_grid_search.best_estimator_.predict(x_test)
xgb_test_pred = xgb_grid_search.best_estimator_.predict(x_test)

# Ensemble prediction: unweighted mean of the three models' predictions.
ensemble_test_pred = (svm_test_pred + rfr_test_pred + xgb_test_pred) / 3

ensemble_mae = mean_absolute_error(y_true=y_test, y_pred=ensemble_test_pred)
ensemble_mse = mean_squared_error(y_true=y_test, y_pred=ensemble_test_pred)

# Build the complete report before touching the file, so a failing metric
# cannot leave a truncated results file behind.
report_lines = [
    "mae and mse on test data",
    # Mean absolute error
    f"mae svm: {mean_absolute_error(y_true=y_test, y_pred=svm_test_pred):.3f}",
    f"mae rfr: {mean_absolute_error(y_true=y_test, y_pred=rfr_test_pred):.3f}",
    f"mae xgb: {mean_absolute_error(y_true=y_test, y_pred=xgb_test_pred):.3f}",
    f"ensemble mae: {ensemble_mae:.3f}",
    # Mean squared error
    f"mse svm: {mean_squared_error(y_true=y_test, y_pred=svm_test_pred):.3f}",
    f"mse rfr: {mean_squared_error(y_true=y_test, y_pred=rfr_test_pred):.3f}",
    f"mse xgb: {mean_squared_error(y_true=y_test, y_pred=xgb_test_pred):.3f}",
    f"ensemble mse: {ensemble_mse:.3f}",
]

# Write through the file handle instead of temporarily replacing sys.stdout:
# with the manual swap, any exception raised inside the block left stdout
# pointing at a closed file for the rest of the kernel session.
with open("grid_search_results/results.txt", "w") as f:
    f.write("\n".join(report_lines) + "\n")


## Model saving

In [14]:
# Model saving
# For each model two artifacts are written with joblib: the complete fitted
# grid-search object and, separately, only its refit best estimator.
dump(svm_grid_search, "grid_search_results/svm_grid_search")
dump(svm_grid_search.best_estimator_, "grid_search_results/svm_best_model")
dump(rfr_grid_search, "grid_search_results/rfr_grid_search")
dump(rfr_grid_search.best_estimator_, "grid_search_results/rfr_best_model")
dump(xgb_grid_search, "grid_search_results/xgb_grid_search")
# Last expression of the cell: the value returned by dump is displayed as the cell output.
dump(xgb_grid_search.best_estimator_, "grid_search_results/xgb_best_model")


['grid_search_results/xgb_best_model']