# Grid search notebook
author: Gonzalo Miranda Cabrera

objective: perform a grid search around the hyperparameters found in the randomized search to find an even better combination.

summary:
1. read the data and slpit into test and train.
2. define the pipilines to be trained.
3. prepare the grid search with the hyperparameters found in the randomized search.
4. train the pipelines.
5. show best scores in training data.
6. save results to file in grid_search_results folder.
7. save models.

In [1]:
import sys
import pandas as pd
from sklearn.svm import SVR
from joblib import load, dump
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


## Split into train and test sets

In [2]:
data = pd.read_csv("data.csv", index_col="order_id")

y = data.pop("total_minutes").to_numpy()
X = data.to_numpy()

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
x_train.shape, x_test.shape, y_train.shape, y_test.shape


((5308, 37), (2275, 37), (5308,), (2275,))

## Define pipelines

### Pipeline composition
1. Standard scaler -> standardizes the data with mean and std
2. PCA -> Principal component analysis (change of basis on the input data)
3. ML Model

In [3]:
svm_pipeline = Pipeline([("scaler", StandardScaler()), ("pca", PCA()), ("svm", SVR())])
rfr_pipeline = Pipeline(
    [("scaler", StandardScaler()), ("pca", PCA()), ("rfr", RandomForestRegressor())]
)
xgb_pipeline = Pipeline(
    [("scaler", StandardScaler()), ("pca", PCA()), ("xgb", XGBRegressor())]
)


In [4]:
svm_random_search = load("rand_search_results/svm_random_search")
rfr_random_search = load("rand_search_results/rfr_random_search")
xgb_random_search = load("rand_search_results/xgb_random_search")


## Grid search

### SVM

In [5]:
svm_random_search.best_params_

{'svm__shrinking': False,
 'svm__kernel': 'rbf',
 'svm__gamma': 'scale',
 'svm__epsilon': 0.9,
 'svm__C': 13,
 'pca__n_components': 20}

In [6]:
svm_params = {
    "pca__n_components": [19, 20, 21],
    "svm__C": [12, 13, 14],
    "svm__epsilon": [0.85, 0.9, 0.95],
    "svm__gamma": ["scale", "auto"],
    "svm__kernel": ["linear", "sigmoid", "rbf"],
    "svm__shrinking": [True, False],
}

svm_grid_search = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=svm_params,
    scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error", "r2"],
    refit="neg_root_mean_squared_error",
    cv=3,
    n_jobs=-1,
    verbose=1,
)


### Random forest regressor

In [7]:
rfr_random_search.best_params_

{'rfr__n_estimators': 500,
 'rfr__min_samples_split': 4,
 'rfr__min_samples_leaf': 9,
 'rfr__max_features': 'auto',
 'rfr__max_depth': 80,
 'rfr__bootstrap': True,
 'pca__n_components': 30}

In [8]:
rfr_params = {
    "pca__n_components": [29, 30, 31],
    "rfr__n_estimators": [475, 500, 525],
    "rfr__max_features": ["auto", "sqrt", "log2"],
    "rfr__max_depth": [75, 80, 85],
    "rfr__min_samples_split": [3, 4, 5],
    "rfr__min_samples_leaf": [8, 9, 10],
    "rfr__bootstrap": [True, False],
}
rfr_grid_search = GridSearchCV(
    estimator=rfr_pipeline,
    param_grid=rfr_params,
    scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error", "r2"],
    refit="neg_root_mean_squared_error",
    cv=3,
    n_jobs=-1,
    verbose=1,
)


### XGB Regressor

In [9]:
xgb_random_search.best_params_

{'xgb__subsample': 0.3,
 'xgb__objective': 'reg:squarederror',
 'xgb__n_jobs': 2,
 'xgb__n_estimators': 1000,
 'xgb__min_child_weight': 7,
 'xgb__max_depth': 7,
 'xgb__learning_rate': 0.005,
 'xgb__colsample_bytree': 0.8,
 'pca__n_components': 20}

In [10]:
xgb_params = {
    "pca__n_components": [19, 20, 21],
    "xgb__objective": ["reg:squarederror", "reg:squaredlogerror"],
    "xgb__learning_rate": [0.0025, 0.005, 0.0075],
    "xgb__max_depth": [6, 7, 8],
    "xgb__min_child_weight": [7, 8, 9],
    "xgb__subsample": [0.25, 0.3, 0.35],
    "xgb__colsample_bytree": [0.75, 0.8, 0.85],
    "xgb__n_estimators": [975, 1000, 1025],
    "xgb__n_jobs": [2],
}
xgb_grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_params,
    scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error", "r2"],
    refit="neg_root_mean_squared_error",
    cv=3,
    n_jobs=6,
    verbose=1,
)


### Training

In [11]:
print("svm started grid search")
svm_grid_search.fit(x_train, y_train)


svm started grid search
Fitting 3 folds for each of 324 candidates, totalling 972 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', PCA()), ('svm', SVR())]),
             n_jobs=-1,
             param_grid={'pca__n_components': [19, 20, 21],
                         'svm__C': [12, 13, 14],
                         'svm__epsilon': [0.85, 0.9, 0.95],
                         'svm__gamma': ['scale', 'auto'],
                         'svm__kernel': ['linear', 'sigmoid', 'rbf'],
                         'svm__shrinking': [True, False]},
             refit='neg_root_mean_squared_error',
             scoring=['neg_mean_absolute_error', 'neg_root_mean_squared_error',
                      'r2'],
             verbose=1)

In [12]:
print("random forest started grid search")
rfr_grid_search.fit(x_train, y_train)


random forest started grid search
Fitting 3 folds for each of 1458 candidates, totalling 4374 fits


KeyboardInterrupt: 

In [None]:
print("xgb regressor started grid search")
xgb_grid_search.fit(x_train, y_train)


In [None]:
# Best scores on training data
print(f"svm: {svm_grid_search.best_score_}")
print(f"rfr: {rfr_grid_search.best_score_}")
print(f"xgb: {xgb_grid_search.best_score_}")


## Results

In [None]:
# mae and mse on test data

original_stdout = sys.stdout  # Save a reference to the original standard output

models = [
    ("svm", svm_grid_search),
    ("rfr", rfr_grid_search),
    ("xgb", xgb_grid_search),
]


def ensemble_predict(pred):
    return (
        sum(
            [
                svm_grid_search.best_estimator_.predict(pred),
                rfr_grid_search.best_estimator_.predict(pred),
                xgb_grid_search.best_estimator_.predict(pred),
            ]
        )
        / 3
    )


with open("grid_search_results/results.txt", "w") as f:
    sys.stdout = f  # Change the standard output to the file we created.

    # Mean absolute error
    for model in models:
        print(
            f"{model[0]} mae",
            f"{mean_absolute_error(y_true=y_test, y_pred=model[1].best_estimator_.predict(x_test))}",
        )

    print(
        f"ensemble mae: {mean_absolute_error(y_true=y_test, y_pred=ensemble_predict(x_test))}"
    )

    # Root mean squared error
    for model in models:
        print(
            f"{model[0]} rmse",
            f"{mean_squared_error(y_true=y_test, y_pred=model[1].best_estimator_.predict(x_test),squared=False)}",
        )
    print(
        f"ensemble rmse: {mean_squared_error(y_true=y_test, y_pred=ensemble_predict(x_test), squared=False)}"
    )

    # r2 score
    for model in models:
        print(
            f"{model[0]} r2",
            f"{r2_score(y_true=y_test, y_pred=model[1].best_estimator_.predict(x_test))}",
        )
    print(
        f"ensemble r2: {r2_score(y_true=y_test, y_pred=ensemble_predict(x_test))}"
    )

    sys.stdout = original_stdout  # Reset the standard output to its original value


## Model saving

In [None]:
# Model saving
dump(svm_grid_search, "grid_search_results/svm_grid_search")
dump(svm_grid_search.best_estimator_, "grid_search_results/svm_best_model")
dump(rfr_grid_search, "grid_search_results/rfr_grid_search")
dump(rfr_grid_search.best_estimator_, "grid_search_results/rfr_best_model")
dump(xgb_grid_search, "grid_search_results/xgb_grid_search")
dump(xgb_grid_search.best_estimator_, "grid_search_results/xgb_best_model")
