# Randomized search notebook
author: Gonzalo Miranda Cabrera

objective: perform a randomized search on three regressors to find a rough idea of the parameters to use.

summary:
1. split data into train and test.
2. define the pipelines to be trained.
3. define de parameters for each pipeline and the parameters of the randomized search.
4. train the pipelines.
5. write results to a file.
6. save models.

In [25]:
import sys
import pandas as pd
from joblib import dump
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


## Split data into train and test

In [26]:
data = pd.read_csv("data.csv", index_col="order_id")

y = data.pop("total_minutes").to_numpy()
X = data.to_numpy()

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=0)
x_train.shape, x_test.shape, y_train.shape, y_test.shape


((4549, 37), (3034, 37), (4549,), (3034,))

## Define pipelines

### Pipeline composition
1. Standard scaler -> standardizes the data with mean and std
2. PCA -> Principal component analysis (change of basis on the input data)
3. ML Model

In [27]:
svm_pipeline = Pipeline([("scaler", StandardScaler()), ("pca", PCA()), ("svm", SVR())])
rfr_pipeline = Pipeline(
    [("scaler", StandardScaler()), ("pca", PCA()), ("rfr", RandomForestRegressor())]
)
xgb_pipeline = Pipeline(
    [("scaler", StandardScaler()), ("pca", PCA()), ("xgb", XGBRegressor())]
)


## Randomized search

A random search is performed to find the best hyperparameters.

The random search only lasts for 6000 iterations.

In [28]:
svm_params = {
    "pca__n_components": [5, 10, 15, 20, 25, 30, 35, 37],
    "svm__C": [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 30, 32, 35],
    "svm__epsilon": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    "svm__gamma": ["scale", "auto"],
    "svm__kernel": ["linear", "sigmoid", "rbf"],
    "svm__shrinking": [True, False],
}

rfr_params = {
    "pca__n_components": [5, 10, 15, 20, 25, 30, 35, 37],
    "rfr__n_estimators": [
        100,
        200,
        300,
        400,
        500,
        600,
        700,
        800,
        900,
        1000,
        1100,
        1200,
        1300,
        1400,
        1500,
        1600,
        1700,
        1800,
        1900,
    ],
    "rfr__max_features": ["auto", "sqrt", "log2"],
    "rfr__max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, None],
    "rfr__min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9],
    "rfr__min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    "rfr__bootstrap": [True, False],
}

xgb_params = {
    "pca__n_components": [5, 10, 15, 20, 25, 30, 35, 37],
    "xgb__objective": ["reg:squarederror", "reg:squaredlogerror"],
    "xgb__learning_rate": [0.01, 0.005, 0.001],
    "xgb__max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
    "xgb__min_child_weight": [3, 4, 5, 6, 7, 8, 9],
    "xgb__subsample": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    "xgb__colsample_bytree": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    "xgb__n_estimators": [
        100,
        200,
        300,
        400,
        500,
        600,
        700,
        800,
        900,
        1000,
        1100,
        1200,
        1300,
        1400,
        1500,
        1600,
        1700,
        1800,
        1900,
    ],
    "xgb__n_jobs": [2],
}


svm_random_search = RandomizedSearchCV(
    estimator=svm_pipeline,
    param_distributions=svm_params,
    scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error", "r2"],
    refit="neg_root_mean_squared_error",
    cv=3,
    n_iter=6000,
    random_state=0,
    n_jobs=-1,
    verbose=1,
)


rfr_random_search = RandomizedSearchCV(
    estimator=rfr_pipeline,
    param_distributions=rfr_params,
    scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error", "r2"],
    refit="neg_root_mean_squared_error",
    n_iter=6000,
    cv=3,
    random_state=0,
    n_jobs=-1,
    verbose=1,
)


xgb_random_search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=xgb_params,
    scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error", "r2"],
    refit="neg_root_mean_squared_error",
    cv=3,
    n_iter=6000,
    random_state=0,
    n_jobs=6,
    verbose=1,
)

print("svm started random search")
svm_random_search.fit(x_train, y_train)
print("random forest started random search")
rfr_random_search.fit(x_train, y_train)
print("xgb regressor started random search")
xgb_random_search.fit(x_train, y_train)


svm started random search
Fitting 3 folds for each of 10 candidates, totalling 30 fits
random forest started random search
Fitting 3 folds for each of 10 candidates, totalling 30 fits
xgb regressor started random search
Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('pca', PCA()),
                                             ('xgb',
                                              XGBRegressor(base_score=None,
                                                           booster=None,
                                                           colsample_bylevel=None,
                                                           colsample_bynode=None,
                                                           colsample_bytree=None,
                                                           gamma=None,
                                                           gpu_id=None,
                                                           importance_type='gain',
                                                           interaction_constraints=None,
                                                           learning_rate=No

In [29]:
# Best scores on training data
print(f"svm rmse: {svm_random_search.best_score_}")
print(f"rfr rmse: {rfr_random_search.best_score_}")
print(f"xgb rmse: {xgb_random_search.best_score_}")


svm rmse: -24.826498717941266
rfr rmse: -24.75567138070458
xgb rmse: -24.71352189051753


## Results

In [30]:
# mae and mse on test data

original_stdout = sys.stdout  # Save a reference to the original standard output

models = [
    ("svm", svm_random_search),
    ("rfr", rfr_random_search),
    ("xgb", xgb_random_search),
]


def ensemble_predict(pred):
    return (
        sum(
            [
                svm_random_search.best_estimator_.predict(pred),
                rfr_random_search.best_estimator_.predict(pred),
                xgb_random_search.best_estimator_.predict(pred),
            ]
        )
        / 3
    )


with open("rand_search_results/results.txt", "w") as f:
    sys.stdout = f  # Change the standard output to the file we created.

    # Mean absolute error
    for model in models:
        print(
            f"{model[0]} mae",
            f"{mean_absolute_error(y_true=y_test, y_pred=model[1].best_estimator_.predict(x_test))}",
        )

    print(
        f"ensemble mae: {mean_absolute_error(y_true=y_test, y_pred=ensemble_predict(x_test))}"
    )

    # Root mean squared error
    for model in models:
        print(
            f"{model[0]} rmse",
            f"{mean_squared_error(y_true=y_test, y_pred=model[1].best_estimator_.predict(x_test),squared=False)}",
        )
    print(
        f"ensemble rmse: {mean_squared_error(y_true=y_test, y_pred=ensemble_predict(x_test), squared=False)}"
    )

    # r2 score
    for model in models:
        print(
            f"{model[0]} r2",
            f"{r2_score(y_true=y_test, y_pred=model[1].best_estimator_.predict(x_test))}",
        )
    print(
        f"ensemble r2: {r2_score(y_true=y_test, y_pred=ensemble_predict(x_test))}"
    )

    sys.stdout = original_stdout  # Reset the standard output to its original value


## Model saving

In [31]:
# Model saving
dump(svm_random_search, "rand_search_results/svm_random_search")
dump(svm_random_search.best_estimator_, "rand_search_results/svm_best_model")
dump(rfr_random_search, "rand_search_results/rfr_random_search")
dump(rfr_random_search.best_estimator_, "rand_search_results/rfr_best_model")
dump(xgb_random_search, "rand_search_results/xgb_random_search")
dump(xgb_random_search.best_estimator_, "rand_search_results/xgb_best_model")
