In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from pprint import pprint
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV

%matplotlib inline

In [None]:
# Load the Otto training CSV and discard its "id" column in a single
# expression, so re-running this cell always rebuilds df from the file.
df = pd.read_csv(
    "../../data/otto-group-product-classification-challenge/train.csv"
).drop(columns=["id"])

In [None]:
# Every column except the last is a feature; the last column is the label.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [None]:
# Fit the encoder on the class labels, then replace y with the encoded labels.
# The fitted encoder stays available as `label_encoder`.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

In [None]:
# use grid search to find the best subsample ratio
# (n_estimators and max_depth are held fixed; only `subsample` is tuned)
clf = XGBClassifier(n_estimators=200, max_depth=6)
subsample_list = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
param_grid = {
    # fixed: the original referenced the misspelled name `subsmaple_list`,
    # which raised NameError before the search could start
    "subsample": subsample_list,
}
# 5-fold stratified CV with a fixed seed for the fold assignment
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=89)
# scoring is *negative* log loss, so larger (closer to zero) is better
cv = GridSearchCV(clf, param_grid, cv=kfold, scoring="neg_log_loss", n_jobs=-1)
t0 = time()
cv.fit(X, y)
print("[INFO] The time taken to search is: %f" % (time() - t0))

In [None]:
# Report the winning configuration first, then the score of every candidate.
best_score = cv.best_score_
print(
    "[INFO] The best score is: %f, where the best parameter combination is:"
    % (best_score)
)
pprint(cv.best_params_)
print()

# Pull the per-candidate summaries out of the search results; `means` and
# `stds` are also read by the plotting cell.
results = cv.cv_results_
means, stds, params = (
    results["mean_test_score"],
    results["std_test_score"],
    results["params"],
)
for mean, std, param in zip(means, stds, params):
    line = "%f (+/- %f) with %s" % (mean, std, param)
    print(line)

In [None]:
# visualize the process: mean CV score per subsample ratio, with the
# across-fold standard deviation as error bars
fig, ax = plt.subplots(figsize=(8, 6))
# fixed: the original plotted against `learning_rate_list`, a name that is
# never defined in this notebook (NameError); the searched values live in
# `subsample_list`, whose order matches the single-parameter grid results
ax.errorbar(subsample_list, means, yerr=stds)
ax.set_xlabel("subsample")
# the scorer is "neg_log_loss", so the plotted values are negated log loss
ax.set_ylabel("Negative Log Loss")
ax.set_title("XGBoost subsample vs Negative Log Loss")
ax.grid()
plt.show()