In [2]:
# Model No. 1 XGBoost model for a9a
from xgboost import XGBClassifier
from joblib import Memory
from sklearn.datasets import load_svmlight_file
from sklearn import model_selection
from sklearn.model_selection import train_test_split,KFold
import numpy as np
import matplotlib.pyplot as plt


from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()


mem = Memory("./mycache")

@mem.cache
def get_data():
    data = load_svmlight_file('a9a.txt')
    return data[0], data[1]

X, y = get_data()

X_train, X_test, y_train, y_test = train_test_split(X,y)

le = LabelEncoder()
y_train = le.fit_transform(y_train)

kfold = model_selection.KFold(n_splits=5)
model = XGBClassifier()
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold)
print(results)
print("Accuracy:", results.mean()*100)

________________________________________________________________________________
[Memory] Calling __main__-C%3A-Users-ybxYB-AppData-Local-Temp-ipykernel-2389174336.get_data...
get_data()
_________________________________________________________get_data - 0.1s, 0.0min
[0.83660934 0.84930385 0.84848485 0.8495086  0.84725635]
Accuracy: 84.62325962325963


In [3]:
import xgboost as xgb

params = {
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective':'reg:squarederror',
}

gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    cv_results = xgb.cv(
        params,
        dtrain,
        metrics={'mae'},
    )
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=9, min_child_weight=5
	MAE 0.22150157693369535 for 9 rounds
CV with max_depth=9, min_child_weight=6
	MAE 0.2208557691286073 for 9 rounds
CV with max_depth=9, min_child_weight=7
	MAE 0.22094374956897012 for 9 rounds
CV with max_depth=10, min_child_weight=5
	MAE 0.22027006083297565 for 9 rounds
CV with max_depth=10, min_child_weight=6
	MAE 0.220619138947599 for 9 rounds
CV with max_depth=10, min_child_weight=7
	MAE 0.22018703737121303 for 9 rounds
CV with max_depth=11, min_child_weight=5
	MAE 0.22013897871428223 for 9 rounds
CV with max_depth=11, min_child_weight=6
	MAE 0.21952606973274613 for 9 rounds
CV with max_depth=11, min_child_weight=7
	MAE 0.21955544958041515 for 9 rounds
Best params: 11, 6, MAE: 0.21952606973274613


In [4]:
kfold = model_selection.KFold(n_splits=5)
model = XGBClassifier(max_depth=11, min_child_weight=5)
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold)
print(results)
print("Accuracy:", results.mean()*100)

[0.83353808 0.84725635 0.83947584 0.84234234 0.84295659]
Accuracy: 84.11138411138411


In [5]:
kfold = model_selection.KFold(n_splits=5)
model = XGBClassifier(max_depth=7, min_child_weight=5)
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold)
print(results)
print("Accuracy:", results.mean()*100)

[0.83435708 0.8495086  0.84254709 0.8454136  0.84725635]
Accuracy: 84.38165438165439


In [6]:
kfold = model_selection.KFold(n_splits=5)
model = XGBClassifier(tree_method = "exact")
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold)
print(model)
print(results)
print("Accuracy:", results.mean()*100)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
[0.83660934 0.84930385 0.84848485 0.8495086  0.84725635]
Accuracy: 84.62325962325963


In [7]:
kfold = model_selection.KFold(n_splits=5)
model = XGBClassifier(tree_method = "approx")
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold)
print(model)
print(results)
print("Accuracy:", results.mean()*100)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
[0.83660934 0.84930385 0.84848485 0.8495086  0.84725635]
Accuracy: 84.62325962325963


In [8]:
kfold = model_selection.KFold(n_splits=5)
model = XGBClassifier(tree_method = "hist")
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold)
print(model)
print(results)
print("Accuracy:", results.mean()*100)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
[0.83660934 0.84930385 0.84848485 0.8495086  0.84725635]
Accuracy: 84.62325962325963


In [9]:
kfold = model_selection.KFold(n_splits=5)
model = XGBClassifier(tree_method = "gpu_hist")
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold)
print(model)
print(results)
print("Accuracy:", results.mean()*100)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
[0.83660934 0.84930385 0.84848485 0.8495086  0.84725635]
Accuracy: 84.62325962325963


In [10]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

min_mae = float("Inf")
best_params = None

for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    cv_results = xgb.cv(
        params,
        dtrain,
        metrics={'mae'}
    )
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
        
    print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0
	MAE 0.21955544958041515 for 9 rounds
Best params: 1.0, 1.0, MAE: 0.21955544958041515
CV with subsample=1.0, colsample=0.9
	MAE 0.22035508879788282 for 9 rounds
Best params: 1.0, 1.0, MAE: 0.21955544958041515
CV with subsample=1.0, colsample=0.8
	MAE 0.220875164812945 for 9 rounds
Best params: 1.0, 1.0, MAE: 0.21955544958041515
CV with subsample=1.0, colsample=0.7
	MAE 0.22172634295982774 for 9 rounds
Best params: 1.0, 1.0, MAE: 0.21955544958041515
CV with subsample=0.9, colsample=1.0
	MAE 0.2197924116190039 for 9 rounds
Best params: 1.0, 1.0, MAE: 0.21955544958041515
CV with subsample=0.9, colsample=0.9
	MAE 0.22037085983194762 for 9 rounds
Best params: 1.0, 1.0, MAE: 0.21955544958041515
CV with subsample=0.9, colsample=0.8
	MAE 0.22102017096936902 for 9 rounds
Best params: 1.0, 1.0, MAE: 0.21955544958041515
CV with subsample=0.9, colsample=0.7
	MAE 0.22254947214156387 for 9 rounds
Best params: 1.0, 1.0, MAE: 0.21955544958041515
CV with subsample=0

In [11]:
kfold = model_selection.KFold(n_splits=5)
model = XGBClassifier(subsample=.8,colsample_bytree=1)
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold)
print(model)
print(results)
print("Accuracy:", results.mean()*100)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
[0.83415233 0.8503276  0.84438984 0.84438984 0.8454136 ]
Accuracy: 84.37346437346436


In [12]:
%time

min_mae = float("Inf")
best_params = None

for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    params['eta'] = eta
    %time 
    cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=999,
            seed=42,
            nfold=5,
            metrics=['mae'],
            early_stopping_rounds=10
    )
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

CPU times: total: 0 ns
Wall time: 0 ns
CV with eta=0.3
CPU times: total: 0 ns
Wall time: 0 ns
	MAE 0.2165687118544144 for 15 rounds

CV with eta=0.2
CPU times: total: 0 ns
Wall time: 0 ns
	MAE 0.21477705422362708 for 27 rounds

CV with eta=0.1
CPU times: total: 0 ns
Wall time: 0 ns
	MAE 0.2131775849792296 for 65 rounds

CV with eta=0.05
CPU times: total: 0 ns
Wall time: 0 ns
	MAE 0.21195645594433182 for 135 rounds

CV with eta=0.01
CPU times: total: 0 ns
Wall time: 0 ns
	MAE 0.2111709065043037 for 666 rounds

CV with eta=0.005
CPU times: total: 0 ns
Wall time: 0 ns
	MAE 0.21223862636901064 for 998 rounds

Best params: 0.01, MAE: 0.2111709065043037


In [13]:
kfold = model_selection.KFold(n_splits=5)
model = XGBClassifier(max_depth=11, min_child_weight=6, subsample=.8,colsample_bytree=1, eta = 0.01)
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold)
print(model)
print(results)
print("Accuracy:", results.mean()*100)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False, eta=0.01,
              eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=11, max_leaves=None,
              min_child_weight=6, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, ...)
[0.83579034 0.84377559 0.8450041  0.83804259 0.84275184]
Accuracy: 84.10728910728909


In [14]:
kfold = model_selection.KFold(n_splits=5)
model = XGBClassifier(eta = 0.01)
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold)
print(model)
print(results)
print("Accuracy:", results.mean()*100)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eta=0.01, eval_metric=None,
              feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, ...)
[0.82882883 0.84418509 0.84009009 0.83558559 0.83783784]
Accuracy: 83.73054873054872


In [15]:
model = XGBClassifier(max_depth = 7, min_child_weight=5)
model.fit(X_train, y_train)
accuracy = model.score(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 14.65%


In [18]:
@mem.cache
def get_test():
    data = load_svmlight_file('a9a.t')
    return data[0], data[1]

X_test, y_test = get_test()

le = LabelEncoder()
y = le.fit_transform(y)

clf = XGBClassifier(max_depth = 7, min_child_weight=5)
clf.fit(X, y)

accuracy = clf.score(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

________________________________________________________________________________
[Memory] Calling __main__-C%3A-Users-ybxYB-AppData-Local-Temp-ipykernel-1943477679.get_test...
get_test()
_________________________________________________________get_test - 0.0s, 0.0min
Accuracy: 14.44%
