In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, r2_score
import matplotlib.pyplot as plt
from datetime import datetime
import joblib
import itertools

.NPY LOAD

In [None]:
# Load the pre-built sample matrix from a hard-coded absolute Windows path.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only use it on files from a trusted source.
data = np.load(
    file='D:\\data\\GBPJPY_2020_Smart_Diff_RL_V1.npy',
    allow_pickle=True,
)

In [None]:
# Shuffle the rows in place, then peel the last column off as the target.
np.random.shuffle(data)
X_data, y_data = data[:, :-1], data[:, -1]

# Hold out 10% of the rows for testing (no random_state, so the split differs per run).
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.1,
)

.CSV LOAD

In [None]:
# Read the stacked sample table from a path relative to the working directory.
# The path is written with backslash separators.
data = pd.read_csv(
    filepath_or_buffer='data\\WORK_Value_sample20_GBPJPY_2015_2019_Smart_Diff_Stack_V1.csv',
)

In [None]:
# Column names used to label the feature-importance plot.
# The last column of the table is the regression target (it is split off as
# y_data further down), so it is excluded here; the list then lines up
# one-to-one with the columns of X_data that the model is trained on.
features = list(data.columns[:-1])

In [None]:
# NOTE(review): on the CSV path X_data is only (re)built from np_data two cells
# further down; at this point it is either the array left over from the .NPY
# section or undefined if that section was skipped -- confirm what this cell is
# meant to inspect.
X_data.shape

In [None]:
# Materialise the table as a NumPy array and shuffle its rows in place.
# copy=True makes the array own its memory: without it, to_numpy() may hand
# back a view of the DataFrame's buffer, in which case the in-place shuffle
# would also scramble `data` itself (or fail if the view is read-only).
np_data = data.to_numpy(copy=True)
np.random.shuffle(np_data)

In [None]:
# Every column but the last is a feature; the last column is the target.
X_data, y_data = np_data[:, :-1], np_data[:, -1]

In [None]:
# Hold out 20% of the rows for testing (no random_state, so the split differs per run).
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
)

FOR JUST ONE SINGLE FIT WITH KNOWN PARAMETERS

In [None]:
# Single model with hand-picked hyper-parameters.
# Despite the `_clf` suffix this is a regressor (XGBRegressor).
xgb_clf = xgb.XGBRegressor(
    booster='gbtree',
    learning_rate=0.1,
    n_estimators=4500,
    max_depth=10,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    random_state=42,
    # GPU training / prediction settings
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
    sampling_method='gradient_based',
)

In [None]:
# Train with early stopping: RMSE is monitored on (X_test, y_test) and training
# stops after 20 rounds without improvement.
# NOTE(review): the same X_test / y_test split is scored in the next cell, so
# that score is not from data unseen during training -- confirm this is intended.
xgb_clf.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="rmse",
    early_stopping_rounds=20,
)

In [None]:
# Score the fitted model on the held-out split using the estimator's own metric.
xgb_clf.score(X=X_test, y=y_test)

FAST PARAMETER SEARCH (without cross validation)

In [None]:
# Search space for the fast (single split) sweep.
# Key order matters: param_fit reads each combination by position, and the
# combinations are produced from param.values() in this order.
param = dict(
    learning_rate=[0.1],
    n_estimators=range(100, 1100, 100),
    max_depth=range(10, 35, 5),
    gamma=[0, 0.1],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
    reg_alpha=[0.1],
)

In [None]:
# Hand-rolled parameter sweep: one fit per combination on a single
# train/test split, no cross-validation.
import timeit
def param_fit(params):
    """Fit one XGBRegressor per parameter combination and print its test score.

    Args:
        params: iterable of indexable combinations. Positions 0-6 of each
            combination are read as learning_rate, n_estimators, max_depth,
            gamma, subsample, colsample_bytree and reg_alpha, in that order.

    Reads X_train, y_train, X_test and y_test from the notebook's global
    namespace. Returns nothing; for every combination one line is printed
    with the score, the raw combination and the elapsed time.
    """
    # Position inside each combination -> XGBRegressor keyword it feeds.
    slot_names = (
        'learning_rate',
        'n_estimators',
        'max_depth',
        'gamma',
        'subsample',
        'colsample_bytree',
        'reg_alpha',
    )
    for combo in params:
        t0 = timeit.default_timer()
        tuned = {name: combo[pos] for pos, name in enumerate(slot_names)}
        model = xgb.XGBRegressor(
            **tuned,
            random_state=42,
            tree_method='gpu_hist',
            gpu_id=0,
            predictor='gpu_predictor',
            sampling_method='gradient_based',
        )
        model.fit(X_train, y_train)
        test_score = model.score(X_test, y_test)
        # The timing covers construction, fitting and scoring.
        elapsed = timeit.default_timer() - t0
        print(f'SCORE: {test_score}  PARAMS: {combo}  TIME: {elapsed}')

In [None]:
# Cartesian product of every value list in `param`. Each tuple follows the
# dict's key order, which is the positional layout param_fit indexes into.
params = [*itertools.product(*param.values())]
param_fit(params)

SLOW (more accurate) PARAMETER SEARCH USING GRIDSEARCH WITH CROSS VALIDATION

In [None]:
# Search space handed to GridSearchCV (same grid as the fast sweep).
param = dict([
    ('learning_rate', [0.1]),
    ('n_estimators', range(100, 1100, 100)),
    ('max_depth', range(10, 35, 5)),
    ('gamma', [0, 0.1]),
    ('subsample', [0.8, 0.9]),
    ('colsample_bytree', [0.8, 0.9]),
    ('reg_alpha', [0.1]),
])

In [None]:
# Exhaustive search over `param` with 5-fold cross-validation. No `scoring`
# argument is given, so candidates are ranked with the estimator's own
# score method.
grid_search = GridSearchCV(xgb_clf, param, verbose=2, cv=5)

# The search has to be fitted before `best_estimator_` exists; the following
# cells read that attribute and raise AttributeError on an unfitted search.
# This is the slow step: every combination in the grid is trained once per fold.
grid_search.fit(X_train, y_train)

INSPECT FIT BASED ON ABOVE SEARCH METHODS

In [None]:
# Evaluate the grid-search winner on the held-out split.
print(r2_score(y_test, grid_search.best_estimator_.predict(X_test)))
# No notebook-level callable named `score` is defined or imported in the
# visible cells, so the original `score(y_test, ...)` call raised NameError.
# Use the estimator's own score method on the same split instead.
print(grid_search.best_estimator_.score(X_test, y_test))

In [None]:
# Keep a handle on the best estimator found by the grid search and display it.
# best_estimator_ only exists once grid_search has been fitted.
gs_best = grid_search.best_estimator_
gs_best

SAVE BEST FIT MODEL USING JOBLIB

In [None]:
# Persist the model to disk with joblib. joblib is already imported in the
# first cell; the re-import here is redundant but harmless.
# NOTE(review): this writes xgb_clf (the single-fit model), not gs_best from
# the grid search -- confirm which model the heading's "best fit" refers to.
import joblib
joblib.dump(xgb_clf, "models\\ID1506_1050_n_est4500_GJ_Smart_Diff_74pct_STACK6_2000-20.joblib_cv.dat")

INSPECT FEATURE IMPORTANCE USING PLOT

In [None]:
# Attach the column names to the trained booster so the importance plot shows
# names instead of generic feature ids.
# NOTE(review): the number of names in `features` must equal the number of
# columns the model was trained on (X_data excludes the last column of the
# source table) -- confirm the two line up.
xgb_clf.get_booster().feature_names = features

In [None]:
# Tall canvas for the importance chart; at most 90 features are drawn.
fig, ax = plt.subplots(figsize=(20, 30))
xgb.plot_importance(
    xgb_clf.get_booster(),
    ax=ax,
    height=0.4,
    title="Feature importance",
    max_num_features=90,
)

In [None]:
# Bar chart of the model's importance value per feature, indexed by feature position.
plt.bar(
    x=range(len(xgb_clf.feature_importances_)),
    height=xgb_clf.feature_importances_,
)
plt.show()