In [1]:
# List the packages installed in the conda environment found on PATH
# (useful as a record of the versions this notebook was run with).
!conda list

# packages in environment at C:\Users\manue\anaconda3:
#
# Name                    Version                   Build  Channel
_anaconda_depends         2022.10                  py39_2  
_ipyw_jlab_nb_ext_conf    0.1.0            py39haa95532_0  
_tflow_select             2.3.0                       mkl  
abseil-cpp                20211102.0           hd77b12b_0  
absl-py                   1.4.0                    pypi_0    pypi
aiohttp                   3.8.3            py39h2bbff1b_0  
aiohttp-cors              0.7.0                    pypi_0    pypi
aiosignal                 1.2.0              pyhd3eb1b0_0  
alabaster                 0.7.12             pyhd3eb1b0_0  
anaconda                  custom                   py39_1  
anaconda-client           1.11.0           py39haa95532_0  
anaconda-navigator        2.3.2            py39haa95532_0  
anaconda-project          0.11.1           py39haa95532_0  
ansicon                   1.89.0                   pypi_0    pypi
anyio             

In [2]:
# Load Data
import numpy as np
import pandas as pd
import ray
from sklearn.model_selection import train_test_split

#ray.shutdown()
#ray.init(object_store_memory=4000000000) # set object store memory to 4GB

# SECURITY: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
data = pd.read_pickle('C:\\Users\\manue\\switchdrive\\Mutual Funds Project\\data\\pickle_files\\full_dataset.pkl')

# Work on a reproducible random subset of the rows.
sample = data.sample(5001, random_state=123)

# transform 'Rating' from Categorical to float (unparseable values become NaN)
sample["Rating"] = pd.to_numeric(sample["Rating"], errors='coerce')

# drop all rows with inf/-inf values!
# Both checks are row-wise, so they can be combined into a single mask.
no_pos_inf = (sample != np.inf).all(axis=1)
no_neg_inf = (sample != -np.inf).all(axis=1)
# .copy() detaches the filtered frame so the column assignments below write to
# an independent DataFrame instead of triggering SettingWithCopyWarning.
sample = sample.loc[no_pos_inf & no_neg_inf].copy()

# get rid of whitespace to draw tree later
Eq_Stylebox = sample['Eq_Stylebox_Long'].astype('object').replace(' ', '_', regex=True)
sample['Eq_Stylebox_Long'] = Eq_Stylebox.astype('category')

# shifting target variable to predict next month
# NOTE(review): `sample` is a random draw of rows, so shift(-1) pairs each row
# with whichever row happens to follow it in the sample, which is not
# necessarily the same fund's next month. If a true next-month target is
# intended, the shift probably has to be done per fund on date-sorted data
# before sampling -- confirm against the dataset's id/date columns.
sample['returns'] = sample['returns'].shift(-1)
# The last row has no following row (its shifted target is NaN): drop it by
# position so the result does not depend on index labels being unique.
sample = sample.iloc[:-1].copy()


X = sample.drop(columns='returns')
y = sample['returns']

# create dummies in case of categorical data
dummy_needed = [#'Rating',
                'Financial_Health_Grade_Long',
                'Growth_Grade_Long',
                'Profitability_Grade_Long',
                'Eq_Stylebox_Long']

X = pd.get_dummies(X, columns=dummy_needed)

# Split (fixed seed so the train/test partition is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=26)
#X_train = ray.put(X_train)
#y_train = ray.put(y_train)

In [None]:
%%time
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials
from ray import tune
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.search.hyperopt import HyperOptSearch
from sklearn.model_selection import cross_val_score, KFold
import numpy as np


# Define the search space for hyperparameters
search_space = {
    # Sampled on a log scale between 1e-4 and 0.9.
    "learning_rate": hp.loguniform("learning_rate", np.log(0.0001), np.log(0.9)),
    # max_depth and n_estimators are ordered integers. hp.choice would make the
    # TPE sampler treat every value as an unrelated category (9,900 categories
    # for n_estimators), so draw them from an integer range instead. The bounds
    # are inclusive and cover the same values as range(1, 16) / range(100, 10000).
    "max_depth": hp.uniformint("max_depth", 1, 15),
    "n_estimators": hp.uniformint("n_estimators", 100, 9999),
    # Row / column subsampling fractions.
    "subsample": hp.uniform("subsample", 0.5, 1),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
    "colsample_bylevel": hp.uniform("colsample_bylevel", 0.5, 1),
    # L1 / L2 regularisation strengths, sampled on a log scale.
    "reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(100)),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.001), np.log(100)),
}


# Shuffled 5-fold splitter with a fixed seed, shared by every trial.
kfolds = KFold(n_splits=5, shuffle=True, random_state=26)

# Define the objective function to optimize
def objective(config):
    """Cross-validate one XGBoost configuration and report its RMSE to Ray Tune.

    Parameters
    ----------
    config : dict
        Hyperparameters for this trial; passed to ``xgb.XGBRegressor`` as
        keyword arguments.

    Reports
    -------
    rmse : float
        Square root of the mean squared error averaged over the folds of
        ``kfolds``, computed on the module-level ``X_train`` / ``y_train``.
    """
    # Each trial is allotted 2 CPUs (see 'resources_per_trial' in the tune.run
    # settings) and several trials run concurrently. n_jobs=-1 asks XGBoost for
    # every available thread, so keep the thread count equal to the allotment.
    model = xgb.XGBRegressor(**config, n_jobs=2)
    neg_mse_per_fold = cross_val_score(
        model,
        X=X_train,
        y=y_train,
        scoring="neg_mean_squared_error",
        cv=kfolds,
    )
    # The scorer returns negated MSE values; flip the sign before the root.
    rmse = np.sqrt(-np.mean(neg_mse_per_fold))
    tune.report(rmse=rmse)

# Search algorithm: hyperopt-backed searcher that minimises the reported "rmse"
search_alg = HyperOptSearch(space=search_space, metric="rmse", mode="min")
# To cap how many trials run at once, wrap the searcher in a ConcurrencyLimiter
# (it is not among the imports at the top of this cell), e.g.
# search_alg = ConcurrencyLimiter(search_alg, max_concurrent=10)
scheduler = AsyncHyperBandScheduler()

# hyperopt Trials container (created here but not handed to tune.run below)
trials = Trials()

# Keyword arguments forwarded to tune.run
config = dict(
    num_samples=100,
    config=search_space,
    search_alg=search_alg,
    scheduler=scheduler,
    resources_per_trial={'cpu': 2},
    metric='rmse',
    mode='min',
    verbose=1,
    #name='xgboost_tuning',
    stop={'training_iteration': 10},
    #local_dir='./ray_results',
)

# Launch the hyperparameter search
analysis = tune.run(objective, **config)

# Show the best configuration found during the search
best_params = analysis.get_best_config(metric='rmse')
print("Best hyperparameters found:", best_params, sep="\n")

0,1
Current time:,2023-04-21 01:36:50
Running for:,08:34:48.61
Memory:,11.9/15.7 GiB

Trial name,status,loc,colsample_bylevel,colsample_bytree,learning_rate,max_depth,n_estimators,reg_alpha,reg_lambda,subsample,iter,total time (s),rmse
objective_61d6a884,RUNNING,127.0.0.1:19800,0.782927,0.915471,0.0117876,9,5118,0.00285809,1.95577,0.517996,,,
objective_6676662d,RUNNING,127.0.0.1:2840,0.752799,0.75437,0.000352808,6,9394,99.4328,3.64193,0.761546,,,
objective_76981051,RUNNING,127.0.0.1:12108,0.854403,0.831586,0.0281522,1,4872,57.2292,0.00105711,0.670004,,,
objective_b34c8c81,RUNNING,127.0.0.1:7708,0.603863,0.607637,0.00473776,11,5611,0.00102199,47.0737,0.907049,,,
objective_7b45143c,PENDING,,0.696291,0.553326,0.000916179,12,1086,26.9468,0.0988847,0.830991,,,
objective_0886bcfc,TERMINATED,127.0.0.1:7708,0.590517,0.771901,0.00686075,9,2391,5.2072,14.0854,0.675428,1.0,854.534,0.0589131
objective_0bae4213,TERMINATED,127.0.0.1:2840,0.980919,0.576184,0.0458388,2,2826,7.06619,16.2624,0.984471,1.0,411.592,0.0581971
objective_0bc782f4,TERMINATED,127.0.0.1:12108,0.869011,0.997973,0.000367332,10,3728,4.15531,2.44446,0.774291,1.0,3172.17,0.140353
objective_10ff3bb7,TERMINATED,127.0.0.1:12108,0.913235,0.532027,0.00136003,2,9969,3.35264,64.2438,0.709718,1.0,1979.74,0.0581349
objective_110cdcc0,TERMINATED,127.0.0.1:19800,0.500542,0.555805,0.000322859,6,6036,0.546832,1.70355,0.765272,1.0,1518.03,0.0918547


### Distributed Ray XGB
https://github.com/ray-project/xgboost_ray