In [1]:
# Load Data
import pandas as pd
import ray

ray.shutdown()
ray.init(object_store_memory=4000000000) # set object store memory to 4GB

data = pd.read_pickle('C:\\Users\\manue\\switchdrive\\Mutual Funds Project\\data\\pickle_files\\full_dataset.pkl')
sample = data.sample(101, random_state=1)

# tansform 'Rating' from Categorical to float
import numpy as np
sample["Rating"] = pd.to_numeric(sample["Rating"], errors='coerce')

# drop all rows with inf/-inf values!
import numpy as np
sample = sample[(sample != np.inf).all(axis=1)]
sample = sample[(sample != -np.inf).all(axis=1)]

#get rid of whitespace to draw tree later
Eq_Stylebox = sample['Eq_Stylebox_Long'].astype('object').replace(' ','_', regex=True)
sample['Eq_Stylebox_Long'] = Eq_Stylebox.astype('category')

# shifting target variable to predict next month
sample['returns'] = sample.returns.shift(-1)
sample = sample.drop(sample.tail(1).index)


X = sample.drop('returns', axis=1)
y = sample['returns']

# create dummies in case of categorical data
dummy_needed = [#'Rating',
                'Financial_Health_Grade_Long',
                 'Growth_Grade_Long',
                 'Profitability_Grade_Long',
                 'Eq_Stylebox_Long']

X = pd.get_dummies(X, columns=dummy_needed)

# Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 26)
#X_train = ray.put(X_train)
#y_train = ray.put(y_train)

2023-04-14 10:15:04,889	INFO worker.py:1553 -- Started a local Ray instance.


### RAY XGB

In [7]:
import time
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch
from xgboost_ray import RayDMatrix, RayParams, train
from ray import tune

def train_xgboost(config, checkpoint_dir=None):
    start_time = time.time()
    
    train_set = RayDMatrix(X_train, y_train)
    test_set = RayDMatrix(X_test, y_test)
    
    evals_result = {}
    
    # train the model
    bst = train(
        params=config,
        dtrain=train_set,
        evals_result=evals_result,
        evals=[(test_set, "eval")],
        verbose_eval=False,
        num_boost_round=5, #equivalent to 'epochs'
        ray_params=RayParams(num_actors=4, cpus_per_actor=2)) #parameters for parallelism
    
    model_path = 'model.xgb'
    bst.save_model(model_path)
    print(f'total time taken: {time.time()-start_time}')
    print('Final rmse: {:.4f}'.format(
    evals_result["eval"]["rmse"][-1]))
    
    return bst
    

# Define the search algorithm and scheduler
#### NOT SURE IF NEEDED OR SUPPORTED IN XGB_RAY! maybe uncomment here and in config
search_alg = HyperOptSearch()
scheduler = ASHAScheduler(max_t=10, grace_period=1)

# Specify the hyperparameter search space.
config = {
    "tree_method": "approx",
    "objective": "reg:squarederror", # Use regression objective
    "eval_metric": "rmse", # Set the evaluation metric to RMSE
    "learning_rate": tune.loguniform(1e-4, 0.9),
    "subsample": tune.uniform(0.8, 1.0),
    "colsample_bytree": tune.uniform(0.8, 1.0),
    "colsample_bylevel": tune.uniform(0.8, 1.0),
    "max_depth": tune.randint(1, 16),
    "n_estimators": tune.randint(500, 10000),
    "reg_alpha": tune.loguniform(1e-3, 100),
    "reg_lambda": tune.loguniform(1e-3, 100),
}

# Run the hyperparameter search
analysis = tune.run(
    train_xgboost,
    resources_per_trial=RayParams(num_actors=4, cpus_per_actor=2).get_tune_resources(),
    config=config,
    num_samples=5,
    search_alg=search_alg,
    scheduler=scheduler,
    metric='rmse',
    mode='min',
    verbose=1
)

best_RMSE = analysis.best_result["rmse"]
print(f'Best model parameters: {analysis.best_config}')
print(f'Best RMSE: {best_RMSE}')
print(analysis.best_config)


0,1
Current time:,2023-04-14 10:21:58
Running for:,00:00:37.98
Memory:,10.5/15.7 GiB

Trial name,# failures,error file
train_xgboost_562f1_00000,1,"C:\Users\manue\ray_results\train_xgboost_2023-04-14_10-21-20\train_xgboost_562f1_00000_0_colsample_bylevel=0.9013,colsample_bytree=0.8510,learning_rate=0.0044,max_depth=2,n_estimators=8413,re_2023-04-14_10-21-21\error.txt"
train_xgboost_562f1_00001,1,"C:\Users\manue\ray_results\train_xgboost_2023-04-14_10-21-20\train_xgboost_562f1_00001_1_colsample_bylevel=0.9070,colsample_bytree=0.9332,learning_rate=0.0002,max_depth=1,n_estimators=769,reg_2023-04-14_10-21-28\error.txt"
train_xgboost_562f1_00002,1,"C:\Users\manue\ray_results\train_xgboost_2023-04-14_10-21-20\train_xgboost_562f1_00002_2_colsample_bylevel=0.9665,colsample_bytree=0.8353,learning_rate=0.0006,max_depth=12,n_estimators=7117,r_2023-04-14_10-21-35\error.txt"
train_xgboost_562f1_00003,1,"C:\Users\manue\ray_results\train_xgboost_2023-04-14_10-21-20\train_xgboost_562f1_00003_3_colsample_bylevel=0.8377,colsample_bytree=0.9315,learning_rate=0.0010,max_depth=8,n_estimators=3157,re_2023-04-14_10-21-42\error.txt"
train_xgboost_562f1_00004,1,"C:\Users\manue\ray_results\train_xgboost_2023-04-14_10-21-20\train_xgboost_562f1_00004_4_colsample_bylevel=0.8678,colsample_bytree=0.8076,learning_rate=0.3912,max_depth=6,n_estimators=3126,re_2023-04-14_10-21-51\error.txt"

Trial name,status,loc,colsample_bylevel,colsample_bytree,learning_rate,max_depth,n_estimators,reg_alpha,reg_lambda,subsample
train_xgboost_562f1_00000,ERROR,127.0.0.1:6620,0.901319,0.851022,0.00442476,2,8413,1.89589,0.195596,0.996528
train_xgboost_562f1_00001,ERROR,127.0.0.1:13820,0.907044,0.933161,0.000193128,1,769,1.59141,1.14684,0.900767
train_xgboost_562f1_00002,ERROR,127.0.0.1:22220,0.966529,0.835264,0.000595643,12,7117,0.15912,3.72587,0.96752
train_xgboost_562f1_00003,ERROR,127.0.0.1:23936,0.83766,0.931536,0.00102074,8,3157,0.0200599,0.0607533,0.981355
train_xgboost_562f1_00004,ERROR,127.0.0.1:23528,0.867784,0.807554,0.39121,6,3126,0.0118646,0.0033219,0.801839


2023-04-14 10:21:27,887	ERROR trial_runner.py:1062 -- Trial train_xgboost_562f1_00000: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=6620, ip=127.0.0.1, repr=train_xgboost)
  File "python\ray\_raylet.pyx", line 857, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 861, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 803, in ray._raylet.execute_task.function_executor
  File "C:\Users\manue\anaconda3\lib\site-packages\ray\_private\function_manager.py", line 674, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "C:\Users\manue\anaconda3\lib\site-packages\ray\util\tracing\tracing_helper.py", line 466, in _resume_span
    return method(self, *_args, **_kwargs)
  File "C:\Users\manue\anaconda3\lib\site-packages\ray\tune\trainable\trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "C:\Users\manue\anaconda3\lib\site-packages\ray\t

TuneError: ('Trials did not complete', [train_xgboost_562f1_00000, train_xgboost_562f1_00001, train_xgboost_562f1_00002, train_xgboost_562f1_00003, train_xgboost_562f1_00004])

In [11]:
import time
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch
from xgboost_ray import RayDMatrix, RayParams, train
from ray import tune

def train_xgboost(config, checkpoint_dir=None):
    start_time = time.time()
    
    train_set = RayDMatrix(X_train, y_train)
    test_set = RayDMatrix(X_test, y_test)
    
    evals_result = {}
    
    # train the model
    bst = train(
        params=config,
        dtrain=train_set,
        evals_result=evals_result,
        evals=[(test_set, "eval")],
        verbose_eval=False,
        num_boost_round=5, #equivalent to 'epochs'
        ray_params=RayParams(num_actors=4, cpus_per_actor=2)) #parameters for parallelism
    
    model_path = 'model.xgb'
    bst.save_model(model_path)
    print(f'total time taken: {time.time()-start_time}')
    print('Final rmse: {:.4f}'.format(
    evals_result["eval"]["rmse"][-1]))
    
    return bst
    

# Define the search algorithm and scheduler
search_alg = HyperOptSearch()
scheduler = ASHAScheduler(max_t=10, grace_period=1)

# Specify the hyperparameter search space.
config = {
    "tree_method": "approx",
    "objective": "reg:squarederror", # Use regression objective
    "eval_metric": "rmse", # Set the evaluation metric to RMSE
    "learning_rate": tune.loguniform(1e-4, 0.9),
    "subsample": tune.uniform(0.8, 1.0),
    "colsample_bytree": tune.uniform(0.8, 1.0),
    "colsample_bylevel": tune.uniform(0.8, 1.0),
    "max_depth": tune.randint(1, 16),
    "n_estimators": tune.randint(500, 10000),
    "reg_alpha": tune.loguniform(1e-3, 100),
    "reg_lambda": tune.loguniform(1e-3, 100),
}

# Run the hyperparameter search
analysis = tune.run(
    train_xgboost,
    resources_per_trial=RayParams(num_actors=4, cpus_per_actor=2).get_tune_resources(),
    config=config,
    num_samples=5,
    search_alg=search_alg,
    scheduler=scheduler,
    metric='rmse',
    mode='min',
    verbose=1
)

best_RMSE = analysis.best_result["rmse"]
print(f'Best model parameters: {analysis.best_config}')
print(f'Best RMSE: {best_RMSE}')
print(analysis.best_config)


0,1
Current time:,2023-04-14 10:37:08
Running for:,00:00:35.40
Memory:,10.5/15.7 GiB

Trial name,# failures,error file
train_xgboost_a15b1406,1,"C:\Users\manue\ray_results\train_xgboost_2023-04-14_10-36-32\train_xgboost_a15b1406_1_colsample_bylevel=0.9488,colsample_bytree=0.9470,eval_metric=rmse,learning_rate=0.0001,max_depth=5,n_esti_2023-04-14_10-36-32\error.txt"
train_xgboost_25cc6eb8,1,"C:\Users\manue\ray_results\train_xgboost_2023-04-14_10-36-32\train_xgboost_25cc6eb8_2_colsample_bylevel=0.9566,colsample_bytree=0.9258,eval_metric=rmse,learning_rate=0.0001,max_depth=10,n_est_2023-04-14_10-36-39\error.txt"
train_xgboost_1e1be016,1,"C:\Users\manue\ray_results\train_xgboost_2023-04-14_10-36-32\train_xgboost_1e1be016_3_colsample_bylevel=0.8457,colsample_bytree=0.9737,eval_metric=rmse,learning_rate=0.4618,max_depth=6,n_esti_2023-04-14_10-36-46\error.txt"
train_xgboost_90f27f23,1,"C:\Users\manue\ray_results\train_xgboost_2023-04-14_10-36-32\train_xgboost_90f27f23_4_colsample_bylevel=0.8301,colsample_bytree=0.8863,eval_metric=rmse,learning_rate=0.0002,max_depth=6,n_esti_2023-04-14_10-36-53\error.txt"
train_xgboost_9736a753,1,"C:\Users\manue\ray_results\train_xgboost_2023-04-14_10-36-32\train_xgboost_9736a753_5_colsample_bylevel=0.9963,colsample_bytree=0.9813,eval_metric=rmse,learning_rate=0.0001,max_depth=8,n_esti_2023-04-14_10-37-00\error.txt"

Trial name,status,loc,colsample_bylevel,colsample_bytree,eval_metric,learning_rate,max_depth,n_estimators,objective,reg_alpha,reg_lambda,subsample,tree_method
train_xgboost_a15b1406,ERROR,127.0.0.1:20784,0.94884,0.946966,rmse,0.000109442,5,5072,reg:squarederror,0.0607008,0.00669629,0.843768,approx
train_xgboost_25cc6eb8,ERROR,127.0.0.1:2008,0.956606,0.925815,rmse,0.000107152,10,1949,reg:squarederror,0.00177046,17.4187,0.832601,approx
train_xgboost_1e1be016,ERROR,127.0.0.1:12052,0.845655,0.973687,rmse,0.461771,6,9242,reg:squarederror,0.00468475,0.123334,0.950859,approx
train_xgboost_90f27f23,ERROR,127.0.0.1:11988,0.830128,0.886264,rmse,0.00020366,6,7333,reg:squarederror,1.35574,1.22889,0.823849,approx
train_xgboost_9736a753,ERROR,127.0.0.1:21772,0.996313,0.981318,rmse,0.000146827,8,2578,reg:squarederror,0.00161149,24.9543,0.892406,approx


2023-04-14 10:36:39,286	ERROR trial_runner.py:1062 -- Trial train_xgboost_a15b1406: Error processing event.
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=20784, ip=127.0.0.1, repr=train_xgboost)
  File "python\ray\_raylet.pyx", line 857, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 861, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 803, in ray._raylet.execute_task.function_executor
  File "C:\Users\manue\anaconda3\lib\site-packages\ray\_private\function_manager.py", line 674, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "C:\Users\manue\anaconda3\lib\site-packages\ray\util\tracing\tracing_helper.py", line 466, in _resume_span
    return method(self, *_args, **_kwargs)
  File "C:\Users\manue\anaconda3\lib\site-packages\ray\tune\trainable\trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "C:\Users\manue\anaconda3\lib\site-packages\ray\tun

TuneError: ('Trials did not complete', [train_xgboost_a15b1406, train_xgboost_25cc6eb8, train_xgboost_1e1be016, train_xgboost_90f27f23, train_xgboost_9736a753])