In [1]:
# Load Data
#
# Purpose: start a local Ray instance, load the full dataset, draw a small
# random sample, clean it, build the next-row target, one-hot encode the
# categorical columns and split into train/test sets.
import numpy as np
import pandas as pd
import ray

# --- Configuration (everything a reader might want to tune) -----------------
# NOTE(review): absolute, machine-specific path; consider a path relative to a
# configurable data directory so the notebook runs on other machines.
DATA_PATH = 'C:\\Users\\manue\\switchdrive\\Mutual Funds Project\\data\\pickle_files\\full_dataset.pkl'
OBJECT_STORE_BYTES = 4000000000  # Ray object store size in bytes (4GB)
SAMPLE_SIZE = 101                # number of rows drawn from the full dataset
SAMPLE_SEED = 1                  # seed for the row sampling
SPLIT_SEED = 26                  # seed for the train/test split

# Shut down any Ray instance left over from a previous run of this cell, then
# start a fresh one with the configured object store size.
ray.shutdown()
ray.init(object_store_memory=OBJECT_STORE_BYTES)

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
data = pd.read_pickle(DATA_PATH)
sample = data.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)

# Transform 'Rating' from categorical to float; values that cannot be parsed become NaN.
sample["Rating"] = pd.to_numeric(sample["Rating"], errors='coerce')

# Drop all rows containing inf/-inf values. The explicit copy makes `sample` an
# independent frame, so the column assignments below do not trigger
# SettingWithCopyWarning.
no_inf_rows = ((sample != np.inf) & (sample != -np.inf)).all(axis=1)
sample = sample[no_inf_rows].copy()

# Get rid of whitespace in the category labels to draw the tree later.
Eq_Stylebox = sample['Eq_Stylebox_Long'].astype('object').replace(' ', '_', regex=True)
sample['Eq_Stylebox_Long'] = Eq_Stylebox.astype('category')

# Shift the target variable up by one row so each row is paired with the
# return of the following row.
# NOTE(review): the rows come from a random sample, so "the following row" is
# the next row in sample order; presumably it is not the next month of the same
# fund. Verify, and if needed shift per fund on date-sorted data before sampling.
sample['returns'] = sample.returns.shift(-1)
# The last row has no following row (its target is NaN after the shift). Drop
# it by position so exactly one row is removed even if index labels repeat.
sample = sample.iloc[:-1]

X = sample.drop('returns', axis=1)
y = sample['returns']

# Create dummies for the categorical columns ('Rating' is already numeric).
dummy_needed = ['Financial_Health_Grade_Long',
                'Growth_Grade_Long',
                'Profitability_Grade_Long',
                'Eq_Stylebox_Long']

X = pd.get_dummies(X, columns=dummy_needed)

# Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SPLIT_SEED)
# X_train / y_train are kept as plain pandas objects; they are not placed in
# the Ray object store in this cell.

2023-04-14 10:15:04,889	INFO worker.py:1553 -- Started a local Ray instance.


In [None]:
%%time
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials
from ray import tune
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.search.hyperopt import HyperOptSearch
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# CPUs reserved for each trial; also used as the XGBoost thread count so that
# concurrently running trials do not oversubscribe the machine.
CPUS_PER_TRIAL = 1

# Define the search space for hyperparameters (HyperOpt expressions)
search_space = {
    "learning_rate": hp.loguniform("learning_rate", np.log(0.0001), np.log(0.9)),
    "max_depth": hp.choice("max_depth", range(1, 16)),
    "n_estimators": hp.choice("n_estimators", range(100, 10000)),
    "subsample": hp.uniform("subsample", 0.5, 1),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
    "colsample_bylevel": hp.uniform("colsample_bylevel", 0.5, 1),
    "reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(100)),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.001), np.log(100)),
}


kfolds = KFold(n_splits=5, shuffle=True, random_state=26)

# Define the objective function to optimize
def objective(config, X=None, y=None):
    """Cross-validate an XGBRegressor built from `config` and report its RMSE.

    Parameters
    ----------
    config : dict
        Hyperparameters suggested by the search algorithm; passed as keyword
        arguments to `xgb.XGBRegressor`.
    X, y : optional
        Training features and target. Supplied through
        `tune.with_parameters`; when omitted, the notebook-level `X_train` /
        `y_train` are used.

    Reports
    -------
    rmse : float
        Square root of the negated mean of the 5-fold
        `neg_mean_squared_error` scores, sent to Ray Tune via `tune.report`.
    """
    features = X_train if X is None else X
    target = y_train if y is None else y

    model = xgb.XGBRegressor(**config, n_jobs=CPUS_PER_TRIAL)
    scores = cross_val_score(model, X=features, y=target,
                             scoring="neg_mean_squared_error", cv=kfolds)
    # The scorer returns negative MSE per fold; negate the mean before the root.
    rmse = np.sqrt(-np.mean(scores))
    tune.report(rmse=rmse)

# Define the search algorithm. It owns the search space, so the space is not
# passed to tune.run() a second time.
search_alg = HyperOptSearch(space=search_space, metric="rmse", mode="min")
# to limit the number of concurrent trials, wrap the searcher:
# search_alg = ConcurrencyLimiter(search_alg, max_concurrent=10)
scheduler = AsyncHyperBandScheduler()

# HyperOpt Trials object; not consumed by Ray Tune in this cell.
trials = Trials()

# Define the configuration for Ray Tune
config = {
    'num_samples': 10,
    'search_alg': search_alg,
    'scheduler': scheduler,
    'resources_per_trial': {'cpu': CPUS_PER_TRIAL},
    'metric': 'rmse',
    'mode': 'min',
    'verbose': 1,
    'name': 'xgboost_tuning',
    'stop': {'training_iteration': 10},
    'local_dir': './ray_results',
}

# Start the hyperparameter tuning using Ray Tune. The training data is handed
# to the trainable with tune.with_parameters, which stores it in the Ray object
# store once and passes it to every trial as plain objects.
analysis = tune.run(
    tune.with_parameters(objective, X=X_train, y=y_train),
    **config,
)

# Print the best hyperparameters found during the search
best_params = analysis.get_best_config(metric='rmse', mode='min')
print("Best hyperparameters found:")
print(best_params)

### Distributed XGBoost on Ray (`xgboost_ray`)
Reference: https://github.com/ray-project/xgboost_ray

In [None]:
import time
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch
from xgboost_ray import RayDMatrix, RayParams, train
from ray import tune

# Parallelism settings, defined once so the training call and the resources
# requested from Tune cannot drift apart.
ray_params = RayParams(num_actors=4, cpus_per_actor=2)

# Name of the evaluation set and the metric key under which its RMSE is
# reported to Tune ("<eval name>-<metric>").
EVAL_NAME = "eval"
TUNE_METRIC = f"{EVAL_NAME}-rmse"


def train_xgboost(config, checkpoint_dir=None):
    """Train one distributed XGBoost model for a Tune trial.

    Parameters
    ----------
    config : dict
        Booster parameters sampled by the search algorithm.
    checkpoint_dir : optional
        Accepted for compatibility with the Tune function API; not used here.

    Side effects
    ------------
    Saves the trained booster to 'model.xgb' and prints the elapsed time and
    the final evaluation RMSE. Metrics are reported to Tune by the callback
    that `xgboost_ray.train` registers when it runs inside a Tune session.

    Returns
    -------
    None. A Tune function trainable may only return a number or a dict, so the
    booster is written to disk instead of being returned.
    """
    start_time = time.time()

    train_set = RayDMatrix(X_train, y_train)
    test_set = RayDMatrix(X_test, y_test)

    evals_result = {}

    # NOTE(review): the native train API takes the number of trees from
    # `num_boost_round`; the sampled `n_estimators` appears to be ignored
    # here. Confirm, then either drop it from the search space or wire it up.
    bst = train(
        params=config,
        dtrain=train_set,
        evals_result=evals_result,
        evals=[(test_set, EVAL_NAME)],
        verbose_eval=False,
        num_boost_round=5,  # equivalent to 'epochs'
        ray_params=ray_params)  # parameters for parallelism

    model_path = 'model.xgb'
    bst.save_model(model_path)
    print(f'total time taken: {time.time()-start_time}')
    print('Final rmse: {:.4f}'.format(
        evals_result[EVAL_NAME]["rmse"][-1]))


# Define the search algorithm and scheduler. HyperOptSearch is created without
# a space; the Tune search space passed as `config` below is used instead.
search_alg = HyperOptSearch()
scheduler = ASHAScheduler(max_t=10, grace_period=1)

# Specify the hyperparameter search space.
config = {
    "tree_method": "approx",
    "objective": "reg:squarederror", # Use regression objective
    "eval_metric": "rmse", # Set the evaluation metric to RMSE
    "learning_rate": tune.loguniform(1e-4, 0.9),
    "subsample": tune.uniform(0.8, 1.0),
    "colsample_bytree": tune.uniform(0.8, 1.0),
    "colsample_bylevel": tune.uniform(0.8, 1.0),
    "max_depth": tune.randint(1, 16),
    "n_estimators": tune.randint(500, 10000),
    "reg_alpha": tune.loguniform(1e-3, 100),
    "reg_lambda": tune.loguniform(1e-3, 100),
}

# Run the hyperparameter search
analysis = tune.run(
    train_xgboost,
    resources_per_trial=ray_params.get_tune_resources(),
    config=config,
    num_samples=5,
    search_alg=search_alg,
    scheduler=scheduler,
    metric=TUNE_METRIC,
    mode='min',
    verbose=1
)

best_RMSE = analysis.best_result[TUNE_METRIC]
print(f'Best model parameters: {analysis.best_config}')
print(f'Best RMSE: {best_RMSE}')


In [12]:
!pip freeze > requirements.txt