In [1]:
# Load Data
import numpy as np
import pandas as pd
import ray
from sklearn.model_selection import train_test_split

# Shut down any running Ray instance, then start a local one with a 4 GB object store.
ray.shutdown()
ray.init(object_store_memory=4000000000)

# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
data = pd.read_pickle('C:\\Users\\manue\\switchdrive\\Mutual Funds Project\\data\\pickle_files\\full_dataset.pkl')
sample = data.sample(n=10001, random_state=26)

# Transform 'Rating' from Categorical to float; values that cannot be parsed become NaN.
sample["Rating"] = pd.to_numeric(sample["Rating"], errors='coerce')

# Drop all rows holding an inf/-inf value in any column.
no_pos_inf = (sample != np.inf).all(axis=1)
no_neg_inf = (sample != -np.inf).all(axis=1)
sample = sample[no_pos_inf & no_neg_inf]

# Get rid of whitespace in the style-box labels so the tree can be drawn later.
Eq_Stylebox = sample['Eq_Stylebox_Long'].astype('object').replace(' ', '_', regex=True)
sample['Eq_Stylebox_Long'] = Eq_Stylebox.astype('category')

# Shift the target up by one row (intended: predict next month), then drop the
# final row, whose shifted target is NaN.
# NOTE(review): `sample` is a random draw, so the "next row" is whichever
# observation happens to follow in the sample - presumably not the same fund's
# next month. Confirm; shifting per fund before sampling may be what is wanted.
sample['returns'] = sample['returns'].shift(-1)
sample = sample.drop(index=sample.index[-1:])


y = sample['returns']
X = sample.drop(columns='returns')

# Create dummies in case of categorical data.
dummy_needed = [#'Rating',
                'Financial_Health_Grade_Long',
                'Growth_Grade_Long',
                'Profitability_Grade_Long',
                'Eq_Stylebox_Long']

X = pd.get_dummies(X, columns=dummy_needed)

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=26)

# Place the training data in the Ray object store. From here on X_train and
# y_train are object references and must be resolved with ray.get().
X_train, y_train = ray.put(X_train), ray.put(y_train)

2023-04-13 09:13:55,797	INFO worker.py:1553 -- Started a local Ray instance.


In [2]:
%%time
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials
from ray import tune
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.search.hyperopt import HyperOptSearch
from sklearn.model_selection import cross_val_score, KFold
import numpy as np


# Define the search space for hyperparameters
from hyperopt.pyll.base import scope  # provides scope.int to cast quantized draws

# Integer-valued parameters are quantized uniform draws cast to int instead of
# hp.choice: hp.choice describes unordered categories, so the optimizer could
# not exploit the ordering of tree depth or tree count (and n_estimators alone
# would be 9,900 separate categories). The inclusive ranges are unchanged:
# max_depth in 1..15, n_estimators in 100..9999.
search_space = {
    "learning_rate": hp.loguniform("learning_rate", np.log(0.0001), np.log(0.9)),
    "max_depth": scope.int(hp.quniform("max_depth", 1, 15, 1)),
    "n_estimators": scope.int(hp.quniform("n_estimators", 100, 9999, 1)),
    "subsample": hp.uniform("subsample", 0.5, 1),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
    "colsample_bylevel": hp.uniform("colsample_bylevel", 0.5, 1),
    "reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(100)),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.001), np.log(100)),
}


# Shuffled 5-fold splitter with a fixed seed, shared by every evaluation.
kfolds = KFold(n_splits=5, shuffle=True, random_state=26)

# Define the objective function to optimize
def objective(config):
    """Evaluate one hyperparameter configuration and report its CV RMSE to Tune.

    Parameters
    ----------
    config : dict
        Keyword arguments forwarded unchanged to ``xgb.XGBRegressor``.

    Notes
    -----
    Reads the module-level ``X_train`` / ``y_train`` (Ray object references,
    resolved here with ``ray.get``) and the module-level ``kfolds`` splitter.
    Returns nothing; the score is sent to Ray Tune via ``tune.report`` under
    the key ``rmse``.
    """
    # One thread per model: n_jobs=-1 would let every trial grab all cores,
    # so concurrently running trials would oversubscribe the CPU. Keep this in
    # line with the CPU count requested per trial when launching the run.
    model = xgb.XGBRegressor(**config, n_jobs=1)

    # Resolve both object references in a single call.
    features, target = ray.get([X_train, y_train])

    fold_scores = cross_val_score(
        model,
        X=features,
        y=target,
        scoring="neg_mean_squared_error",
        cv=kfolds,
    )

    # Scores are negated MSE per fold: average, flip the sign, take the root.
    rmse = np.sqrt(-np.mean(fold_scores))
    tune.report(rmse=rmse)

# Define the search algorithm. The hyperopt search space is supplied here, so
# it is not passed again to tune.run() as `config`.
search_alg = HyperOptSearch(space=search_space, metric="rmse", mode="min")
# To cap the number of trials running at the same time, uncomment:
# from ray.tune.search import ConcurrencyLimiter
# search_alg = ConcurrencyLimiter(search_alg, max_concurrent=10)
scheduler = AsyncHyperBandScheduler()

# Keyword arguments for tune.run()
config = {
    'num_samples': 10,
    'search_alg': search_alg,
    'scheduler': scheduler,
    'resources_per_trial': {'cpu': 1},
    'metric': 'rmse',
    'mode': 'min',
    'verbose': 1,
    'name': 'xgboost_tuning',
    # NOTE(review): `objective` calls tune.report() once per trial, so this
    # 10-iteration stop condition (and early stopping by the scheduler) looks
    # like it never comes into play - confirm whether it is still wanted.
    'stop': {'training_iteration': 10},
    'local_dir': './ray_results',
}

# Start the hyperparameter tuning using Ray Tune
analysis = tune.run(objective, **config)

# Print the best hyperparameters found during the search (lowest RMSE)
best_params = analysis.get_best_config(metric='rmse', mode='min')
print("Best hyperparameters found:")
print(best_params)

0,1
Current time:,2023-04-13 16:06:38
Running for:,06:52:23.74
Memory:,9.2/15.7 GiB

Trial name,status,loc,colsample_bylevel,colsample_bytree,learning_rate,max_depth,n_estimators,reg_alpha,reg_lambda,subsample,iter,total time (s),rmse
objective_6ed6efd6,TERMINATED,127.0.0.1:9340,0.761112,0.709481,0.000315163,4,604,22.6778,22.7545,0.65816,1,267.261,0.414202
objective_5c6ac5e5,TERMINATED,127.0.0.1:5616,0.98952,0.852121,0.00312927,6,4914,1.11928,1.14429,0.727295,1,16135.1,0.0616886
objective_22998d6c,TERMINATED,127.0.0.1:6436,0.878238,0.58432,0.804617,12,7868,0.0481791,20.7497,0.716793,1,10012.2,0.0732828
objective_8025eb2e,TERMINATED,127.0.0.1:18564,0.956769,0.958228,0.101735,13,4978,0.00219128,1.06636,0.518688,1,12641.3,0.0657109
objective_dc6f476c,TERMINATED,127.0.0.1:12292,0.579133,0.976691,0.00202408,14,5401,0.203785,0.259692,0.752161,1,24688.1,0.0625713
objective_704b9733,TERMINATED,127.0.0.1:3748,0.9759,0.844444,0.0444126,3,5753,24.5267,1.27402,0.774002,1,3341.81,0.0609319
objective_b8f9aa9a,TERMINATED,127.0.0.1:1068,0.552473,0.541176,0.000933521,15,1346,3.76568,40.4955,0.887033,1,1275.12,0.15488
objective_26c29e11,TERMINATED,127.0.0.1:14068,0.994862,0.587667,0.000202429,7,5332,0.00116188,0.0628445,0.571687,1,13749.9,0.178538
objective_45003793,TERMINATED,127.0.0.1:9340,0.918582,0.667051,0.104281,6,3642,4.01183,0.00200335,0.615973,1,3674.69,0.0627032
objective_6cfb2748,TERMINATED,127.0.0.1:1068,0.933403,0.517579,0.00865301,3,9136,1.26963,0.00266479,0.751606,1,11584.2,0.062231










2023-04-13 16:06:38,211	INFO tune.py:798 -- Total run time: 24743.80 seconds (24742.71 seconds for the tuning loop).


Best hyperparameters found:
{'learning_rate': 0.04441260710575024, 'max_depth': 3, 'n_estimators': 5753, 'subsample': 0.7740023907923075, 'colsample_bytree': 0.8444436871904553, 'colsample_bylevel': 0.9759004083903826, 'reg_alpha': 24.526677001061625, 'reg_lambda': 1.2740176072964426}
Wall time: 6h 52min 27s


In [3]:
# Re-display the best configuration from the finished tuning run without
# re-running the search; relies on `analysis` from the tuning cell.
best_params = analysis.get_best_config(metric='rmse')
print("Best hyperparameters found:", best_params, sep="\n")

Best hyperparameters found:
{'learning_rate': 0.04441260710575024, 'max_depth': 3, 'n_estimators': 5753, 'subsample': 0.7740023907923075, 'colsample_bytree': 0.8444436871904553, 'colsample_bylevel': 0.9759004083903826, 'reg_alpha': 24.526677001061625, 'reg_lambda': 1.2740176072964426}
