In [1]:
# Load Data
import pandas as pd
import ray

# Stop any Ray instance left over from a previous run, then start a new one
# whose object store is capped at 4 GB (the value is given in bytes).
ray.shutdown()
ray.init(object_store_memory=4_000_000_000)

# NOTE(review): absolute, machine-specific path; unpickling executes code from
# the file, so only load pickles from a trusted source.
DATA_FILE = 'C:\\Users\\manue\\switchdrive\\Mutual Funds Project\\data\\pickle_files\\full_dataset.pkl'
data = pd.read_pickle(DATA_FILE)

# Work on a fixed-seed random subsample of 50,001 rows.
sample = data.sample(n=50001, random_state=123)

# Transform 'Rating' from categorical to float; entries that cannot be parsed
# as numbers become NaN (errors='coerce').
import numpy as np
sample["Rating"] = pd.to_numeric(sample["Rating"], errors='coerce')

# Keep only the rows in which no column equals +inf or -inf.
# NaN values compare as "not equal" to inf, so rows containing NaN are kept.
no_positive_inf = (sample != np.inf).all(axis=1)
no_negative_inf = (sample != -np.inf).all(axis=1)
sample = sample[no_positive_inf & no_negative_inf]

# Replace spaces with underscores in the style-box labels (whitespace gets in
# the way when the tree is drawn later), then store them as a categorical again.
stylebox_as_object = sample['Eq_Stylebox_Long'].astype('object')
Eq_Stylebox = stylebox_as_object.replace(' ', '_', regex=True)
sample['Eq_Stylebox_Long'] = Eq_Stylebox.astype('category')

# Shift the target up by one row so that each row carries the 'returns' value
# of the row that follows it; the intent is to predict next month's return.
# NOTE(review): `sample` is a random subsample at this point, so the following
# row is presumably not the same fund's next month -- confirm whether the shift
# should be done per fund on date-sorted data before sampling.
sample['returns'] = sample['returns'].shift(-1)

# The last row has no following row (its shifted target is NaN): drop it.
last_row_label = sample.tail(1).index
sample = sample.drop(last_row_label)


# Separate the target column from the predictors.
y = sample['returns']
X = sample.drop(columns='returns')

# One-hot encode the categorical columns.
# 'Rating' is commented out of this list in the notebook and is not encoded.
dummy_needed = [
    'Financial_Health_Grade_Long',
    'Growth_Grade_Long',
    'Profitability_Grade_Long',
    'Eq_Stylebox_Long',
]
X = pd.get_dummies(X, columns=dummy_needed)

# Train/test split with a fixed seed; no test_size is given, so scikit-learn's
# default split proportion applies.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=26)

# Disabled: placing the training data in the Ray object store.
#X_train = ray.put(X_train)
#y_train = ray.put(y_train)

2023-04-19 10:02:04,926	ERROR services.py:1169 -- Failed to start the dashboard , return code 1
2023-04-19 10:02:04,926	ERROR services.py:1194 -- Error should be written to 'dashboard.log' or 'dashboard.err'. We are printing the last 20 lines for you. See 'https://docs.ray.io/en/master/ray-observability/ray-logging.html#logging-directory-structure' to find where the log file is.
2023-04-19 10:02:04,942	ERROR services.py:1238 -- 
The last 20 lines of C:\Users\manue\AppData\Local\Temp\ray\session_2023-04-19_10-02-02_390797_10616\logs\dashboard.log (it contains the error message from the dashboard): 
  File "C:\Users\manue\anaconda3\lib\site-packages\ray\dashboard\utils.py", line 121, in get_all_modules
    importlib.import_module(name)
  File "C:\Users\manue\anaconda3\lib\importlib\__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1030, in _gcd_import
  File "<frozen importlib._bootstrap>"

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from ray import tune
from ray.tune.schedulers import HyperBandScheduler
import numpy as np

# Define the training function for Ray Tune
def _as_float_tensor(values, column=False):
    """Convert array-like data to a float32 torch tensor.

    Args:
        values: a pandas DataFrame/Series, NumPy array, or torch tensor holding
            values that can be cast to float32.
        column: when True, reshape the result to shape (n, 1) so that a 1-D
            target lines up with the network's (n, 1) output.

    Returns:
        A float32 ``torch.Tensor``.
    """
    if isinstance(values, torch.Tensor):
        tensor = values.to(dtype=torch.float32)
    else:
        # np.array copies, so the tensor never aliases the caller's data.
        tensor = torch.from_numpy(np.array(values, dtype=np.float32))
    return tensor.reshape(-1, 1) if column else tensor


def train_model(config):
    """Train a two-hidden-layer MLP regressor and return its test RMSE.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    and converts them to float32 tensors before use.
    NOTE(review): NaN values in the data are not handled here and would
    propagate into the loss -- confirm the inputs are NaN-free.

    Args:
        config: dict with the keys 'dense_layer_size' (width of both hidden
            layers), 'dropout_rate', 'learning_rate' and 'epochs'.

    Returns:
        dict with a single key 'rmse': the root-mean-squared error on the
        test split as a Python float.
    """
    # nn.Linear and nn.MSELoss need tensors, not pandas objects. Targets are
    # reshaped to (n, 1): a 1-D target would broadcast against the (n, 1)
    # prediction inside MSELoss and give a wrong (n x n) loss.
    features_train = _as_float_tensor(X_train)
    target_train = _as_float_tensor(y_train, column=True)
    features_test = _as_float_tensor(X_test)
    target_test = _as_float_tensor(y_test, column=True)

    # Define the neural network model
    hidden_size = config['dense_layer_size']
    model = nn.Sequential(
        nn.Linear(features_train.shape[1], hidden_size),
        nn.ReLU(),
        nn.Dropout(config['dropout_rate']),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        nn.Dropout(config['dropout_rate']),
        nn.Linear(hidden_size, 1)
    )

    # Define the optimizer (Adam lives in torch.optim, imported as `optim`)
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
    criterion = nn.MSELoss()

    # Train the model: full-batch gradient steps on the RMSE
    model.train()
    for _ in range(config['epochs']):
        optimizer.zero_grad()
        loss = torch.sqrt(criterion(model(features_train), target_train))
        loss.backward()
        optimizer.step()

    # Evaluate on the testing set; eval() switches dropout off so the score
    # is deterministic and uses the full network.
    model.eval()
    with torch.no_grad():
        test_loss = torch.sqrt(criterion(model(features_test), target_test))

    # Return the RMSE test loss
    return {'rmse': test_loss.item()}


# Hyperparameter search space: Ray Tune draws one value per entry for each trial.
search_space = dict(
    dense_layer_size=tune.choice([32, 64, 128]),   # width of both hidden layers
    dropout_rate=tune.uniform(0, 0.5),
    learning_rate=tune.loguniform(1e-5, 1e-2),     # sampled on a log scale
    epochs=tune.choice([50, 100, 200]),
)

# Run the search with Ray Tune: 100 sampled configurations, minimising 'rmse'.
# NOTE(review): train_model returns one final result per trial, so the
# HyperBand scheduler may have no intermediate results to stop early on --
# confirm whether per-epoch reporting is wanted.
tune_options = dict(
    config=search_space,
    num_samples=100,
    scheduler=HyperBandScheduler(),
    metric='rmse',
    mode='min',
)
analysis = tune.run(train_model, **tune_options)

# Show the configuration that achieved the lowest reported RMSE.
print(f'Best hyperparameters: {analysis.best_config}')