In [None]:
import sys
import os
from collections import deque

import gymnasium as gym
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from gymnasium.envs.registration import register
from stable_baselines3 import PPO, A2C
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback

import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances

import torch.nn as nn
import torch as th

sys.path.append('..')

from environment.mangoEnv import MangoEnv, MAX_AMOUNT

In [None]:
# Every artefact directory lives directly under the notebook's working directory.
_cwd = os.getcwd()

log_path = os.path.join(_cwd, 'Logs')              # passed to Monitor and tensorboard_log
opt_path = os.path.join(_cwd, 'Opt')               # models saved per Optuna trial
log_path_for_opt = os.path.join(opt_path, 'Logs')
checkpoints_path = os.path.join(_cwd, 'Train')     # checkpoints written during the final training run
#!tensorboard --logdir=Training\Logs - to watch log
# NOTE(review): log_path above points at ./Logs, not Training\Logs - confirm the intended directory.

In [None]:
# Run configuration
max_episode_steps = 25000       # forwarded to the environment through env_args
max_episode_steps_eval = 100    # step cap of the verbose single-episode rollout
fine_tune_timesteps = 2000000   # timesteps of the final training run
callback_freq = 100000          # checkpoint interval (in callback calls)
training_alg = 'PPO' # 'PPO', 'A2C', 'REC_PPO'

# Environment parameters
env_args = {
    'render_mode': None,
    'max_episode_steps': max_episode_steps,
    'ext_mngo_price_mean': 0.1,
    'init_mngo_pool_balance': 1e6 / MAX_AMOUNT,
    'init_usdc_pool_balance': 1e5 / MAX_AMOUNT,
    'init_treasury_size_usdc': 100e6 / MAX_AMOUNT,
    'mngo_collateral_factor': 1.5,
    'arb_efficiency_factor': 0.5,
}

# Optimization parameters
N_TRIALS = 100  # Maximum number of trials
N_JOBS = 1 # Number of jobs to run in parallel
N_STARTUP_TRIALS = 5  # Stop random sampling after N_STARTUP_TRIALS
N_EVALUATIONS = 5  # Number of evaluations during the training
N_TIMESTEPS = int(1e5)  # Training budget
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 10
TIMEOUT = int(60 * 60 * 3)

# Policy class used by each supported algorithm.
if training_alg in ['A2C', 'PPO']:
    DEFAULT_HYPERPARAMS = {
        "policy": "MlpPolicy",
    }
elif training_alg in ['REC_PPO']:
    DEFAULT_HYPERPARAMS = {
        "policy": "MlpLstmPolicy",
    }
else:
    # Fail here rather than later inside objective(), where an undefined (or stale,
    # from a previous run of this cell) DEFAULT_HYPERPARAMS would be picked up.
    raise ValueError(
        f"Unsupported training_alg {training_alg!r}; expected 'PPO', 'A2C' or 'REC_PPO'."
    )

In [None]:
# Make the environment available to gym.make() under a fixed id.
# NOTE(review): the registration uses max_episode_steps=100 while env_args passes
# max_episode_steps=25000 to the environment itself - confirm which limit is intended.
mango_env_id = "MangoEnv-v0"
register(id=mango_env_id, entry_point="environment.mangoEnv:MangoEnv", max_episode_steps=100)

# Run the stable-baselines3 environment checker on a fresh instance.
env = gym.make(mango_env_id, **env_args)
check_env(env, warn=True)

In [None]:
# Baseline: play N_EVAL_EPISODES episodes with uniformly sampled actions and count
# how many steps return a reward of exactly 10.0.
# NOTE(review): 10.0 is presumably the reward MangoEnv emits on success - confirm.
env = gym.make("MangoEnv-v0", **env_args)
success = 0

for jj in range(N_EVAL_EPISODES):
    obs, _ = env.reset()
    while True:
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        if reward == 10.0:
            success += 1
        done = terminated or truncated
        if done:
            break
print(f"{success} / {N_EVAL_EPISODES} successes detected.")

In [None]:
# Evaluation environment: a monitored instance wrapped as a single-env VecEnv.
monitored_eval_env = Monitor(gym.make("MangoEnv-v0", **env_args), log_path)
eval_env = DummyVecEnv([lambda: monitored_eval_env])

# Pick the algorithm class and its policy; any name other than 'PPO' / 'REC_PPO' falls back to A2C.
if training_alg == 'PPO':
    algorithm_cls, policy_name = PPO, 'MlpPolicy'
elif training_alg == 'REC_PPO':
    algorithm_cls, policy_name = RecurrentPPO, 'MlpLstmPolicy'
else:
    algorithm_cls, policy_name = A2C, 'MlpPolicy'

# `env` is whatever environment the previously executed cell left behind.
model = algorithm_cls(policy_name, env, verbose=0, tensorboard_log=log_path)

In [None]:
# Baseline score: evaluate the model over 10 episodes with stochastic actions.
# No learn() call has been made on it at this point when the cells are run in order.
evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=False, render=False)

## Hyperparameter tuning

In [None]:
def sample_a2c_params(trial: optuna.Trial):
    """
    Draw one A2C hyperparameter configuration from an Optuna trial.

    ``gamma`` is searched as ``1 - gamma`` on a log scale and ``n_steps`` as a
    power-of-two exponent. The values actually used (together with the resolved
    network architecture and activation class) are stored as user attributes
    on the trial.

    :param trial: Optuna trial object
    :return: Dictionary with ``n_steps``, ``gamma``, ``learning_rate``,
        ``max_grad_norm`` and ``policy_kwargs`` for the given trial.
    """
    hidden_layers_by_size = {
        "tiny": [64],
        "small": [64, 64],
        "medium": [64, 64, 128],
    }
    activation_by_name = {"tanh": nn.Tanh, "relu": nn.ReLU}

    # The suggest_* calls are kept in their original order.
    one_minus_gamma = trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 5.0, log=True)
    n_steps_exponent = trial.suggest_int("exponent_n_steps", 3, 10)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True)
    arch_name = trial.suggest_categorical("net_arch", ["tiny", "small", "medium"])
    activation_name = trial.suggest_categorical("fn", ["tanh", "relu"])

    gamma = 1.0 - one_minus_gamma
    n_steps = 2 ** n_steps_exponent

    # Display true values
    trial.set_user_attr("gamma_", gamma)
    trial.set_user_attr("n_steps", n_steps)

    # Separate actor ("pi") and critic ("vf") heads share the same layer sizes;
    # the user attribute gets its own copy of the lists.
    layers = hidden_layers_by_size[arch_name]
    trial.set_user_attr("net_arch", {"pi": list(layers), "vf": list(layers)})

    activation_fn = activation_by_name[activation_name]
    trial.set_user_attr("activation_fn", activation_fn)

    policy_kwargs = {
        "net_arch": {"pi": list(layers), "vf": list(layers)},
        "activation_fn": activation_fn,
    }
    return {
        "n_steps": n_steps,
        "gamma": gamma,
        "learning_rate": learning_rate,
        "max_grad_norm": max_grad_norm,
        "policy_kwargs": policy_kwargs,
    }

def sample_ppo_params(trial: optuna.Trial, batch_size=64):
    """
    Draw one PPO hyperparameter configuration from an Optuna trial.

    ``gamma`` is searched as ``1 - gamma`` on a log scale; the value actually
    used, the resolved network architecture and the activation class are
    stored as user attributes on the trial.

    :param trial: Optuna trial object
    :param batch_size: Granularity of ``n_steps``, which is sampled as a
        multiple of ``batch_size`` between 32x and 256x. It is not part of the
        returned dictionary.
    :return: Dictionary with ``n_steps``, ``gamma``, ``learning_rate``,
        ``clip_range``, ``gae_lambda``, ``ent_coef`` and ``policy_kwargs``
        for the given trial.
    """
    hidden_layers_by_size = {
        "tiny": [64],
        "small": [64, 64],
        "medium": [64, 64, 128],
    }
    activation_by_name = {"tanh": nn.Tanh, "relu": nn.ReLU}

    # The suggest_* calls are kept in their original order.
    n_steps = trial.suggest_int('n_steps', 32*batch_size, 256*batch_size, step=batch_size)
    one_minus_gamma = trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1, log=True)
    clip_range = trial.suggest_float('clip_range', 0.1, 0.4)
    ent_coef = trial.suggest_float('ent_coef', 0.0, 1.0)
    gae_lambda = trial.suggest_float('gae_lambda', 0.8, 0.99)
    arch_name = trial.suggest_categorical("net_arch", ["tiny", "small", "medium"])
    activation_name = trial.suggest_categorical("fn", ["tanh", "relu"])

    gamma = 1.0 - one_minus_gamma

    # Display true values
    trial.set_user_attr("gamma_", gamma)

    # Separate actor ("pi") and critic ("vf") heads share the same layer sizes;
    # the user attribute gets its own copy of the lists.
    layers = hidden_layers_by_size[arch_name]
    trial.set_user_attr("net_arch", {"pi": list(layers), "vf": list(layers)})

    activation_fn = activation_by_name[activation_name]
    trial.set_user_attr("activation_fn", activation_fn)

    policy_kwargs = {
        "net_arch": {"pi": list(layers), "vf": list(layers)},
        "activation_fn": activation_fn,
    }
    return {
        "n_steps": n_steps,
        "gamma": gamma,
        "learning_rate": learning_rate,
        "clip_range": clip_range,
        "gae_lambda": gae_lambda,
        "ent_coef": ent_coef,
        "policy_kwargs": policy_kwargs,
    }

def sample_reccurent_ppo_params(trial: optuna.Trial):
    """
    Draw one recurrent-PPO hyperparameter configuration from an Optuna trial.

    ``batch_size`` is a power of two from 32 to 1024 and ``n_steps`` a multiple
    of it (1x to 64x). ``gamma`` is searched as ``1 - gamma`` on a log scale;
    the value actually used is stored as the ``gamma_`` user attribute.

    :param trial: Optuna trial object
    :return: Dictionary with ``batch_size``, ``n_steps``, ``gamma``,
        ``learning_rate``, ``clip_range``, ``gae_lambda`` and ``ent_coef``
        for the given trial.
    """
    batch_size = trial.suggest_categorical('batch_size', [2 ** exponent for exponent in range(5, 11)])

    # Dict entries are evaluated top to bottom, which keeps the original suggest order.
    sampled = {
        "batch_size": batch_size,
        "n_steps": trial.suggest_int('n_steps', batch_size, 64*batch_size, step=batch_size),
        "gamma": 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True),
        "learning_rate": trial.suggest_float('learning_rate', 1e-7, 1, log=True),
        "clip_range": trial.suggest_float('clip_range', 0.1, 0.4),
    }
    # ent_coef is suggested before gae_lambda but appears after it in the result.
    ent_coef = trial.suggest_float('ent_coef', 0.0, 1.0)
    sampled["gae_lambda"] = trial.suggest_float('gae_lambda', 0.8, 0.99)
    sampled["ent_coef"] = ent_coef

    # Display true values
    trial.set_user_attr("gamma_", sampled["gamma"])

    return sampled

In [None]:
class TrialEvalCallback(EvalCallback):
    """
    Evaluation callback that reports each evaluation to an Optuna trial and
    stops training when the trial should be pruned.

    :param eval_env: Evaluation environment
    :param trial: Optuna trial object
    :param n_eval_episodes: Number of evaluation episodes
    :param eval_freq: Evaluate the agent every ``eval_freq`` call of the callback.
    :param deterministic: Whether the evaluation should
        use a stochastic or deterministic policy.
    :param verbose: Verbosity level forwarded to ``EvalCallback``.
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        # Number of evaluations reported so far; used as the Optuna step index.
        self.eval_idx = 0
        # Set once Optuna asks for the trial to be pruned.
        self.is_pruned = False

    def _on_step(self) -> bool:
        evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not evaluation_due:
            return True

        # The parent class runs the evaluation and updates last_mean_reward.
        super()._on_step()
        self.eval_idx += 1
        self.trial.report(self.last_mean_reward, self.eval_idx)

        if not self.trial.should_prune():
            return True

        # Returning False tells the training loop to stop.
        self.is_pruned = True
        return False

In [None]:
def objective(trial: optuna.Trial, alg=training_alg) -> float:
    """
    Objective function used by Optuna to evaluate
    one configuration (i.e., one set of hyperparameters).

    Given a trial object, it samples hyperparameters, trains a model for
    ``N_TIMESTEPS`` steps while periodically evaluating it, saves the model
    under ``opt_path`` and reports the result.

    :param trial: Optuna trial object
    :param alg: Algorithm to tune: ``'PPO'``, ``'REC_PPO'``, anything else
        selects A2C. The default is bound to ``training_alg`` when this
        function is defined.
    :return: Mean episodic reward of the last evaluation, or NaN when training
        raised an ``AssertionError``.
    :raises optuna.exceptions.TrialPruned: If the pruner stopped the trial.
    """

    kwargs = DEFAULT_HYPERPARAMS.copy()

    # Training environment: monitored, vectorised, with 5 stacked observations.
    env = gym.make("MangoEnv-v0", **env_args)
    env = Monitor(env, log_path)
    env = DummyVecEnv([lambda: env])
    env = VecFrameStack(env, 5, channels_order='last')
    kwargs['env'] = env

    if alg == 'PPO':
        other_params = sample_ppo_params(trial)
        kwargs.update(other_params)
        model = PPO(**kwargs)
    elif alg == 'REC_PPO':
        other_params = sample_reccurent_ppo_params(trial)
        kwargs.update(other_params)
        model = RecurrentPPO(**kwargs)
    else:
        other_params = sample_a2c_params(trial)
        kwargs.update(other_params)
        model = A2C(**kwargs)

    # Separate evaluation environment wrapped the same way as the training one.
    eval_env = gym.make("MangoEnv-v0", **env_args)
    eval_env = Monitor(eval_env, log_path)
    eval_env = DummyVecEnv([lambda: eval_env])
    eval_env = VecFrameStack(eval_env, 5, channels_order='last')
    eval_callback = TrialEvalCallback(eval_env, trial, N_EVAL_EPISODES, EVAL_FREQ, deterministic=False, verbose=0)

    nan_encountered = False
    try:
        # Train the model
        model.learn(N_TIMESTEPS, callback=eval_callback)
        model.save(os.path.join(opt_path, 'trial_{}_best_model'.format(trial.number)))
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True
    finally:
        # Free memory. The original referenced the undefined name `eval_envs`,
        # which raised NameError here and failed every trial.
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

In [None]:
# Set pytorch num threads to 1 for faster training
th.set_num_threads(1)

# Select the sampler, can be random, TPESampler, CMAES, ...
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
# Do not prune before 1/3 of the max budget is used
pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

# Create the study and start the hyperparameter optimization
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")

# A manual interrupt ends the search early; the trials finished so far are kept.
try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except KeyboardInterrupt:
    pass

print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

# Print the sampled parameters and the derived "true" values stored as user attributes.
for heading, entries in (("  Params: ", trial.params), ("  User attrs:", trial.user_attrs)):
    print(heading)
    for key, value in entries.items():
        print(f"    {key}: {value}")

# Turn the best trial into constructor keyword arguments: replace the searched
# surrogate values by the real ones recorded in the user attributes.
model_params = dict(trial.params)
model_params['gamma'] = trial.user_attrs['gamma_']

if training_alg in ('PPO', 'A2C'):
    for sampled_name in ('net_arch', 'fn'):
        model_params.pop(sampled_name, None)
    model_params['policy_kwargs'] = {
        'net_arch': trial.user_attrs['net_arch'],
        'activation_fn': trial.user_attrs['activation_fn'],
    }
if training_alg == 'A2C':
    model_params.pop('exponent_n_steps', None)
    model_params['n_steps'] = trial.user_attrs['n_steps']

fig1 = plot_optimization_history(study)
# fig2 = plot_param_importances(study) # function doesn't work on windows!

fig1.show()
# fig2.show()

In [None]:
# Show the constructor keyword arguments assembled from the best trial.
print(model_params)

## Learning

In [None]:
class TrainAndLoggingCallback(BaseCallback):
    """
    Callback that saves the model at a fixed interval of callback calls.

    :param check_freq: Save the model every ``check_freq`` calls of the
        callback; a non-positive value disables saving.
    :param save_path: Directory that receives the saved models (created on
        initialisation if missing); ``None`` disables saving.
    :param verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        # _init_callback accepts save_path=None, so the step must tolerate it too
        # (os.path.join(None, ...) raises TypeError); check_freq <= 0 would
        # otherwise divide by zero or never make sense as an interval.
        checkpoint_due = (
            self.save_path is not None
            and self.check_freq > 0
            and self.n_calls % self.check_freq == 0
        )
        if checkpoint_due:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        # Always continue training.
        return True

In [None]:
# Checkpoint callback: saves the model into checkpoints_path every callback_freq calls.
callback = TrainAndLoggingCallback(check_freq=callback_freq, save_path=checkpoints_path)

In [None]:
# Training environment: monitored, vectorised, with 5 stacked observations
# (the same wrapping used inside objective()).
env = gym.make("MangoEnv-v0", **env_args)
env = Monitor(env, log_path)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 5, channels_order='last')

# Rebuild the model with the hyperparameters of the best trial.
if training_alg == 'PPO':
    model = PPO('MlpPolicy', env, verbose=0, tensorboard_log=log_path, **model_params)
elif training_alg == 'REC_PPO':
    model = RecurrentPPO('MlpLstmPolicy', env, verbose=0, tensorboard_log=log_path, **model_params)
else:
    model = A2C('MlpPolicy', env, verbose=0, tensorboard_log=log_path, **model_params)

# `load` is a classmethod that returns a NEW model; calling it on an instance and
# discarding the result leaves `model` with its random initial weights.
# set_parameters() loads the saved weights into the existing model in place.
model.set_parameters(os.path.join(opt_path, f"trial_{trial.number}_best_model.zip"))
print(f"Coefficients from trial {trial.number} loaded.")

In [None]:
# Fresh evaluation environment wrapped like the training one
# (Monitor -> DummyVecEnv -> 5-frame stack).
monitored_eval_env = Monitor(gym.make("MangoEnv-v0", **env_args), log_path)
eval_env = VecFrameStack(
    DummyVecEnv([lambda: monitored_eval_env]),
    5,
    channels_order='last',
)

# Score the model over 10 episodes with stochastic actions.
evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=False, render=False)

In [None]:
# Roll out the current policy for N_EVAL_EPISODES episodes and count the steps
# that return a reward of exactly 10.0.
# NOTE(review): 10.0 is presumably the reward MangoEnv emits on success - confirm.
env = gym.make("MangoEnv-v0", **env_args)
env = Monitor(env, log_path)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 5, channels_order='last')
success = 0

for jj in range(N_EVAL_EPISODES):
    obs = env.reset()
    done = False
    # Recurrent policies (training_alg == 'REC_PPO') need their hidden state fed
    # back on every step; discarding it makes the LSTM restart from zeros each
    # call. Non-recurrent policies simply pass the value through.
    policy_state = None
    while not done:
        action, policy_state = model.predict(obs, state=policy_state, deterministic=False)
        obs, reward, done, info = env.step(action)
        if reward == 10.0:
            success += 1
print(f"{success} / {N_EVAL_EPISODES} successes detected.")

In [None]:
# Final training run for fine_tune_timesteps steps; `callback` saves the model periodically.
model.learn(total_timesteps=fine_tune_timesteps, progress_bar=True, callback=callback)

In [None]:
# Fresh evaluation environment wrapped like the training one
# (Monitor -> DummyVecEnv -> 5-frame stack).
monitored_eval_env = Monitor(gym.make("MangoEnv-v0", **env_args), log_path)
eval_env = VecFrameStack(
    DummyVecEnv([lambda: monitored_eval_env]),
    5,
    channels_order='last',
)

# Score the model over 10 episodes with stochastic actions.
evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=False, render=False)

In [None]:
# Play a single episode (at most max_episode_steps_eval steps), printing every
# action, the spot price and the reward, and counting "attacks on the treasury".
env = gym.make("MangoEnv-v0", **env_args)
env = Monitor(env, log_path)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 5, channels_order='last')
obs = env.reset()
total_reward = 0
done = False
# Last two values of the first action component, newest first.
d = deque(maxlen=2)
attack = 0
# Hidden state of recurrent policies (training_alg == 'REC_PPO'); it must be fed
# back on every step, otherwise the LSTM restarts from zeros each call.
# Non-recurrent policies simply pass the value through.
policy_state = None

for ii in range(max_episode_steps_eval):
    action, policy_state = model.predict(obs, state=policy_state, deterministic=False)
    print(f"action {ii}: {env.venv.env_method('convert_action_rl_to_human', action.squeeze())[0]}, mngo_spot_price: {env.venv.get_attr('amm')[0].get_price()}")
    d.appendleft(action.squeeze()[0])
    obs, reward, done, info = env.step(action)
    # A rewarded step whose action code is 2, directly preceded by code 0, is counted as an attack.
    # NOTE(review): the meaning of codes 2 and 0 is defined by MangoEnv - confirm.
    if d == deque([2, 0]) and reward > 0:
        print(f"Attack on treasury!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        attack += 1
    if reward > 0:
        print(f"reward at {ii} step: {reward.squeeze()}!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    else:
        print(f"reward at {ii} step: {reward.squeeze()}")
    total_reward += reward
    if done.squeeze():
        break

print(f"Done: {done.squeeze()}, total reward: {total_reward.squeeze()}, attacks: {attack}")
print(f"Treasury: {env.venv.get_attr('mango')[0].treasury_usdc}")
print(f"Health_factor: {env.venv.get_attr('mango')[0].get_user_health_factor()}, debt is bad: {env.venv.get_attr('mango')[0].debt_is_bad()}")

In [None]:
# Roll out the trained policy for N_EVAL_EPISODES episodes and count the steps
# that return a reward of exactly 10.0.
# NOTE(review): 10.0 is presumably the reward MangoEnv emits on success - confirm.
env = gym.make("MangoEnv-v0", **env_args)
env = Monitor(env, log_path)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 5, channels_order='last')
success = 0

for jj in range(N_EVAL_EPISODES):
    obs = env.reset()
    done = False
    # Recurrent policies (training_alg == 'REC_PPO') need their hidden state fed
    # back on every step; discarding it makes the LSTM restart from zeros each
    # call. Non-recurrent policies simply pass the value through.
    policy_state = None
    while not done:
        action, policy_state = model.predict(obs, state=policy_state, deterministic=False)
        obs, reward, done, info = env.step(action)
        if reward == 10.0:
            success += 1
print(f"{success} / {N_EVAL_EPISODES} successes detected.")