# Reinforcement learning (generator)

In this notebook, I implement a pure reinforcement learning agent in order to analyze the training stability of the `generator` in the adversarial inverse reinforcement learning setting. Here I test various hyperparameters while using the perfect reward function (i.e. excluding the `discriminator` from inverse reinforcement learning) to gain a better understanding of the generator's learning process.

In [None]:
import numpy as np
import pandas as pd
import torch as th
import gymnasium as gym

from datetime import datetime

from stable_baselines3.ppo import PPO, MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.monitor import Monitor

from lob.traders import RLMarketMaker
from lob.commissions import BitComCommissions, BinanceCommissions
from lob.utils import  set_plot_style, get_lot_size, get_tick_size
from rl.environments import LimitOrderBookGym

In [None]:
# Apply the project-wide plotting style
set_plot_style()

# Fix the seed and build a NumPy random number generator from it
SEED = 1
RNG = np.random.default_rng(SEED)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
DEVICE = th.device("cuda") if th.cuda.is_available() else th.device("cpu")
print(f"Using device: {DEVICE}")

# Never truncate the column list when pandas displays a DataFrame
pd.set_option("display.max_columns", None)

### Initialize the market making agent

In [None]:
# --- Market and data selection ---
EXCHANGE_NAME = "BIT.COM"
# Alternative exchange names, currently disabled: "BINANCE", "OKX", "GATEIO"
SYMBOL = "SOL-USDT"
PATH = "~/Projects/thesis-market-making/reinforcement-learning/data/"

# --- Limit order book configuration ---
# Tick size and lot size of the limit order book, looked up per exchange
TICK_SIZE = get_tick_size(EXCHANGE_NAME)
LOT_SIZE = get_lot_size(EXCHANGE_NAME)
# Depth of the data to load to the limit order book (max 20)
DEPTH = 20
# Penalty for division of incoming order flow
ORDER_FLOW_PENALTY = 2
EXCHANGE_TRADER_ID = "Exchange"

# --- Episode configuration ---
# Maximum number of steps in an episode
MAX_STEPS = 300
# Start and end of the episode
TS_START = pd.Timestamp("2023-09-01 00:00:00")
TS_END = pd.Timestamp("2023-09-10 23:59:59")
# Whether to use a deterministic environment
DETERMINISTIC = False
# Window size for the features computation
WIN = 0

# --- Bookkeeping ---
# Whether to log events
LOGGING = False
# Timestamp used for model saving
TS_SAVE = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Dictionary of traders, filled in later
traders = {}

print("Timestamp for saving: ", TS_SAVE)

In [None]:
# Identifier, commission model and volume of the RL agent
rl_trader_id = "RLMarketMaker"
# NOTE(review): EXCHANGE_NAME is set to "BIT.COM" in this notebook while a
# Binance commission model is used here - confirm this is intended.
# TODO: Update commissions and volume
com_model = BinanceCommissions(tier=10)
volume = 10

# Build the market maker and store it in the traders dictionary under its id
trader = RLMarketMaker(id=rl_trader_id, com_model=com_model, volume=volume)
traders[rl_trader_id] = trader

# Free-text description of the experiment
description = "RL market maker simulation."

### Register the limit order book environment

In [None]:
# Identifier and entry point under which the environment is registered
ID = "LimitOrderBookGym-v0"
ENTRY_POINT = LimitOrderBookGym

# Keyword arguments handed to the environment when it is created
KWARGS = dict(
    exchange_name=EXCHANGE_NAME,
    symbol_name=SYMBOL,
    tick_size=TICK_SIZE,
    lot_size=LOT_SIZE,
    depth=DEPTH,
    order_flow_penalty=ORDER_FLOW_PENALTY,
    traders=traders,
    max_steps=MAX_STEPS,
    ts_start=TS_START,
    ts_end=TS_END,
    deterministic=DETERMINISTIC,
    win=WIN,
    path=PATH,
    rl_trader_id=rl_trader_id,
    logging=LOGGING,
    ts_save=TS_SAVE,
    description=description,
)

# Register the environment with gymnasium
gym.envs.register(
    id=ID, entry_point=ENTRY_POINT, kwargs=KWARGS, max_episode_steps=MAX_STEPS
)

# Instantiate the environment wrapped in a Monitor, validate it, and reset it
env = Monitor(gym.make(ID))
check_env(env)
env.reset()

# Read back the saving timestamp stored on the environment's exchange
ts_save = env.unwrapped.exchange.ts_save
print(f"Saving ts: {ts_save}")

In [None]:
# Roll out one episode with a fixed action and print the reward and the
# observation returned by every step.
env.reset()
terminated = truncated = False

# An episode is over when the environment reports either `terminated` or
# `truncated` (the environment is registered with `max_episode_steps`, which
# signals the step limit through `truncated`). Checking only `terminated`
# could keep stepping an already finished episode.
while not (terminated or truncated):
    # action = env.action_space.sample()  # this is where you would insert your policy
    action = 12
    observation, reward, terminated, truncated, info = env.step(action)
    print(f"Reward: {reward}")
    print()

    print(f"Observation: {observation}")

### Define custom tensors and methods for better monitoring

In [None]:
# Reference states used for monitoring. All values are written as floats so
# that every tensor shares one dtype; stacking them then does not depend on
# implicit integer/float type promotion.
monitor_states_orig = [
    th.tensor([-1.0]),
    th.tensor([-0.5]),
    th.tensor([0.0]),
    th.tensor([0.5]),
    th.tensor([1.0]),
]

# Every action index, both as a scalar tensor and as a one-hot row
n_actions = 22
monitor_actions_orig = [th.tensor(x) for x in range(n_actions)]
eye = th.eye(n_actions)
monitor_actions_hot_orig = [eye[x] for x in range(n_actions)]

# Pair every state with every action: each state is repeated `n_actions`
# times in a row, while the full action list is tiled once per state.
monitor_states = th.stack(
    [x for x in monitor_states_orig for _ in range(n_actions)]
).to(DEVICE)
monitor_actions = th.stack(
    monitor_actions_orig * len(monitor_states_orig)
).to(DEVICE)
monitor_actions_hot = th.stack(
    monitor_actions_hot_orig * len(monitor_states_orig)
).to(DEVICE)

In [None]:
def evaluate_probabilities(
    model: PPO,
    monitor_states: th.Tensor,
    monitor_states_orig: list,
    monitor_actions: th.Tensor,
    n_actions: int,
) -> pd.DataFrame:
    """
    Tabulate the policy's action probabilities for a set of monitored states.

    Args:
        model: Agent whose ``policy.evaluate_actions`` returns the
            log-probabilities of the given (state, action) pairs as its
            second output.
        monitor_states: One state per (state, action) pair. The result is
            reshaped to ``(len(monitor_states_orig), n_actions)``, so the
            pairs must be ordered state by state.
        monitor_states_orig: The distinct monitored states, one tensor each;
            they become the ``state`` column of the result.
        monitor_actions: One action per (state, action) pair, aligned with
            ``monitor_states``.
        n_actions: Number of actions evaluated per state.

    Returns:
        DataFrame with one row per monitored state: a ``state`` column
        followed by ``A0`` ... ``A{n_actions - 1}`` holding the action
        probabilities, rounded to two decimals.
    """
    # Monitoring only: no gradients are needed, so skip building the graph
    with th.no_grad():
        _, logprobs_policy, _ = model.policy.evaluate_actions(
            monitor_states,
            monitor_actions,
        )

    # Log-probabilities -> probabilities, one row per state
    probs_policy = th.exp(logprobs_policy).reshape(
        len(monitor_states_orig), n_actions
    )

    # Move both parts to the CPU before converting to NumPy so the function
    # also works when the monitored states live on another device
    states = th.stack(monitor_states_orig).detach().cpu().numpy()
    table = np.hstack([states, probs_policy.detach().cpu().numpy()])

    # Convert to dataframe
    df_probs_policy = pd.DataFrame(
        table,
        columns=["state", *[f"A{x}" for x in range(n_actions)]],
    )
    return df_probs_policy.round(2)

### Initialize the reinforcement learning agent

In [None]:
# Hyperparameters of the underlying policy

# --- Optimisation ---
# Learning rate, can be a function of progress
learning_rate = 0.001
# Number of steps to run for each environment per update
n_steps = 4500
# Mini batch size for each gradient update
batch_size = 15
# Number of epochs when optimizing the surrogate loss
n_epochs = 10

# --- Loss shaping ---
# Discount factor
gamma = 0
# Generalized Advantage Estimator factor
gae_lambda = 0.95
# Clipping parameter, can be a function of progress
clip_range = 0.1
# Clip for the value function, can be a function of progress
clip_range_vf = None
# Entropy coefficient for the loss calculation
ent_coef = 0.01
# Value function coefficient for the loss calculation
vf_coef = 0.5
# The maximum value for the gradient clipping
max_grad_norm = 0.5
# Whether to normalize or not the advantage
normalize_advantage = True

# --- Exploration ---
# Whether to use State Dependent Exploration or not
use_sde = False
# Sample a new noise matrix every n steps (-1 = disable)
sde_sample_freq = -1

# --- Miscellaneous ---
# Seed for the pseudo random generators
seed = SEED
# Verbosity level: 0 no output, 1 info, 2 debug
verbose = 0

In [None]:
# Build the PPO learner from the hyperparameters configured in this notebook
learner = PPO(
    # What to train and where
    policy=MlpPolicy,
    env=env,
    device=DEVICE,
    seed=seed,
    verbose=verbose,
    # Rollout collection and optimisation schedule
    learning_rate=learning_rate,
    n_steps=n_steps,
    batch_size=batch_size,
    n_epochs=n_epochs,
    # Return / advantage estimation
    gamma=gamma,
    gae_lambda=gae_lambda,
    normalize_advantage=normalize_advantage,
    # Loss terms and clipping
    clip_range=clip_range,
    clip_range_vf=clip_range_vf,
    ent_coef=ent_coef,
    vf_coef=vf_coef,
    max_grad_norm=max_grad_norm,
    # Exploration
    use_sde=use_sde,
    sde_sample_freq=sde_sample_freq,
)

In [None]:
# Score the still-untrained learner over a few episodes, sampling its actions
mean_reward, std_reward = evaluate_policy(
    learner,
    env,
    n_eval_episodes=5,
    deterministic=False,
)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
# Roll out one episode with the untrained learner (deterministic actions) and
# print every observation, action and reward.
observation, _ = env.reset()
terminated = truncated = False

# An episode is over when the environment reports either `terminated` or
# `truncated` (the environment is registered with `max_episode_steps`, which
# signals the step limit through `truncated`). Checking only `terminated`
# could keep stepping an already finished episode.
while not (terminated or truncated):
    # action = env.action_space.sample()  # this is where you would insert your policy
    action, _ = learner.predict(observation, deterministic=True)
    print(f"Observation: {observation}")
    print(f"Action: {action}")
    observation, reward, terminated, truncated, info = env.step(action)
    print(f"Reward: {reward}")
    print()

In [None]:
# Action probabilities of the learner on the monitored states before training
probs = evaluate_probabilities(
    learner,
    monitor_states,
    monitor_states_orig,
    monitor_actions,
    n_actions,
)
probs

### Train the agent

In [None]:
train_steps = 3000   # Timesteps requested from each `learn` call
n_iterations = 15    # Number of train / inspect rounds

# Train in short bursts and print the monitored action probabilities after
# each burst to follow how the policy evolves.
# NOTE(review): `train_steps` is smaller than the `n_steps` rollout length
# configured for PPO in this notebook - confirm how many environment steps a
# single `learn` call actually collects.
for i in range(n_iterations):
    # Continue the same training run on every call instead of restarting the
    # learner's timestep counter from zero each iteration.
    learner.learn(
        total_timesteps=train_steps,
        reset_num_timesteps=False,
        progress_bar=False,
    )

    # Evaluate the probabilities of states and actions
    probs = evaluate_probabilities(
        model=learner,
        monitor_states=monitor_states,
        monitor_states_orig=monitor_states_orig,
        monitor_actions=monitor_actions,
        n_actions=n_actions,
    )
    print("Probabilities for iteration: ", i)
    print(probs)
    print()

In [None]:
# Score the trained learner over a few evaluation episodes
mean_reward, std_reward = evaluate_policy(
    learner,
    env,
    n_eval_episodes=5,
)
print(f"Mean reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
# Action probabilities of the trained learner on the monitored states
probs = evaluate_probabilities(
    learner,
    monitor_states,
    monitor_states_orig,
    monitor_actions,
    n_actions,
)
probs

In [None]:
# Roll out one episode with the trained learner (deterministic actions) and
# print every observation, action and reward.
observation, _ = env.reset()
terminated = truncated = False

# An episode is over when the environment reports either `terminated` or
# `truncated` (the environment is registered with `max_episode_steps`, which
# signals the step limit through `truncated`). Checking only `terminated`
# could keep stepping an already finished episode.
while not (terminated or truncated):
    # action = env.action_space.sample()  # this is where you would insert your policy
    action, _ = learner.predict(observation, deterministic=True)
    print(f"Observation: {observation}")
    print(f"Action: {action}")
    observation, reward, terminated, truncated, info = env.step(action)
    print(f"Reward: {reward}")
    print()
        