# Adversarial Inverse Reinforcement Learning for Market Making (2024)

This notebook contains the code for the paper _Adversarial Inverse Reinforcement Learning for Market Making_ published in the proceedings of the [ICAIF'24](https://ai-finance.org/) conference.

In [None]:
import os
import time
from datetime import datetime

import gymnasium as gym
import numpy as np
import pandas as pd
import torch as th
from imitation.algorithms.adversarial.airl import AIRL
from imitation.data import rollout, serialize
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.util.util import make_vec_env
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import VecCheckNan
from stable_baselines3.ppo import MlpPolicy

from src.lob.commissions import BitCommissions
from src.lob.exchange import Exchange
from src.lob.plots import set_plot_style, visualize_backtest
from src.lob.traders import RLMarketMaker
from src.lob.utils import get_lot_size, get_tick_size
from src.rl.environments import LimitOrderBookGym
from src.rl.experts import ExpertPolicyV1, RandomPolicyV1
from src.rl.plotting import visualize_airl_train_stats
from src.rl.rewards import NegativeRewardNet
from src.rl.utils import load_model, save_model

In [None]:
# Apply the plotting style provided by src.lob.plots
set_plot_style()

# Pick the torch device: the GPU when CUDA is available, otherwise the CPU
if th.cuda.is_available():
    DEVICE = th.device("cuda")
else:
    DEVICE = th.device("cpu")
print(f"Using device: {DEVICE}")

In [None]:
# Set strict error checking so that numerical problems fail loudly instead
# of silently propagating through training.
# Autograd anomaly detection reports the forward operation responsible for a
# failing backward pass (this slows training down).
th.autograd.set_detect_anomaly(True)
# Raise FloatingPointError for every NumPy floating-point error category
# instead of emitting a warning. As the last expression of the cell, the
# previous error settings returned by seterr are shown as the cell output.
np.seterr(all="raise")

In [None]:
# Seed for the pseudo random generator.
# Exactly one of the alternatives below is active per run; uncomment the
# desired seed (and comment out the others) to reproduce another run.
# SEED = 1
# SEED = 2
# SEED = 3
# SEED = 4
SEED = 5

In [None]:
# Set the paths
# NOTE(review): "~" is stored literally and nothing in this notebook expands
# it; presumably the consumers of PATH / PATH_VOL_DISTR call
# os.path.expanduser -- confirm.
PATH = "~/Projects/airl-market-making/data/pricing/"
PATH_VOL_DISTR = "~/Projects/airl-market-making/data/volume_distributions/"
# Relative path: resolved against the current working directory
PATH_ROLLOUTS = "data/rollouts/rollouts_2024-01-20_18-33-28.pkl"

### Register custom vectorized environment

In this section I load the limit order book gym environment and register it as a custom vectorized environment. This is necessary for the `stable-baselines3` library to work with the environment.

In [None]:
# --- Market ---------------------------------------------------------------
EXCHANGE_NAME = "BIT.COM"
SYMBOL = "SOL-USDT"
EXCHANGE_TRADER_ID = "Exchange"
# Tick size and lot size of the limit order book are looked up per exchange
TICK_SIZE = get_tick_size(EXCHANGE_NAME)
LOT_SIZE = get_lot_size(EXCHANGE_NAME)
# Depth of the data to load to the limit order book (max 20)
DEPTH = 20

# --- Episode --------------------------------------------------------------
# Start and end of the episode
TS_START = pd.Timestamp("2023-09-01 00:00:00")
TS_END = pd.Timestamp("2023-09-10 23:59:59")
# Maximum number of steps in an episode
MAX_STEPS = 300
# Indicates whether to use a deterministic environment
DETERMINISTIC = False
# Window size for the features computation
WIN = 0

# --- Simulation bookkeeping -----------------------------------------------
# Parameters for the stochastic backtest (none used here)
LATENCY_COMP_PARAMS = {}
# Indicates whether to log events
LOGGING = False
# Random number generator seeded with SEED
RNG = np.random.default_rng(seed=SEED)
# Dictionary of traders, filled in later
traders = {}
# Timestamp used for model saving
TS_SAVE = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

print("Timestamp for saving: ", TS_SAVE)

In [None]:
# Identifier, commission model and volume of the RL market maker
rl_trader_id = "RLMarketMaker"
com_model = BitCommissions(tier=5)
volume = 100

# Create the trader and register it in the traders dictionary under its id
traders[rl_trader_id] = trader = RLMarketMaker(
    id=rl_trader_id,
    com_model=com_model,
    volume=volume,
)

# Short description of the experiment
description = "RL market maker simulation."

In [None]:
# Identifier and entry point under which the environment is registered
ID = "LimitOrderBookGym-v1"
ENTRY_POINT = LimitOrderBookGym

# Keyword arguments forwarded to the entry point for every created instance.
# NOTE(review): `traders` and `RNG` are passed by reference, so every
# environment created from this registration shares the same objects --
# confirm this is intended.
KWARGS = dict(
    exchange_name=EXCHANGE_NAME,
    symbol_name=SYMBOL,
    tick_size=TICK_SIZE,
    lot_size=LOT_SIZE,
    depth=DEPTH,
    traders=traders,
    max_steps=MAX_STEPS,
    ts_start=TS_START,
    ts_end=TS_END,
    deterministic=DETERMINISTIC,
    win=WIN,
    path=PATH,
    path_vol_distr=PATH_VOL_DISTR,
    rl_trader_id=rl_trader_id,
    latency_comp_params=LATENCY_COMP_PARAMS,
    logging=LOGGING,
    ts_save=TS_SAVE,
    description=description,
    rng=RNG,
)

# Register the environment with gymnasium
gym.envs.register(
    id=ID,
    entry_point=ENTRY_POINT,
    kwargs=KWARGS,
    max_episode_steps=MAX_STEPS,
)

# Instantiate the registered environment and wrap it in a Monitor
base_env = gym.make(ID)
env = Monitor(base_env)

# Read back the saving timestamp stored on the environment's exchange
ts_save = env.unwrapped.exchange.ts_save
print(f"Saving ts: {ts_save}")

In [None]:
def _wrap_rollout_info(single_env, _env_index):
    """Wrap an environment in RolloutInfoWrapper (needed for rollouts later).

    The second positional argument supplied by make_vec_env is ignored.
    """
    return RolloutInfoWrapper(single_env)


# Create the vectorized environment with a single, non-parallel instance
venv = make_vec_env(
    ID,
    rng=RNG,
    n_envs=1,
    post_wrappers=[_wrap_rollout_info],
    parallel=False,
)

# Raise an exception as soon as a NaN shows up
venv = VecCheckNan(venv, raise_exception=True)
venv.reset()

### Generate rollouts with random and expert policies

In this section I generate rollouts with a random policy and define the expert policy that serves as the target of the imitation.

In [None]:
# Stop criterion for the rollout: one full episode, no timestep minimum
min_timesteps = None
min_episodes = 1
sample_until = rollout.make_sample_until(
    min_timesteps=min_timesteps, min_episodes=min_episodes
)

# Roll out the environment; passing None as the policy samples random actions
rollouts = rollout.rollout(None, venv, sample_until=sample_until, rng=RNG)

# Print the observation/action pairs of the first rollout. The final
# observation has no matching action, so it is left out.
first_rollout = rollouts[0]
for step_obs, step_act in zip(first_rollout.obs[:-1], first_rollout.acts):
    print("Observation: ", step_obs)
    print("Action: ", step_act)
    print()

In [None]:
# Initialize the random policy over the action space of the vectorized
# environment (used as a baseline in the evaluation below)
random_policy = RandomPolicyV1(venv.action_space)

In [None]:
# Score the random policy on a single episode of the monitored environment;
# with return_episode_rewards=True the per-episode rewards are returned
reward_random_policy, _ = evaluate_policy(
    model=random_policy,
    env=env,
    n_eval_episodes=1,
    return_episode_rewards=True,
)
print("Reward: ", np.mean(reward_random_policy))
print("Std   : ", np.std(reward_random_policy))

In [None]:
# Initialize the expert policy; its `predict` method drives the expert
# rollouts generated below
expert = ExpertPolicyV1()

In [None]:
# Stop criterion for the rollout: one full episode, no timestep minimum
min_timesteps = None
min_episodes = 1
sample_until = rollout.make_sample_until(
    min_timesteps=min_timesteps, min_episodes=min_episodes
)

# Roll out the environment with the expert policy
rollouts = rollout.rollout(
    expert.predict, venv, sample_until=sample_until, rng=RNG
)

# Print the first observation feature next to the expert's action for every
# step of the first rollout (the final observation has no action)
expert_rollout = rollouts[0]
for step_obs, act in zip(expert_rollout.obs[:-1], expert_rollout.acts):
    state = step_obs[0]
    print(f"State 0: {state: .3f} --> Action: {act}")

In [None]:
# Flatten the trajectories into transitions
transitions = rollout.flatten_trajectories(rollouts)
# Trailing expression: displays the transitions as the cell output
transitions

In [None]:
# Score the expert on a single episode of the vectorized environment;
# with return_episode_rewards=True the per-episode rewards are returned
reward_expert_policy, _ = evaluate_policy(
    model=expert,
    env=venv,
    n_eval_episodes=1,
    return_episode_rewards=True,
)
print("Reward: ", np.mean(reward_expert_policy))
print("Std   : ", np.std(reward_expert_policy))

In [None]:
# Load the expert trajectories from PATH_ROLLOUTS when they are already
# cached on disk; otherwise generate them with the expert policy and cache
# them at that location.
if os.path.exists(PATH_ROLLOUTS):
    rollouts = serialize.load(PATH_ROLLOUTS)
else:
    # Set the parameters for the rollout
    min_timesteps = 45000 * 3 + 4500
    min_episodes = None

    # Rollout the environment with the expert policy
    print("Generating rollouts...")
    rollouts = rollout.rollout(
        expert.predict,
        venv,
        sample_until=rollout.make_sample_until(
            min_timesteps=min_timesteps, min_episodes=min_episodes
        ),
        rng=RNG,
    )

    # Create the parent directory of PATH_ROLLOUTS itself rather than a
    # hard-coded folder, so saving keeps working when the path is changed.
    # A bare file name has no parent directory to create.
    rollouts_dir = os.path.dirname(PATH_ROLLOUTS)
    if rollouts_dir:
        os.makedirs(rollouts_dir, exist_ok=True)
    serialize.save(PATH_ROLLOUTS, rollouts)

# Print the observation/action pairs of the first rollout. The final
# observation has no matching action, so it is left out.
first_rollout = rollouts[0]
for step_obs, step_act in zip(first_rollout.obs[:-1], first_rollout.acts):
    print("Observation: ", step_obs)
    print("Action: ", step_act)
    print()

### Adversarial Inverse Reinforcement Learning Agent

In this section I develop a pipeline for training the adversarial inverse reinforcement learning agent. The goal is to learn the reward function of the expert policy by jointly training the discriminator network and the agent's policy network.

In [None]:
# Parameters for PPO (the generator)

# --- Optimization ---
learning_rate = 0.001  # Learning rate, can be a function of progress
n_epochs = 10  # N of epochs when optimizing the surrogate loss
batch_size = 60  # Mini batch size for each gradient update
max_grad_norm = 0.5  # The maximum value for the gradient clipping

# --- Returns and advantages ---
gamma = 0.5  # Discount factor, focus on the current reward
gae_lambda = 0  # Lambda of the generalized advantage estimation
normalize_advantage = True  # Whether to normalize or not the advantage

# --- Loss terms ---
clip_range = 0.1  # Clipping parameter
clip_range_vf = None  # Clip for the value function (None = no clipping)
ent_coef = 0.01  # Entropy coefficient for the loss calculation
vf_coef = 0.5  # Value function coef. for the loss calculation

# --- Exploration and logging ---
use_sde = False  # Use State Dependent Exploration
sde_sample_freq = -1  # SDE - noise matrix frequency (-1 = disable)
verbose = 0  # Verbosity level: 0 no output, 1 info, 2 debug

In [None]:
# Inputs of the (negative) reward net: the reward is computed from the
# current state and the current action ...
use_state, use_action = True, True
# ... while the next state and the done flag are not used
use_next_state, use_done = False, False

In [None]:
# Parameters for the AIRL trainer
allow_variable_horizon = True
gen_replay_buffer_capacity = None

# Keyword arguments for the discriminator optimizer
disc_opt_kwargs = dict(lr=0.001)
# Keyword arguments for the policy; use_expln fixes the issue with the NaNs
policy_kwargs = dict(use_expln=True)

<font color='orange'>**Warning:**</font> Be careful with the settings below and use multiples of the episode length (otherwise you might run into unexpected issues with variable horizons during training).

In [None]:
# Number of timesteps, batch sizes and number of discriminator updates

# Generator: N steps in the environment per one round; PPO's n_steps is
# kept equal to it
gen_train_timesteps = 3000
n_steps = gen_train_timesteps

# Discriminator
n_disc_updates_per_round = 4  # N discriminator updates per one round
demo_batch_size = 300 * 10  # N samples in the batch of expert data (batch)
demo_minibatch_size = 60  # N samples in minibatch for one discrim. update

# Total number of timesteps in the whole training
total_timesteps = 3000 * 600

In [None]:
# Gather the PPO hyperparameters defined earlier in the notebook
ppo_hyperparams = dict(
    learning_rate=learning_rate,
    n_steps=n_steps,
    batch_size=batch_size,
    n_epochs=n_epochs,
    gamma=gamma,
    gae_lambda=gae_lambda,
    clip_range=clip_range,
    clip_range_vf=clip_range_vf,
    normalize_advantage=normalize_advantage,
    ent_coef=ent_coef,
    vf_coef=vf_coef,
    max_grad_norm=max_grad_norm,
    use_sde=use_sde,
    sde_sample_freq=sde_sample_freq,
)

# Initialize the learner policy: a PPO agent with an MLP policy acting in
# the vectorized environment
learner = PPO(
    policy=MlpPolicy,
    env=venv,
    policy_kwargs=policy_kwargs,
    seed=SEED,
    device=DEVICE,
    verbose=verbose,
    **ppo_hyperparams,
)

In [None]:
# Flags selecting which inputs the reward network uses
reward_inputs = dict(
    use_state=use_state,
    use_action=use_action,
    use_next_state=use_next_state,
    use_done=use_done,
)

# Initialize the custom reward network on the spaces of the environment
reward_net = NegativeRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    **reward_inputs,
)

In [None]:
# Settings of the discriminator side of the AIRL trainer
discriminator_settings = dict(
    demo_batch_size=demo_batch_size,
    demo_minibatch_size=demo_minibatch_size,
    n_disc_updates_per_round=n_disc_updates_per_round,
    disc_opt_kwargs=disc_opt_kwargs,
)

# Settings of the generator side (the PPO learner)
generator_settings = dict(
    gen_algo=learner,
    gen_train_timesteps=gen_train_timesteps,
    gen_replay_buffer_capacity=gen_replay_buffer_capacity,
)

# Initialize the AIRL trainer with the expert rollouts as demonstrations
airl_trainer = AIRL(
    demonstrations=rollouts,
    venv=venv,
    reward_net=reward_net,
    allow_variable_horizon=allow_variable_horizon,
    **discriminator_settings,
    **generator_settings,
)

In [None]:
# Baseline: score the untrained learner on one episode, seeding the
# environment first
venv.seed(SEED)
learner_rewards_before_training, _ = evaluate_policy(
    model=learner,
    env=venv,
    n_eval_episodes=1,
    return_episode_rewards=True,
)
print("Mean: ", np.mean(learner_rewards_before_training))
print("Std: ", np.std(learner_rewards_before_training))

In [None]:
# Print every observation and the deterministic action the untrained policy
# takes for it, over one episode
obs = venv.reset()
while True:
    print(obs)
    action, _ = learner.predict(obs, deterministic=True)
    print(action)
    print()
    obs, _, done, _ = venv.step(action)
    # `done` comes from the vectorized env; its truth value ends the episode
    if done:
        break

In [None]:
# Train the model: run adversarial training for `total_timesteps`
# environment timesteps
airl_trainer.train(total_timesteps=total_timesteps)

In [None]:
# Score the trained learner over five episodes, seeding the environment
# first
venv.seed(SEED)
learner_rewards_after_training, _ = evaluate_policy(
    model=learner,
    env=venv,
    n_eval_episodes=5,
    return_episode_rewards=True,
)
print("Mean: ", np.mean(learner_rewards_after_training))
print("Std: ", np.std(learner_rewards_after_training))

In [None]:
# Print the first observation feature and the deterministic action the
# trained policy takes for it, over one episode
obs = venv.reset()
while True:
    action, _ = learner.predict(obs, deterministic=True)
    print(f"Obs: {obs[0][0]} --> Action: {action}")
    obs, _, done, _ = venv.step(action)
    # `done` comes from the vectorized env; its truth value ends the episode
    if done:
        break

In [None]:
# Access the training log statistics
# NOTE(review): this reaches into the private `_logger` attribute of the
# trainer's logger; confirm the installed imitation version exposes `stats`
# there.
stats = airl_trainer.logger._logger.stats

In [None]:
# Visualize the training statistics taken from the trainer's logger
visualize_airl_train_stats(stats)

### Save the trained model and stats

In [None]:
# Save the model: learner, reward network and training statistics are
# written to the "models" folder in the current working directory
save_path = os.path.join(os.getcwd(), "models")
# NOTE(review): `ts_now` is not defined anywhere in this notebook; confirm
# that the installed AIRL trainer provides this attribute.
ts = airl_trainer.ts_now
print(f"Saving the model with timestamp: {ts}")
save_model(learner, reward_net, stats, save_path, ts)

### Load the trained model

In [None]:
# Pick the timestamp of the model to load.
# One saved model per seed; keep exactly one line active. Note that this
# overwrites any `ts` assigned when saving a freshly trained model.
# ts = "2024-01-21_17-12-35" # seed 1
# ts = "2024-01-22_18-03-01" # seed 2
# ts = "2024-01-23_19-14-27" # seed 3
# ts = "2024-01-24_09-40-47" # seed 4
ts = "2024-01-24_22-39-37"  # seed 5

In [None]:
# Load the model saved under timestamp `ts` from the "models" folder in the
# current working directory; this replaces the in-memory learner, reward
# network and training statistics
load_path = os.path.join(os.getcwd(), "models")
learner, reward_net, stats = load_model(load_path, ts)

In [None]:
# Show the timestamp of the model that was loaded
print(ts)

### Evaluate the trained model

In [None]:
# Visualize the stats of the loaded model; `save_fig` is forwarded to the
# plotting helper to request that the figure be saved
save_fig = True
visualize_airl_train_stats(stats, save_fig=save_fig)

In [None]:
# Score the loaded learner over five episodes, seeding the environment
# first
venv.seed(SEED)
learner_rewards_after_training, _ = evaluate_policy(
    model=learner,
    env=venv,
    n_eval_episodes=5,
    return_episode_rewards=True,
)
print("Mean: ", np.mean(learner_rewards_after_training))
print("Std: ", np.std(learner_rewards_after_training))

In [None]:
# Print the first observation feature (five decimals) and the deterministic
# action the loaded policy takes for it, over one episode
obs = venv.reset()
while True:
    action, _ = learner.predict(obs, deterministic=True)
    print(f"Obs: {obs[0][0]: .5f} --> Action: {action}")
    obs, _, done, _ = venv.step(action)
    # `done` comes from the vectorized env; its truth value ends the episode
    if done:
        break

In [None]:
# Parameters of the backtest (these overwrite the training-time values)

# --- Market ---------------------------------------------------------------
EXCHANGE_NAME = "BIT.COM"
SYMBOL = "SOL-USDT"
EXCHANGE_TRADER_ID = "Exchange"
# Tick size and lot size of the limit order book are looked up per exchange
TICK_SIZE = get_tick_size(EXCHANGE_NAME)
LOT_SIZE = get_lot_size(EXCHANGE_NAME)
# Depth of the data to load to the limit order book (max 20)
DEPTH = 20

# --- Data -----------------------------------------------------------------
# NOTE(review): this points to a different project folder than the PATH set
# at the top of the notebook -- confirm it is not a stale path.
PATH = "~/Projects/thesis-market-making/reinforcement-learning/data/"

# --- Episode --------------------------------------------------------------
# Start and end of the episode
TS_START = pd.Timestamp("2023-09-11 00:00:00")
TS_END = pd.Timestamp("2023-09-13 23:59:59")
# Maximum number of steps in an episode (None here)
MAX_STEPS = None
# Window size for the features computation
WIN = 0

# --- Simulation -----------------------------------------------------------
# Indicates whether to log events
LOGGING = False
# Latency compensation parameters for the stochastic backtest: the same
# settings for each of the keys 0-3
LATENCY_COMP_PARAMS = {
    level: {"prob": 0.9, "divisor": 1} for level in range(4)
}
# Random number generator seeded with SEED
RNG = np.random.default_rng(seed=SEED)

In [None]:
# Initialize the limit order book and traders
start = time.time()

# AIRL agent: an RL market maker that trades with the learner's policy
rl_trader_id = "RLMarketMaker"
com_model = BitCommissions(tier=5)
volume = 100
trader = RLMarketMaker(
    id=rl_trader_id,
    com_model=com_model,
    volume=volume,
    policy=learner.policy,
)
traders = {rl_trader_id: trader}

description = "AIRL agent."

# Initialize the exchange with the backtest parameters
exchange_kwargs = dict(
    exchange_name=EXCHANGE_NAME,
    symbol_name=SYMBOL,
    tick_size=TICK_SIZE,
    lot_size=LOT_SIZE,
    depth=DEPTH,
    traders=traders,
    max_steps=MAX_STEPS,
    ts_start=TS_START,
    ts_end=TS_END,
    win=WIN,
    path=PATH,
    rl_trader_id=rl_trader_id,
    latency_comp_params=LATENCY_COMP_PARAMS,
    logging=LOGGING,
    ts_save=TS_SAVE,
    description=description,
    rng=RNG,
)
exchange = Exchange(**exchange_kwargs)
end = round(time.time() - start, 2)  # elapsed seconds, despite the name
print(f"Time taken for initialization of the exchange: {end} sec.")

# Run the exchange simulation and report how long it took
start = time.time()
exchange.run()
end = round(time.time() - start, 2)
print(f"Time taken for running the exchange: {end} sec.")

In [None]:
# Collect the inputs for the backtest visualization
timestamps = exchange.stats["ts"]  # "ts" entry of the exchange statistics
trader_stats = traders[rl_trader_id].stats  # Statistics of the RL trader
# NOTE(review): 20.5 is an unexplained constant (presumably a reference
# price of the traded asset) -- confirm its meaning and consider naming it.
initial_cost = 20.5 * volume * 2

In [None]:
# Visualize the backtest from the exchange timestamps, the RL trader's
# statistics and the initial cost
visualize_backtest(timestamps, trader_stats, initial_cost)