### Import external modules

In [None]:
import gym
import matplotlib.pyplot as plt
import numpy as np

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecMonitor

### Add mbt-gym to path

In [None]:
# Extend the import search path with the parent directory so that the
# `mbt_gym` package imported in the following cell can be resolved
# (presumably the repository root when run from the notebook's own folder).
import sys
sys.path.append("../")

In [None]:
from mbt_gym.agents.BaselineAgents import CarteaJaimungalMmAgent
from mbt_gym.gym.helpers.generate_trajectory import generate_trajectory
from mbt_gym.gym.StableBaselinesTradingEnvironment import StableBaselinesTradingEnvironment
from mbt_gym.gym.TradingEnvironment import TradingEnvironment
from mbt_gym.gym.ModelDynamics import LimitOrderModelDynamics
from mbt_gym.gym.wrappers import *
from mbt_gym.rewards.RewardFunctions import CjCriterion, CjMmCriterion
from mbt_gym.stochastic_processes.midprice_models import *
from mbt_gym.stochastic_processes.fill_probability_models import *
from mbt_gym.stochastic_processes.arrival_models import *

### Add parameters for limit order market making environment

In [None]:
# Coefficients passed as CjMmCriterion(phi, alpha) when the environments are
# built (presumably running / terminal inventory penalties -- confirm against
# mbt_gym.rewards.RewardFunctions).
phi = 0.5
alpha = 0.001

# Episode horizon and order-arrival intensity shared by the environments.
terminal_time = 1.0
arrival_rate = 10.0

# Time discretisation: ten steps per (terminal_time * arrival_rate), i.e. 100.
n_steps = int(10 * terminal_time * arrival_rate)

In [None]:
def get_cj_env_Poisson(num_trajectories:int = 1):
    """Build a limit-order market-making environment with Poisson arrivals.

    The environment combines a Brownian-motion midprice, a Poisson arrival
    model whose two intensities both equal the module-level ``arrival_rate``,
    and an exponential fill function. The horizon, step count and reward
    coefficients come from the module-level ``terminal_time``, ``n_steps``,
    ``phi`` and ``alpha``.

    Args:
        num_trajectories: Number of trajectories forwarded to every model
            component and to the environment itself.

    Returns:
        A ``TradingEnvironment`` using the ``CjMmCriterion(phi, alpha)`` reward.
    """
    fill_exponent = 1
    sigma = 0.1
    # NOTE(review): presumably the range the starting inventory is drawn
    # from -- confirm against TradingEnvironment.
    initial_inventory = (-4, 5)
    initial_price = 100
    max_depth = 5
    step_size = terminal_time / n_steps

    midprice_model = BrownianMotionMidpriceModel(
        volatility=sigma,
        terminal_time=terminal_time,
        step_size=step_size,
        initial_price=initial_price,
        num_trajectories=num_trajectories,
    )
    arrival_model = PoissonArrivalModel(
        intensity=np.array([arrival_rate, arrival_rate]),
        step_size=step_size,
    )
    fill_probability_model = ExponentialFillFunction(
        fill_exponent=fill_exponent,
        step_size=step_size,
        num_trajectories=num_trajectories,
    )
    model_dynamics = LimitOrderModelDynamics(
        midprice_model=midprice_model,
        arrival_model=arrival_model,
        fill_probability_model=fill_probability_model,
        num_trajectories=num_trajectories,
        max_depth=max_depth,
    )
    return TradingEnvironment(
        terminal_time=terminal_time,
        n_steps=n_steps,
        initial_inventory=initial_inventory,
        model_dynamics=model_dynamics,
        reward_function=CjMmCriterion(phi, alpha),
        # Inventory bound is set to the number of steps in an episode.
        max_inventory=n_steps,
        num_trajectories=num_trajectories,
    )

In [None]:
def get_cj_env_Hawkes(num_trajectories:int = 1):
    """Build a limit-order market-making environment with Hawkes arrivals.

    The environment combines a Brownian-motion midprice, a
    ``HawkesArrivalModel`` left at its default parameters apart from the
    trajectory count and step size, and an exponential fill function. The
    horizon, step count and reward coefficients come from the module-level
    ``terminal_time``, ``n_steps``, ``phi`` and ``alpha``.

    Args:
        num_trajectories: Number of trajectories forwarded to every model
            component and to the environment itself.

    Returns:
        A ``TradingEnvironment`` using the ``CjMmCriterion(phi, alpha)`` reward.
    """
    fill_exponent = 1
    sigma = 0.1
    # NOTE(review): presumably the range the starting inventory is drawn
    # from -- confirm against TradingEnvironment.
    initial_inventory = (-4, 5)
    initial_price = 100
    max_depth = 5
    step_size = terminal_time / n_steps

    midprice_model = BrownianMotionMidpriceModel(
        volatility=sigma,
        terminal_time=terminal_time,
        step_size=step_size,
        initial_price=initial_price,
        num_trajectories=num_trajectories,
    )
    arrival_model = HawkesArrivalModel(
        num_trajectories=num_trajectories,
        step_size=step_size,
    )
    fill_probability_model = ExponentialFillFunction(
        fill_exponent=fill_exponent,
        step_size=step_size,
        num_trajectories=num_trajectories,
    )
    model_dynamics = LimitOrderModelDynamics(
        midprice_model=midprice_model,
        arrival_model=arrival_model,
        fill_probability_model=fill_probability_model,
        num_trajectories=num_trajectories,
        max_depth=max_depth,
    )
    return TradingEnvironment(
        terminal_time=terminal_time,
        n_steps=n_steps,
        initial_inventory=initial_inventory,
        model_dynamics=model_dynamics,
        reward_function=CjMmCriterion(phi, alpha),
        # Inventory bound is set to the number of steps in an episode.
        max_inventory=n_steps,
        num_trajectories=num_trajectories,
    )

In [None]:
# Number of trajectories simulated by the training environment.
num_trajectories = 1000

# Hawkes environment with the state cut down to components 1, 2, 4 and 5
# (presumably inventory, time and the two arrival intensities, matching the
# 4-element states queried later in this notebook -- TODO confirm).
env = ReduceStateSizeWrapper(
    get_cj_env_Hawkes(num_trajectories),
    [1, 2, 4, 5],
)

# Adapter exposing the trading environment to Stable-Baselines3.
sb_env = StableBaselinesTradingEnvironment(trading_env=env)

In [None]:
# Cartea-Jaimungal baseline agent. Note that it is built on the *Poisson*
# environment, whereas the PPO agent is trained on the Hawkes one.
cj_agent = CarteaJaimungalMmAgent(
    env=get_cj_env_Poisson(num_trajectories),
    max_inventory=10,
)

In [None]:
# Output locations: tensorboard event files and the best-model checkpoint.
tensorboard_logdir = "./tensorboard/PPO-learning-Hawkes/"
best_model_path = "./SB_models/PPO-best-Hawkes"

# Wrap the vectorised environment so episode statistics get recorded.
sb_env = VecMonitor(sb_env)

### Define PPO policy

In [None]:
# Separate policy ("pi") and value-function ("vf") network layer sizes.
policy_kwargs = {"net_arch": [{"pi": [64, 64], "vf": [128, 128]}]}

# Keyword arguments for the PPO constructor.
PPO_params = dict(
    policy="MlpPolicy",
    env=sb_env,
    verbose=1,
    policy_kwargs=policy_kwargs,
    tensorboard_log=tensorboard_logdir,
    # Original note: the batch size was 256 before.
    batch_size=int(n_steps * num_trajectories / 20),
    n_steps=int(n_steps),
)

# Keyword arguments for the evaluation callback.
callback_params = {
    "eval_env": sb_env,
    # Original note: n_eval_episodes was 200 before.
    "n_eval_episodes": 2048,
    "eval_freq": 200,
    "best_model_save_path": best_model_path,
    "deterministic": True,
}

# NOTE(review): no visible cell passes `callback` to `model.learn`, so the
# periodic evaluation / best-model checkpointing only runs if it is supplied
# there.
callback = EvalCallback(**callback_params)

In [None]:
# Instantiate PPO on the CPU with the parameters assembled in PPO_params.
model = PPO(device="cpu", **PPO_params)

In [None]:
# Train for a (very short) budget of 1000 environment timesteps.
# NOTE(review): the EvalCallback built earlier is not passed in here.
model.learn(total_timesteps = 1_000)

## Comparing the learnt policy to the optimal policy

In [None]:
from mbt_gym.agents.SbAgent import SbAgent

In [None]:
# Wrap the trained PPO model in the mbt_gym agent interface (`get_action`).
agent = SbAgent(model)

In [None]:
# Query the learnt policy at time 0.5 for each inventory level and each
# (symmetric) pair of arrival intensities; states are
# [inventory, time, intensity, intensity].
inventories = np.arange(-3, 4, 1)
intensities = [5, 10, 20]
bid_actions = {}
ask_actions = {}
for intensity in intensities:
    # Each prediction is flattened to a length-2 array: (bid, ask).
    quotes = [
        np.reshape(model.predict([q, 0.5, intensity, intensity], deterministic=True)[0], 2)
        for q in inventories
    ]
    bid_actions[intensity] = [bid for bid, _ in quotes]
    ask_actions[intensity] = [ask for _, ask in quotes]

In [None]:
#cj_agent.get_action(np.array([[0,inventory,0.5]]))

In [None]:
# Cartea-Jaimungal quotes for each inventory level, evaluated on the
# single-row state [0, inventory, 0.5].
cj_quotes = [
    cj_agent.get_action(np.array([[0, q, 0.5]]))[0, :].reshape(-1)
    for q in inventories
]
# Each quote is unpacked as a (bid, ask) pair.
cj_bid_actions = [bid for bid, _ in cj_quotes]
cj_ask_actions = [ask for _, ask in cj_quotes]

In [None]:
#bid_actions

In [None]:
# One colour per intensity; bids are drawn solid, asks dashed.
colors = ["k", "r", "b"]

for idx, intensity in enumerate(intensities):
    line_color = colors[idx]
    plt.plot(
        inventories,
        bid_actions[intensity],
        color=line_color,
        label=f"bid - lambda = {intensity}",
    )
    plt.plot(
        inventories,
        ask_actions[intensity],
        color=line_color,
        linestyle="--",
        label=f"ask - lambda = {intensity}",
    )
plt.legend()

In [None]:
# 101 evenly spaced time points on [0, 1] (spacing 0.01). np.linspace fixes the
# number of points explicitly, whereas np.arange with a float step derives the
# length from a rounding-sensitive division.
timestamps = np.linspace(0, 1, 101)

In [None]:
# Learnt quotes as a function of time, stored as
# actions[intensity][inventory] -> list with one entry per timestamp.
# Original note: loop over intensities? (currently just using 10 below)
intensities = [5, 10, 15, 20]

bid_actions = {level: {q: [] for q in inventories} for level in intensities}
ask_actions = {level: {q: [] for q in inventories} for level in intensities}

for intensity in intensities:
    for inventory in inventories:
        bid_path = bid_actions[intensity][inventory]
        ask_path = ask_actions[intensity][inventory]
        for timestamp in timestamps:
            # Single-row state: [inventory, time, intensity, intensity].
            state = np.array([[inventory, timestamp, intensity, intensity]])
            bid_action, ask_action = agent.get_action(state)[0]
            bid_path.append(bid_action)
            ask_path.append(ask_action)

In [None]:
#agent.get_action(state)

In [None]:
#model.predict(state, deterministic=True)

In [None]:
# One panel per intensity, each showing the bid action over time for every
# inventory level.
fig, axs = plt.subplots(1, len(intensities), figsize=(18, 5))

for ax, intensity in zip(axs, intensities):
    for inventory in inventories:
        ax.plot(timestamps, bid_actions[intensity][inventory], label=f"bid: q = {inventory}")
    ax.set_title('intensity: %d' % intensity)
    ax.set_xlabel("Time")
    ax.set_ylabel("Bid action")

# plt.legend() acts on the current axes only (the last panel here).
plt.legend()
plt.tight_layout()

plt.show()

In [None]:
# `ask_actions` is keyed by intensity first and inventory second, so an
# intensity must be selected before indexing by inventory; indexing by
# inventory directly raises KeyError. Intensity 10 is used, as the comment in
# the cell that fills the dictionary indicates.
plot_intensity = 10
for inventory in inventories:
    plt.plot(timestamps, ask_actions[plot_intensity][inventory], label=f"ask: q = {inventory}")
plt.legend()
plt.show()

In [None]:
# Persist the trained PPO model to disk under the name "trained_model".
model.save("trained_model")

In [None]:
# Load a PPO model back from the "trained_model.zip" archive.
loaded = PPO.load("trained_model.zip")

In [None]:
# Notebook display of the reloaded model object.
loaded

In [None]:
# Notebook display of the collected ask actions.
ask_actions