In [None]:
import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import tensorboard
import torch as th
from scipy import stats

from copy import deepcopy

from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env
import stable_baselines3

from stochastic.processes.continuous import BrownianMotion, GeometricBrownianMotion, BesselProcess, BrownianBridge, BrownianMeander
from stochastic.processes.diffusion import ConstantElasticityVarianceProcess

import sys
sys.path.append("../") # <-- Path to the main repo

from main.agents.Agent import Agent
from main.agents.AvellanedaStoikovAgent import AvellanedaStoikovAgent
from main.agents.BaselineAgents import RandomAgent, FixedSpreadAgent
from main.agents.SBAgent import SBAgent
from main.gym.ModelBasedEnvironment import ModelBasedEnvironment
from main.gym.models import *
from main.gym.wrappers import *
from main.gym.AvellanedaStoikovEnvironment import AvellanedaStoikovEnvironment
from main.gym.helpers.generate_trajectory import generate_trajectory
from main.rewards.RewardFunctions import PnL#InventoryAdjustedPnL
from main.gym.helpers.plotting import plot_stable_baselines_actions

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

### Learning Inventory-neutral behaviour with SAC

In [None]:
# Learning-rate schedule factory: the rate is proportional to the progress value it is given
def linear_schedule(initial_value):
    """Return a callable that maps ``progress`` to ``progress * initial_value``.

    Stable-Baselines3 evaluates learning-rate schedules on the *remaining*
    training progress, which runs from 1 (start of training) down to 0 (end),
    so the resulting rate decays linearly from ``initial_value`` to zero.
    """
    def scaled_rate(progress):
        return progress * initial_value

    return scaled_rate


# NOTE(review): the previous comment called 3e-5 "the default SB value"; the SB3
# default learning rate for SAC is 3e-4 — confirm which of the two is intended.
schedule = linear_schedule(3e-5)

In [None]:
# Directory handed to SAC as `tensorboard_log` (see sac_params below).
tensorboard_logdir = "./tensorboard/SAC-learning-AS-pnl/"
# Directory handed to the evaluation callback as `best_model_save_path`; the
# checkpoint is read back later from "<best_model_path>/best_model".
# NOTE(review): the folder name says "PPO" although this notebook trains SAC —
# confirm whether the name is simply stale.
best_model_path = "./SB_models/PPO-best-PnL"

In [None]:
# Simulation horizon and discretisation shared by every environment in this notebook.
terminal_time = 1.0
n_steps = 1000
arrival_rate = 50.0
# n_steps intervals -> n_steps + 1 grid points, both endpoints included.
timestamps = np.linspace(start=0, stop=terminal_time, num=n_steps + 1)
env_params = {
    "terminal_time": terminal_time,
    "n_steps": n_steps,
    "arrival_rate": arrival_rate,
}

# Single environment and its wrapped view; `reduced_env` is the one later passed
# to the evaluation callback.
as_env = AvellanedaStoikovEnvironment(**env_params)
reduced_env = ReduceStateSizeWrapper(as_env)

# Vectorised training environments are created through the gym registry; the
# entry point names the class via the notebook's own `__main__` namespace.
# NOTE(review): depending on the gym version, re-running this cell may warn or
# fail because "as-env-v0" is already registered — confirm.
n_envs = 6
gym.envs.register(
    id="as-env-v0",
    entry_point="__main__:AvellanedaStoikovEnvironment",
    kwargs=env_params,
)
vec_env = make_vec_env(
    env_id="as-env-v0",
    n_envs=n_envs,
    wrapper_class=ReduceStateSizeWrapper,
)

In [None]:
# Network layout forwarded to the SAC policy: layer sizes [64, 64] for both the
# actor ("pi") and the Q-function ("qf") networks.
policy_kwargs = {
    "net_arch": {
        "pi": [64, 64],
        "qf": [64, 64],
    },
}

In [None]:
# Keyword arguments for the SAC constructor.
sac_params = dict(
    policy="MlpPolicy",
    env=vec_env,
    verbose=1,
    policy_kwargs=policy_kwargs,
    tensorboard_log=tensorboard_logdir,
    batch_size=2048,  # 256 before
    learning_rate=schedule,
)
# Keyword arguments for the evaluation callback.
callback_params = {
    "eval_env": reduced_env,
    "n_eval_episodes": 2048,  # 200 before
    "best_model_save_path": best_model_path,
    "deterministic": True,
}

In [None]:
# Evaluation callback configured by `callback_params` (evaluation environment,
# number of evaluation episodes, checkpoint directory, deterministic actions).
callback = EvalCallback(**callback_params)
# SAC learner configured by `sac_params` (policy type, vectorised environment,
# network layout, TensorBoard directory, batch size, learning-rate schedule).
model = SAC(**sac_params)

In [None]:
# Rebind the model's `learning_rate` attribute to a linear schedule whose initial
# value is 0.00001 * 0.01 (about 1e-7); the constructor received a schedule
# starting at 0.00003 via `sac_params`.
# NOTE(review): this only changes the attribute. If SB3 already derived its
# internal schedule from the constructor argument, the optimiser may never see
# this value — confirm, or pass the intended schedule through `sac_params`.
model.learning_rate = linear_schedule(0.00001*0.01)
#model.batch_size = 256*2

In [None]:
# Train for 1,000,000 timesteps with the evaluation callback attached; the
# callback was configured with `best_model_path` as its checkpoint directory.
model.learn(total_timesteps = 1_000_000, callback=callback)

### Plotting the agent's action against their inventory

In [None]:
# Read back the checkpoint stored under the evaluation callback's save directory.
best_model = SAC.load(f"{best_model_path}/best_model")

In [None]:
# Query the policy 10,000 times at the fixed observation [0, 0.5] (inventory 0,
# time 0.5) and plot how the first action component — the one plotted as "bid"
# further down — is distributed. predict() is called without deterministic=True,
# so the actions are sampled rather than being the policy's deterministic output.
actions = np.array([best_model.predict([0,0.5])[0][0] for _ in range(10000)])
plt.hist(actions, bins = 100, density=True)
# Label the figure so it can be read without the surrounding code.
plt.xlabel("bid action")
plt.ylabel("density")
plt.title("Sampled bid action at inventory = 0, time = 0.5")
plt.show()

In [None]:
# Deterministic action at time 0 for a handful of inventory levels. The list is
# the cell's last expression, so the notebook displays it.
inventories = list(range(-3, 4))
[best_model.predict([level, 0.0], deterministic=True)[0] for level in inventories]

In [None]:
# Summary statistics of `actions`. Each label is paired with the matching numpy
# reduction; previously the median was printed under "Mean" and the mean under
# "Median".
print(f"Mean action: {np.mean(actions)}, Median action: {np.median(actions)}")

In [None]:
# Reload the best checkpoint and build the Avellaneda-Stoikov reference agent.
best_model = SAC.load(best_model_path+"/best_model")
#best_model = model
inventories = np.arange(-10,11,1)#[-3,-2,-1,0,1,2,3]
as_agent = AvellanedaStoikovAgent(risk_aversion=0)

# --- Figure 1: deterministic actions as a function of inventory -------------
# The AS agent is queried with the 4-component state [100, 0, inventory, 0.0];
# the learned policy with the 2-component observation [inventory, 0.6].
as_actions = np.array([as_agent.get_action([100,0,inventory,0.0]) for inventory in inventories])
actions = np.array([best_model.predict([inventory,0.6 ], deterministic=True)[0] for inventory in inventories])
plt.plot(inventories, actions[:,0], label="bid")
plt.plot(inventories, actions[:,1], label="ask")
plt.plot(inventories, as_actions[:,0], label="optimal AS action")
plt.xlabel("inventory")
plt.ylabel("action")
plt.title("Actions by inventory (learned policy queried at time 0.6)")
plt.legend()
plt.show()

# --- Figure 2: deterministic actions over time, one bid/ask pair per inventory
# `actions` is rebound to a dict here: inventory level -> array with one row per
# timestamp.
actions = {}
for inventory in inventories:
    actions[inventory] = np.array([best_model.predict([inventory,ts], deterministic=True)[0] for ts in timestamps])    
    plt.plot(timestamps, actions[inventory][:,0], label = "bid"+str(inventory))
    plt.plot(timestamps, actions[inventory][:,1], label = "ask"+str(inventory))
# NOTE(review): `ts` is not used in the state below, so the AS reference is the
# same value at every timestamp — confirm whether the state was meant to carry
# the current time.
as_actions = np.array([as_agent.get_action([0,0,0,0]) for ts in timestamps])
plt.plot(timestamps, as_actions[:,0], label="AS-action")
plt.xlabel("time")
plt.ylabel("action")
plt.title("Actions over time for each inventory level")
# Two lines per inventory level plus the reference give dozens of legend
# entries; put the legend beside the axes so it does not cover the curves.
plt.legend(bbox_to_anchor=(1.02, 1.0), loc="upper left", ncol=2, fontsize="small")
plt.show()

In [None]:
# Histogram of the first action component ("bid") sampled from the policy at one
# fixed observation.
# This cell used to rely on `inventory` leaking out of the plotting loop in the
# previous cell, which leaves it at the last entry of `inventories`; bind that
# value explicitly so the dependency is visible.
inventory = inventories[-1]
# NOTE(review): the second observation component is 3, while the time grid used
# elsewhere runs from 0 to terminal_time = 1.0 — confirm this value is intended.
actions = np.array([best_model.predict([inventory,3])[0][0] for _ in range(10000)])
plt.hist(actions, bins = 100, density=True)
# Label the figure so it can be read without the surrounding code.
plt.xlabel("bid action")
plt.ylabel("density")
plt.title(f"Sampled bid action at inventory = {inventory}, time = 3")
plt.show()