# Construct a custom Environment for Financial Trading

Some examples on the market
* [custom env example](https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/sb3/5_custom_gym_env.ipynb#scrollTo=RqxatIwPOXe_)
* [StockTradingEnv by Adam King](https://github.com/notadamking/Stock-Trading-Environment)
* [FinRL](https://github.com/AI4Finance-Foundation/FinRL)

Target is to construct a custom Env for pair trading

This env gives the RL learner freedom to operate whatever it wants. Even long n short simultaneously.

In [1]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from datetime import date
from sklearn.model_selection import train_test_split

from utils.read2df import read2df

Define data parameters

In [2]:
symbols = ['BTCUSDT', 'ETHUSDT', 'LTCUSDT', 'XMRUSDT', 'BNBUSDT', 'ADAUSDT', 'DOGEUSDT', 'SOLUSDT', 'TRXUSDT']
start_date = '2018-01-01'

# freqs = {'1h':60, '2h':120, '4h':240, '6h':360, '8h':480, '12h':720, '1d':1440}
freqs = {'1m':1, '3m':3, '5m':5, '15m':15, '30m':30}

Download data from `binance-public-data`

In [3]:
%%capture
if symbols is None:
    !python binance-public-data/python/download-kline.py -i {" ".join(list(freqs.keys()))} -startDate {start_date} -t spot -skip-daily 1
else:
    !python binance-public-data/python/download-kline.py -s {" ".join(symbols)} -i {" ".join(list(freqs.keys()))} -startDate {start_date} -t spot -skip-daily 1

In [4]:
# dfs = read2df(symbols, freqs)
dfs = read2df(symbols, freqs)

df0 = dfs[0][dfs[0]['tic']=='BTCUSDT'].reset_index(drop=True)
df1 = dfs[0][dfs[0]['tic']=='ETHUSDT'].reset_index(drop=True)

Set data before `trade_data` as training data, after `trade_data` is trade_data

In [5]:
trade_date = '2023-01-01'

train0 = df0[df0['datetime'] < trade_date]
train1 = df1[df1['datetime'] < trade_date]

trade0 = df0[df0['datetime'] >= trade_date]
trade1 = df1[df1['datetime'] >= trade_date]

In [6]:
# Don't use custom observation & action spaces
# See the warning on https://gymnasium.farama.org/api/spaces/

'''
class PairTradingActionSpace(gym.Space):
  def __init__(self, low=-1.0, high=1.0, shape=(2, ), dtype=np.float32):
    super().__init__(shape, dtype)
    self.low = low
    self.high = high

  def sample(self):
    action = np.random.uniform(self.low, self.high, self.shape)
    # Normalize the action so that the summation of action[0] and action[1] is within -1 and 1.
    action = action / np.linalg.norm(action)
    return action

  def contains(self, x):
    return np.all(self.low <= x) and np.all(x <= self.high) and np.linalg.norm(x) <= 1.0
'''

'\nclass PairTradingActionSpace(gym.Space):\n  def __init__(self, low=-1.0, high=1.0, shape=(2, ), dtype=np.float32):\n    super().__init__(shape, dtype)\n    self.low = low\n    self.high = high\n\n  def sample(self):\n    action = np.random.uniform(self.low, self.high, self.shape)\n    # Normalize the action so that the summation of action[0] and action[1] is within -1 and 1.\n    action = action / np.linalg.norm(action)\n    return action\n\n  def contains(self, x):\n    return np.all(self.low <= x) and np.all(x <= self.high) and np.linalg.norm(x) <= 1.0\n'

# Define the custom Environment

In [7]:
# The lookback period for the observation space
PERIOD = 30
CASH = 10000

class PairTradingEnv(gym.Env):
    metadata = {'render.modes': ['console']}

    # for pair trading, we need to feed in two OHLCV dataframes
    def __init__(self, df0, df1, tc=0.001):
        super().__init__()

        if not df0['time'].equals(df1['time']):
            raise ValueError("Two dataframe must have same time index")

        self.tic0 = df0['tic'].iloc[0]
        self.tic1 = df1['tic'].iloc[0]

        # transaction cost
        self.tc = tc

        # get two datasets
        self.df0 = df0[['time', 'open', 'high', 'low', 'close', 'volume']]
        self.df1 = df1[['time', 'open', 'high', 'low', 'close', 'volume']]

        self.reward_range = (-np.inf, np.inf)

        # -1 means short 100%, 1 means long 100%, 0 means do nothing
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2, ), dtype=np.float32)

        # The data requires to be at least [time, open, high, low, close, volume]
        # Let's assume that we feed in previous 30 period data into the observation_space
        self.observation_space = spaces.Box(low=0.0, high=np.inf, shape=(2*PERIOD*6,), dtype=np.float64)

        # if the length is 35, then the index shall be 0~34
        self.max_steps = len(df0)-1

    def _next_observation(self):
        # The current step is always higher than the PERIOD as defined in the 

        obs_df0 = self.df0.iloc[self.current_step-PERIOD: self.current_step]
        obs_df1 = self.df1.iloc[self.current_step-PERIOD: self.current_step]

        obs = np.array([obs_df0, obs_df1]).flatten()

        return obs

    def _take_action(self, action):
        self.action = action

        current_price0 = self.df0['close'].iloc[self.current_step]
        current_price1 = self.df1['close'].iloc[self.current_step]

        # evaluate purchasing power 
        max_amount0 = self.net_worth/current_price0
        max_amount1 = self.net_worth/current_price1

        curr_holding0 = self.holding0/max_amount0
        curr_holding1 = self.holding1/max_amount1

        # clip the action to the summation of [-1, 1]
        if sum(self.action) > 1:
            action0 = self.action[0]/(sum(self.action)+self.tc)
            action1 = self.action[1]/(sum(self.action)+self.tc)
            self.action = [action0, action1]
        elif sum(self.action) < -1:
            action0 = self.action[0]/(sum(self.action)-self.tc)
            action1 = self.action[1]/(sum(self.action)-self.tc)

        # if curr_h is -70%, action is -40%, then we need to clip the action to -30%
        if curr_holding0 + self.action[0] > 1:
            self.action[0] = 1 - curr_holding0
        elif curr_holding0 + self.action[0] < -1:
            self.action[0] = -1 - curr_holding0

        if curr_holding1 + self.action[1] > 1:
            self.action[1] = 1 - curr_holding1
        elif curr_holding0 + self.action[0] < -1:
            self.action[1] = -1 - curr_holding1

        self.holding0 += self.action[0]*max_amount0
        self.holding1 += self.action[1]*max_amount1
        self.cash -= self.cash*sum(action)*(1+self.tc)

        # We record the net_worth from previous period to prev_net_worth
        self.prev_net_worth = self.net_worth
        self.net_worth = self.cash + self.holding0*current_price0 + self.holding1*current_price1

    def step(self, action):
        self._take_action(action)
        self.current_step += 1

        observation = self._next_observation()
        reward = self.net_worth - self.prev_net_worth
        terminated = bool(self.current_step >= self.max_steps)
        truncated = bool(self.net_worth <= 0)
        info = {}

        return observation, reward, terminated, truncated, info

    def reset(self, seed=None):
        np.random.seed(seed)
        
        self.cash = CASH
        self.net_worth = CASH
        self.max_net_worth = CASH
        self.holding0 = 0
        self.holding1 = 0

        self.current_step = np.random.randint(PERIOD, self.max_steps)

        return self._next_observation(), {}
    
    def render(self):
        profit = self.net_worth - CASH

        print(f"Current profit is {profit}, cash is {self.cash}, net worth is {self.net_worth}")
        print(f"Actions for this step is {self.tic0} for {self.action[0]} and {self.tic1} for {self.action[1]}")
        print(f"Current holding is {self.holding0} of {self.tic0} and {self.holding1} of {self.tic1}")

## Check with baselin3 `env_checker`

Check if the env meets the requirements of `stable_baseline3`

In [8]:
from stable_baselines3.common.env_checker import check_env

env = PairTradingEnv(train0, train1)
check_env(env)

## Do a test run with random generated actions

In [14]:
import random

env = PairTradingEnv(train0, train1)

obs, _ = env.reset()

print(f"observation_space: {env.observation_space}")
print(f"action_space: {env.action_space}")
print(f"action_space.sample: {env.action_space.sample()}")

n_steps = 100

for step in range(n_steps):
    print(f"Step {step + 1}")
    obs, reward, terminated, truncated, info = env.step(action=[random.uniform(-1, 1) for _ in range(2)])
    done = terminated or truncated
    env.render()
    if done:
        print("Test Finished!")
        break

observation_space: Box(0.0, inf, (360,), float64)
action_space: Box(-1.0, 1.0, (2,), float32)
action_space.sample: [-0.5906395   0.37275356]
Step 1
Current profit is 0.8161022665972268, cash is 10816.91836886226, net worth is 10000.816102266597
Actions for this step is BTCUSDT for -0.8730406171291096 and ETHUSDT for 0.7914303904695432
Current holding is -0.1875142008291285 of BTCUSDT and 2.0178071925143177 of ETHUSDT
Step 2
Current profit is -67.55682499692921, cash is 9932.44317500307, net worth is 9932.44317500307
Actions for this step is BTCUSDT for -0.12597763228256909 and ETHUSDT for 0.20766370701531855
Current holding is -0.21454164991089947 of BTCUSDT and 2.5466550130038392 of ETHUSDT
Step 3
Current profit is -69.73613432453749, cash is 9930.263865675463, net worth is 9930.263865675463
Actions for this step is BTCUSDT for 0.007790137182814849 and ETHUSDT for -0.007570943158036414
Current holding is -0.21288326011071215 of BTCUSDT and 2.5275193079921907 of ETHUSDT
Step 4
Current 

## PPO model from stable_baselines3

Train with training data

In [15]:
from stable_baselines3 import PPO

env = PairTradingEnv(train0, train1)

model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100000)
model.save("ppo_pairtrading")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 5.78     |
|    ep_rew_mean     | -3.4e+04 |
| time/              |          |
|    fps             | 2989     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 5.69          |
|    ep_rew_mean          | -7.85e+04     |
| time/                   |               |
|    fps                  | 2170          |
|    iterations           | 2             |
|    time_elapsed         | 1             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00015644016 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2       

## Use the model on Trade data

In [16]:
del model
model = PPO.load("ppo_pairtrading")

In [19]:
env = PairTradingEnv(trade0, trade1)

env.reset()
while True:
    action, _states = model.predict(obs)
    observation, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    env.render()
    if done:
        print("Test Finished!")
        break

Current profit is -2071.056590032498, cash is -2062.7652072906476, net worth is 7928.943409967502
Actions for this step is BTCUSDT for 0.17003258754051856 and ETHUSDT for 0.8291382741852962
Current holding is 0.0595668802747814 of BTCUSDT and 4.488014691522348 of ETHUSDT
Current profit is 2401.290374679047, cash is -1138.599144590546, net worth is 12401.290374679047
Actions for this step is BTCUSDT for 0.7855759263038635 and ETHUSDT for -0.3380005955696106
Current holding is 0.27779943618792036 of BTCUSDT and 3.0372946900030073 of ETHUSDT
Current profit is 12649.202002792757, cash is 519.9178915542943, net worth is 22649.202002792757
Actions for this step is BTCUSDT for 0.3606212961901629 and ETHUSDT for 0.3318531481411479
Current holding is 0.4344834048000182 of BTCUSDT and 5.264379230668176 of ETHUSDT
Current profit is 23336.351444272805, cash is 268.5045000361299, net worth is 33336.351444272805
Actions for this step is BTCUSDT for -0.08750295639038086 and ETHUSDT for 0.570583522319