# Construct a custom Environment for Pair Trading

Some existing examples of custom trading environments:
* [custom env example](https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/sb3/5_custom_gym_env.ipynb#scrollTo=RqxatIwPOXe_)
* [StockTradingEnv by Adam King](https://github.com/notadamking/Stock-Trading-Environment)
* [FinRL](https://github.com/AI4Finance-Foundation/FinRL)

The goal is to construct a custom environment (Env) for pair trading.

This env restricts the behaviour of the RL learner to pair trading only.

In [1]:
import os
import pickle

from stable_baselines3 import PPO, A2C, DQN

from params import *
from utils.read2df import read2df, unify_dfs
from utils.clearlogs import clear_logs
from envs.env_rl_restrict_thres2 import RL_Restrict_TradeEnv

# Output directory for this notebook's artefacts (the saved model and the
# net-worth CSV are both written below this path).
folder_path = "result/rl-restrict-thres"
os.makedirs(folder_path, exist_ok=True)

Load data from `preliminaries.ipynb`

In [2]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load pickle files that this project produced itself.
with open('result/cointncorr.pickle', 'rb') as pk:
    data = pickle.load(pk)

# Best profit and the matching parameter dict (read from the gridsearch folder).
with open('result/gridsearch/best_res.pickle', 'rb') as pk:
    best_profit, best_params = pickle.load(pk)

# data[0] is passed on as the symbols; data[1] is used as a key into `freqs`.
pair_symbols = data[0]
pair_freq_key = data[1]

dfs = read2df(
    symbols=pair_symbols,
    freqs={pair_freq_key: freqs[pair_freq_key]},
)

# Merge the per-symbol frames into one frame using the stored `period`.
tics, df = unify_dfs(
    dfs,
    symbols=pair_symbols,
    period=best_params['period'],
)

df.head(10)

Unnamed: 0,time,close0,itvl,datetime,close1,spread,zscore
0,1690848059999,0.00059,1m,2023-08-01 00:00:59.999,0.000689,-9.9e-05,0.0
1,1690848119999,0.00059,1m,2023-08-01 00:01:59.999,0.000689,-9.9e-05,0.0
2,1690848179999,0.00059,1m,2023-08-01 00:02:59.999,0.000689,-9.9e-05,0.0
3,1690848239999,0.00059,1m,2023-08-01 00:03:59.999,0.000689,-9.9e-05,0.0
4,1690848299999,0.00059,1m,2023-08-01 00:04:59.999,0.000689,-9.9e-05,0.0
5,1690848359999,0.000591,1m,2023-08-01 00:05:59.999,0.00069,-9.9e-05,0.0
6,1690848419999,0.000591,1m,2023-08-01 00:06:59.999,0.00069,-9.9e-05,0.0
7,1690848479999,0.000591,1m,2023-08-01 00:07:59.999,0.00069,-9.9e-05,0.0
8,1690848539999,0.000591,1m,2023-08-01 00:08:59.999,0.00069,-9.9e-05,0.0
9,1690848599999,0.000591,1m,2023-08-01 00:09:59.999,0.00069,-9.9e-05,0.0


In [3]:
# Display the OPEN_THRE entry of the loaded best parameters.
# NOTE(review): presumably the threshold at which a position is opened — confirm.
best_params['OPEN_THRE']

1.6

In [4]:
# Parse each boundary once; the trade date is shared by both windows.
train_start = datetime.strptime(start_date, date_format)
split_point = datetime.strptime(trade_date, date_format)
test_end = datetime.strptime(end_date, date_format)

# Half-open windows: [start_date, trade_date) for training and
# [trade_date, end_date) for testing, so no row lands in both sets.
timestamps = df['datetime']
train = df[(timestamps >= train_start) & (timestamps < split_point)]
test = df[(timestamps >= split_point) & (timestamps < test_end)]

# Timestep budget handed to `learn` later on: the training rows minus the
# `period` parameter minus one.
max_train_len = len(train) - best_params['period'] - 1
print(f"The length of our training data: {len(train)}")

The length of our training data: 87840


## Check with Stable-Baselines3 `env_checker`

Check whether the env meets the requirements of `stable_baselines3`.

In [5]:
from stable_baselines3.common.env_checker import check_env
# Warning text recorded by the author (quoted verbatim):
# > UserWarning: The action space is not based off a numpy array. Typically this means it's either a Dict or Tuple space.
# This type of action space is currently not supported by Stable Baselines 3. You should try to flatten the action using a wrapper.
# In short: Stable Baselines 3 does not support Dict/Tuple action spaces, only Box, Discrete, MultiDiscrete and MultiBinary.
# Open question: is there another way to achieve the same functionality?
# NOTE(review): this warning may stem from an earlier version of the env — confirm whether it still applies.

# Build the environment on the training split and run the SB3 checker on it.
env = RL_Restrict_TradeEnv(train)
check_env(env)

In [6]:
# TensorBoard log directory for this experiment.
# clear_logs presumably removes earlier run folders under it — confirm in utils.clearlogs.
log_path = "logs/restrict_thres/"
clear_logs(log_path)

# Further reading on TensorBoard:
# https://github.com/tensorflow/tensorboard/blob/master/README.md
# https://www.tensorflow.org/tensorboard/get_started

Deleted: logs/restrict_thres/PPO_1


In [7]:
# --- PPO ---
# Fixed seed so that Restart & Run All repeats the same training run.
# (Bit-exact results can still differ across PyTorch versions or platforms.)
PPO_SEED = 42

model_ppo = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    gamma=1,  # discount factor of 1: future rewards are not discounted
    tensorboard_log=log_path,
    seed=PPO_SEED,
)

# Train for the timestep budget computed from the training split, then
# persist the model next to the other results of this notebook.
model_ppo.learn(total_timesteps=max_train_len)
model_ppo.save(f"{folder_path}/ppo_pairtrading")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs/restrict_thres/PPO_1
-----------------------------
| time/              |      |
|    fps             | 2404 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1518        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.020484898 |
|    clip_fraction        | 0.462       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.675      |
|    explained_variance   | -0.0199     |
|    learning_rate        | 0.0003      |
|    loss                 | 1.57        |
|    n_updates            | 10          |
|    policy_gradient_loss | -

In [8]:
# Remove any networth_ppo.csv left over from a previous run.
# Best-effort: a missing file (or any other OS error) is deliberately ignored.
try:
    os.remove(f"{folder_path}/networth_ppo.csv")
except OSError:
    pass

obs, _ = env.reset()

# Short rendered rollout of the trained policy as a sanity check.
for i in range(20):
    env.render()
    action, _states = model_ppo.predict(obs)
    obs, rewards, terminated, truncated, info = env.step(action)
    # An env must be reset once its episode has terminated or been truncated;
    # stepping a finished episode is undefined behaviour.
    if terminated or truncated:
        obs, _ = env.reset()

signal: 0, action: 1, reward:1
signal: 1, action: 1, reward:1
signal: 1, action: 1, reward:1
signal: 0, action: 1, reward:1
signal: 0, action: 1, reward:1
signal: 1, action: 1, reward:1
signal: 0, action: 1, reward:1
signal: 1, action: 1, reward:1
signal: 1, action: 1, reward:1
signal: 0, action: 1, reward:1
signal: 1, action: 1, reward:1
signal: 1, action: 1, reward:1
signal: 0, action: 1, reward:1
signal: 0, action: 1, reward:1
signal: 1, action: 1, reward:1
signal: 0, action: 1, reward:1
signal: 0, action: 1, reward:1
signal: 0, action: 1, reward:1
signal: 0, action: 1, reward:1
signal: 0, action: 1, reward:1
