In [1]:
import math
from tqdm import trange
from env_kim import RL_Kim_TradeEnv
from stable_baselines3 import PPO, A2C, DQN
from datetime import datetime, timedelta
from read2df import read2df, unify_dfs
from stable_baselines3.common.env_checker import check_env

# Prepare Dataset in OHLCVT format

In [2]:
dfs = read2df(symbols=['BTCEUR', 'BTCGBP'], freqs={'1m': 1}, marketType='spot')
tics, df = unify_dfs(dfs, symbols=['BTCEUR', 'BTCGBP'], period=30) # The period here is the formation period

df.describe()
df.head(10)

Unnamed: 0,time,close0,itvl,datetime,close1,spread,zscore
0,1672531259999,6.5e-05,1m,2023-01-01 00:00:59.999,7.3e-05,-10.824649,0.0
1,1672531319999,6.5e-05,1m,2023-01-01 00:01:59.999,7.3e-05,-10.824645,0.0
2,1672531379999,6.5e-05,1m,2023-01-01 00:02:59.999,7.3e-05,-10.824643,0.0
3,1672531439999,6.5e-05,1m,2023-01-01 00:03:59.999,7.3e-05,-10.823945,0.0
4,1672531499999,6.5e-05,1m,2023-01-01 00:04:59.999,7.3e-05,-10.823701,0.0
5,1672531559999,6.5e-05,1m,2023-01-01 00:05:59.999,7.3e-05,-10.823703,0.0
6,1672531619999,6.5e-05,1m,2023-01-01 00:06:59.999,7.3e-05,-10.82381,0.0
7,1672531679999,6.5e-05,1m,2023-01-01 00:07:59.999,7.3e-05,-10.823811,0.0
8,1672531739999,6.5e-05,1m,2023-01-01 00:08:59.999,7.3e-05,-10.823801,0.0
9,1672531799999,6.5e-05,1m,2023-01-01 00:09:59.999,7.3e-05,-10.823693,0.0


# Check the validity of the Environment
And test with random generated actions

In [3]:
env = RL_Kim_TradeEnv(df)
check_env(env)
for i in range(3):
    obs, rewards, terminated, truncated, info = env.step(action=env.action_space.sample())
    # env.render()

# Train with models

In [4]:
date_format = '%Y-%m-%d'
start_date = '2023-10-01'
trade_date = '2023-12-01'
end_date = '2023-12-31'

train = df[(df['datetime'] >= datetime.strptime(start_date, date_format)) & (df['datetime'] < datetime.strptime(trade_date, date_format))]
test = df[(df['datetime'] >= datetime.strptime(trade_date, date_format)) & (df['datetime'] < datetime.strptime(end_date, date_format))]

env_train = RL_Kim_TradeEnv(train)
env_test = RL_Kim_TradeEnv(test)

max_train_len = math.floor(len(train)/15)
max_test_len = math.floor(len(test)/15)

In [5]:
env_train.reset()
model_ppo = PPO("MlpPolicy", env_train, gamma=1, batch_size=256)
model_ppo.learn(total_timesteps=max_train_len, progress_bar=True)

Output()

<stable_baselines3.ppo.ppo.PPO at 0x2318dc1ac70>

In [6]:
env_train.reset()
model_a2c = A2C("MlpPolicy", env_train, gamma=1)
model_a2c.learn(total_timesteps=max_train_len, progress_bar=True)

Output()

<stable_baselines3.a2c.a2c.A2C at 0x231aaeb7550>

In [7]:
env_train.reset()
model_dqn = DQN("MlpPolicy", env_train, gamma=1, batch_size=256)
model_dqn.learn(total_timesteps=max_train_len, progress_bar=True)

Output()

<stable_baselines3.dqn.dqn.DQN at 0x2318d8b52e0>

# Out-of-sample experiment

In [8]:
env_test.reset()

for i in trange(max_test_len):
    action, _states = model_ppo.predict(obs)
    obs, rewards, terminated, truncated, info = env_test.step(action)
    if i%int(max_test_len/10)==0:
        env_test.render()

  2%|▏         | 41/2700 [00:00<00:12, 206.69it/s]

networth: 1.0007


 11%|█▏        | 310/2700 [00:01<00:09, 244.48it/s]

networth: 1.0236


 22%|██▏       | 590/2700 [00:02<00:08, 259.19it/s]

networth: 1.0329


 32%|███▏      | 864/2700 [00:03<00:06, 263.20it/s]

networth: 1.0103


 42%|████▏     | 1127/2700 [00:04<00:06, 244.47it/s]

networth: 0.9927


 51%|█████▏    | 1389/2700 [00:05<00:04, 296.61it/s]

networth: 0.9988


 61%|██████▏   | 1656/2700 [00:06<00:03, 268.67it/s]

networth: 0.957


 72%|███████▏  | 1954/2700 [00:07<00:02, 337.05it/s]

networth: 0.8505


 81%|████████▏ | 2195/2700 [00:08<00:01, 314.26it/s]

networth: 0.7647


 91%|█████████ | 2458/2700 [00:09<00:00, 380.13it/s]

networth: 0.6123


100%|██████████| 2700/2700 [00:09<00:00, 275.45it/s]


In [9]:
env_test.reset()

for i in trange(max_test_len):
    action, _states = model_a2c.predict(obs)
    obs, rewards, terminated, truncated, info = env_test.step(action)
    if i%int(max_test_len/10)==0:
        env_test.render()

  2%|▏         | 62/2700 [00:00<00:08, 309.82it/s]

networth: 1


 12%|█▏        | 327/2700 [00:01<00:08, 281.68it/s]

networth: 1.0086


 21%|██▏       | 580/2700 [00:01<00:07, 301.54it/s]

networth: 1.0187


 32%|███▏      | 853/2700 [00:02<00:06, 269.83it/s]

networth: 1.0471


 41%|████▏     | 1118/2700 [00:03<00:05, 271.97it/s]

networth: 1.0278


 51%|█████     | 1373/2700 [00:04<00:05, 253.79it/s]

networth: 1.0314


 62%|██████▏   | 1672/2700 [00:06<00:04, 235.35it/s]

networth: 1.0117


 71%|███████   | 1916/2700 [00:07<00:03, 219.68it/s]

networth: 0.9432


 81%|████████▏ | 2197/2700 [00:08<00:02, 215.43it/s]

networth: 0.8701


 91%|█████████▏| 2467/2700 [00:09<00:01, 227.70it/s]

networth: 0.7467


100%|██████████| 2700/2700 [00:10<00:00, 248.04it/s]


In [10]:
env_test.reset()

for i in trange(max_test_len):
    action, _states = model_dqn.predict(obs)
    obs, rewards, terminated, truncated, info = env_test.step(action)
    if i%int(max_test_len/10)==0:
        env_test.render()

  3%|▎         | 68/2700 [00:00<00:07, 341.63it/s]

networth: 1.0007


 12%|█▏        | 334/2700 [00:00<00:07, 328.99it/s]

networth: 1.0325


 23%|██▎       | 610/2700 [00:01<00:06, 309.21it/s]

networth: 1.0479


 32%|███▏      | 851/2700 [00:02<00:05, 326.41it/s]

networth: 1.0095


 42%|████▏     | 1121/2700 [00:03<00:04, 324.75it/s]

networth: 0.9775


 52%|█████▏    | 1398/2700 [00:04<00:03, 337.38it/s]

networth: 0.9842


 62%|██████▏   | 1668/2700 [00:05<00:03, 292.00it/s]

networth: 0.9313


 72%|███████▏  | 1939/2700 [00:06<00:02, 317.83it/s]

networth: 0.8393


 81%|████████  | 2188/2700 [00:06<00:01, 353.57it/s]

networth: 0.7712


 93%|█████████▎| 2503/2700 [00:07<00:00, 347.84it/s]

networth: 0.6524


100%|██████████| 2700/2700 [00:08<00:00, 323.94it/s]
