In [1]:
from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from trade_tester.env import TradingEnv
import pandas as pd
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback
import os

In [2]:
# Columns kept from the raw candle file, in the order they are passed on.
KLINE_COLUMNS = ['open_time', 'open', 'close', 'high', 'low', 'vol', 'trades']
# Size of the chronological split: the newest rows are held out for validation
# and the rows immediately before them are used for training.
N_TRAIN_ROWS = 40_000
N_VALIDATE_ROWS = 10_000

# Load the candles, expose the open time as a proper datetime `date` column.
klines = (
    pd.read_csv('klines/DOGEUSDT_15m.csv')[KLINE_COLUMNS]
    .rename(columns={'open_time': 'date'})
    .assign(date=lambda frame: pd.to_datetime(frame['date'], unit='ms'))
)

# End-relative slices: identical to `len(klines) - N` arithmetic when the file
# is long enough, but they clamp at the first row for shorter files instead of
# turning into a negative start offset that selects the wrong window.
klines_train = klines.iloc[-(N_TRAIN_ROWS + N_VALIDATE_ROWS):-N_VALIDATE_ROWS]
klines_validate = klines.iloc[-N_VALIDATE_ROWS:]

In [3]:
class ValidateCallback(BaseCallback):
    """Evaluate the model on a separate env and checkpoint new best scores.

    At the end of every training episode the current model is rolled out for
    ``n_eval_episodes`` episodes on ``env``. When the mean episode reward beats
    the best value seen so far, a row is appended to the in-memory history; if
    that mean is also positive, the history is written to ``monitor.csv`` and
    the model is saved under ``best_models/`` inside ``save_path``.

    :param env: evaluation environment using the 4-tuple ``step`` API
        (``obs, reward, done, info``). For a vectorised env only the first
        sub-environment's reward/done are tracked.
    :param verbose: verbosity level forwarded to ``BaseCallback``.
    :param save_path: directory for ``monitor.csv`` and ``best_models/``.
    :param n_eval_episodes: number of evaluation episodes per check.
    """

    MONITOR_FILE = 'monitor.csv'
    BEST_MODELS_DIR = 'best_models'

    def __init__(self, env, verbose=0, save_path='./logs/', n_eval_episodes=10):
        super(ValidateCallback, self).__init__(verbose)
        self.env = env
        self.max_mean_reward = float('-inf')
        self.save_path = save_path
        self.n_eval_episodes = n_eval_episodes
        self.step = 0
        self.episodes = 0
        # Creates save_path as well as the checkpoint sub-directory.
        os.makedirs(os.path.join(save_path, self.BEST_MODELS_DIR), exist_ok=True)

        # Resume from a previous run: read the same file that _on_step writes
        # and restore the best score recorded in it.
        try:
            self.df = pd.read_csv(os.path.join(save_path, self.MONITOR_FILE))
            best_so_far = self.df['mean_reward'].max()
        except (OSError, ValueError, KeyError):
            # No history yet, or the file is empty/malformed: start fresh.
            self.df = pd.DataFrame([], columns=['mean_reward', 'reward_std', 'file'])
        else:
            # max() is NaN for an empty or all-NaN column; keep -inf then.
            if pd.notna(best_so_far):
                self.max_mean_reward = float(best_so_far)

    def _on_step(self) -> bool:
        """
        This method will be called by the model after each call to `env.step()`.

        :return: always True, so training is never interrupted by this callback.
        """
        self.step += 1
        # check end of episode
        tester = self.model.get_env().get_attr('tester')[0]
        if tester.n_tick + 1 < tester._last_tick:
            return True
        self.episodes += 1

        # evaluate the model on separate env
        mean_reward, reward_std = self._evaluate()

        if mean_reward > self.max_mean_reward:
            self.max_mean_reward = mean_reward
            name = f'best_models/ppo_{int(mean_reward)}'
            new_row = pd.DataFrame([dict(
                mean_reward=int(mean_reward),
                reward_std=int(reward_std),
                file=name,
            )])
            self.df = pd.concat([self.df, new_row], ignore_index=True)
            print(f'Find new best mean reward model with reward {mean_reward} on step {self.step} episode {self.episodes}')
            # Only profitable models are persisted to disk.
            if mean_reward > 0:
                self.df.to_csv(os.path.join(self.save_path, self.MONITOR_FILE), index=False)
                self.model.save(os.path.join(self.save_path, name))

        return True

    def _evaluate(self):
        """Roll out the model on ``self.env``; return (mean, std) of episode rewards."""
        episode_rewards = []
        for _ in range(self.n_eval_episodes):
            obs = self.env.reset()
            done = False
            episode_reward = 0.0
            while not done:
                action, _states = self.model.predict(obs)
                obs, reward, done, info = self.env.step(action)
                # A VecEnv returns length-n arrays; reduce to plain scalars so
                # the totals form a flat float array.
                done = bool(np.asarray(done).reshape(-1)[0])
                episode_reward += float(np.asarray(reward).reshape(-1)[0])
            episode_rewards.append(episode_reward)
        totals = np.array(episode_rewards)
        return totals.mean(), totals.std()

class TensorboardCallback(BaseCallback):
    """
    Callback that publishes two extra scalars to the logger on every step:
    `custom/ep_mean_rew` and `custom/ep_rew_std`.

    The values are refreshed when the first sub-environment's `tester`
    reaches its last tick, using the `total_reward` attribute of every
    sub-environment; between refreshes the previous values are re-logged.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Last non-zero statistics; logged on every step.
        self.mean_rew = 0
        self.rew_std = 0

    def _on_step(self) -> bool:
        """Refresh the statistics at episode end, then log them. Always returns True."""
        vec_env = self.model.get_env()
        tester = vec_env.get_attr('tester')[0]

        # check end of episode
        episode_finished = tester.n_tick == tester._last_tick - 1
        if episode_finished:
            totals = np.array(vec_env.get_attr('total_reward'))
            # Only strictly positive totals contribute to the statistics.
            positive = totals[totals > 0]
            if positive.sum() > 0:
                current_mean, current_std = positive.mean(), positive.std()
                # Zero results never overwrite the stored values.
                if current_mean != 0:
                    self.mean_rew = current_mean
                if current_std != 0:
                    self.rew_std = current_std

        scalars = (
            ('custom/ep_mean_rew', self.mean_rew),
            ('custom/ep_rew_std', self.rew_std),
        )
        for tag, value in scalars:
            self.logger.record(tag, value)
        return True

In [13]:
def make_and_learn(window, lr, gamma, timesteps=5e5, n_envs=1) -> None:
    """Train a DQN agent on the training candles and save it to disk.

    :param window: forwarded to ``TradingEnv`` as ``window``.
    :param lr: learning rate passed to ``DQN``.
    :param gamma: discount factor passed to ``DQN``.
    :param timesteps: total number of training timesteps (cast to ``int``).
    :param n_envs: number of parallel environment copies to train on.

    The model is written to
    ``logs/best_model/dqn_MlpPolicy_binance_{window}_{lr}_{gamma}``.
    """
    env_kwargs = dict(
        klines=klines_train,
        window=window,
        b_size=1000,
    )
    # Honour the caller's n_envs instead of always building a single env.
    env = make_vec_env(TradingEnv, n_envs=n_envs, env_kwargs=env_kwargs)
    model = DQN(
        "MlpPolicy",
        env,
        tensorboard_log='tblog',
        learning_rate=lr,
        gamma=gamma,
    )
    # Callbacks (e.g. ValidateCallback / TensorboardCallback) can be added to
    # this list; it is left empty so training runs without extra hooks.
    model.learn(total_timesteps=int(timesteps), callback=[])
    model.save(f"logs/best_model/dqn_MlpPolicy_binance_{window}_{lr}_{gamma}")


In [14]:
# for _obs in range(100, 100, 10):
#     for _le in range(3, 3):
#         for _gamma in range(19, 19):
#             __lr = float(f'1e-{_le}')
#             __gamma = _gamma / 10

# gamma is the discount factor and has to lie in [0, 1]; a value above 1 makes
# future rewards grow with distance, so the learned values have no stable
# target. 0.99 is the library default for DQN.
# NOTE: the saved checkpoint name embeds gamma, so it now ends in `_0.99`.
make_and_learn(window=100, lr=1e-5, gamma=0.99, timesteps=1e6, n_envs=1)


Path 'logs/best_model' does not exist. Will create it.



In [None]:
""" Validate """
# Build a single-copy vectorised env over the validation candles.
env_kwargs = {
    'klines': klines_validate,
    'window': 100,
    'b_size': 1000,
}
env = make_vec_env(TradingEnv, n_envs=1, env_kwargs=env_kwargs)
# model = DQN.load("logs/best_model/dqn_MlpPolicy_binance_100_1e-05_1.8")

# Run 1000 episodes with the constant action [1] and collect each episode's
# summed reward.
rewards = []
for _episode in range(1000):
    obs = env.reset()
    episode_return = 0
    while True:
        # action, _states = model.predict(obs)
        obs, reward, done, info = env.step([1])
        episode_return += reward
        if done:
            break
    rewards.append(episode_return)
rewards = np.array(rewards)

In [None]:
# Summarise the collected episode rewards: mean, spread, and how many episodes
# ended above / below zero.
n_profit_episodes = np.count_nonzero(rewards > 0)
n_loss_episodes = np.count_nonzero(rewards < 0)
print(
    f'mean_reward={rewards.mean()}, reword_std={rewards.std()}, '
    f'profit_episodes: {n_profit_episodes}, loss_episodes={n_loss_episodes}'
)

In [4]:
# Build a single-copy vectorised env over the validation candles.
env_kwargs = {
    'klines': klines_validate,
    'window': 100,
    'b_size': 1000,
}
env = make_vec_env(TradingEnv, n_envs=1, env_kwargs=env_kwargs)
# model = DQN.load("logs/best_model/dqn_MlpPolicy_binance_100_1e-05_1.8")

# Play one episode with the constant action [0], rendering after every step,
# then print the summed reward.
obs = env.reset()
# action, _states = model.predict(obs)
rew = 0
while True:
    # action, _states = model.predict(obs)
    obs, reward, done, info = env.step([0])
    rew += reward
    env.render()
    if done:
        break
print(rew)

{'balance': 1885.11324277, 'orders': 538, 'profit_orders': 241, 'loss_orders': 296, 'pnl': 885.11324277, 'pnl_percent': 88.51}
[885.11304]


In [6]:
# Display the shape of `obs`, i.e. whatever observation array the most
# recently executed rollout cell left behind.
obs.shape

(1, 100, 6)

In [20]:
# Display the observation space reported by `env`, as built by the most
# recently executed `make_vec_env(TradingEnv, ...)` cell.
env.observation_space

Box([[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0

In [13]:
# Build a single-copy vectorised CartPole-v1 env and take its first
# observation (presumably as a reference to compare against the trading env
# — TODO confirm intent).
env2 = make_vec_env('CartPole-v1', n_envs=1)
obs2 = env2.reset()

In [16]:
# Display the shape of the CartPole observation returned by `env2.reset()`.
obs2.shape

(1, 4)

In [19]:
# Display the observation space reported by the CartPole vectorised env.
env2.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)