## Подготовка и среда

In [81]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output

# Put the parent directory on the module search path; presumably this is what
# makes the project-local `stonks` package imported below resolvable -- confirm.
import sys
sys.path.append('..')

# NOTE(review): this silences every warning raised through Python's `warnings`
# module for the whole kernel session, including deprecation notices.
# Messages emitted through logging (such as the TensorFlow "W..." lines in the
# cell outputs) are not affected by this filter.
import warnings
warnings.filterwarnings('ignore')

from stonks.paper_testing.LearningEnvironment import LearningEnvironment
from stonks.paper_testing import Emulator

import abc
import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver, dynamic_episode_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

In [4]:
# Starting holdings passed to the environment: 100 under 'usdt', 0 for every other key.
balance = {
    'usdt': 100,
    'btc': 0,
    'eth': 0,
    'bch': 0,
    'bnb': 0,
    'ltc': 0,
}

# Project-local emulator; `string_start` is presumably a path prefix pointing
# at the repository root -- TODO confirm against the Emulator implementation.
emu = Emulator(string_start='../')

# Python-side environment first, then the TF wrapper that TF-Agents policies
# and drivers operate on.
train_py_env = LearningEnvironment(
    emu,
    balance,
    string_start='../',
    test_time=3600,
    orderbook_depth=1,
)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)

## Модель

In [383]:
# Sizes of the fully connected layers of the Q-network.
fc_layer_params = (128, 32)

# Q-network shaped by the environment's observation and action specs.
q_net = q_network.QNetwork(
    input_tensor_spec=train_env.observation_spec(),
    action_spec=train_env.action_spec(),
    fc_layer_params=fc_layer_params,
)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)

# Step counter handed to the agent; the training cell resets and reads it.
train_step_counter = tf.Variable(0)

# DQN agent trained with an element-wise squared TD-error loss.
agent = dqn_agent.DqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter,
)
agent.initialize()

W0503 00:46:30.270220 140292654888768 base_layer.py:1790] Layer QNetwork is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

W0503 00:46:30.289488 140292654888768 base_layer.py:1790] Layer TargetQNetwork is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructo

In [384]:
# Policies exposed by the agent: one for evaluation, one for data collection.
eval_policy = agent.policy
collect_policy = agent.collect_policy

# Baseline policy that picks random actions valid for the environment's action spec.
random_policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
)

## Replay buffer и драйвер

In [385]:
# Maximum number of items kept per batch row of the buffer.
replay_buffer_capacity = 10_000

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_capacity,
)

# Observer list passed to the drivers so that collected experience is
# appended to the replay buffer.
replay_observer = [replay_buffer.add_batch]

In [386]:
# Number of samples drawn from the replay buffer per training step.
batch_size = 32

# Each sample spans two consecutive steps (num_steps=2); batches are
# prefetched so sampling can overlap with training.
dataset = (
    replay_buffer
    .as_dataset(sample_batch_size=batch_size, num_steps=2, num_parallel_calls=3)
    .prefetch(3)
)

# Iterator consumed by the training loop.
iterator = iter(dataset)

In [387]:
# Metrics updated by the collection driver during training.
train_metrics = [
    tf_metrics.AverageReturnMetric(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.NumberOfEpisodes(),
]

# Collects one environment step per `run()` call with the agent's collect
# policy, feeding both the replay buffer and the metrics above.
driver = dynamic_step_driver.DynamicStepDriver(
    env=train_env,
    policy=collect_policy,
    observers=replay_observer + train_metrics,
    num_steps=1,
)

Тест на случайной модели:

In [393]:
# Smoke test: drive the environment with the random policy and report the
# average return and the number of environment steps taken.
train_env.reset()

try_metrics = [
    tf_metrics.AverageReturnMetric(),
    tf_metrics.EnvironmentSteps(),
]
try_driver = dynamic_episode_driver.DynamicEpisodeDriver(
    env=train_env,
    policy=random_policy,
    observers=try_metrics,
)

_ = try_driver.run()
print(f'Изменение баланса: {try_metrics[0].result().numpy()}')
print(f'Число шагов: {try_metrics[1].result().numpy()}')

Изменение баланса: -1.7816402912139893
Число шагов: 3599


## Обучение

In [389]:
# Pre-fill the replay buffer with random-policy experience so that the first
# training batches have something to sample from.
init_driver = dynamic_episode_driver.DynamicEpisodeDriver(
    env=train_env,
    policy=random_policy,
    observers=replay_observer,
)
_ = init_driver.run()

In [390]:
from tqdm import tqdm_notebook

# Number of collect-and-train iterations, and how often (in train steps) the
# current loss is reported.
num_iterations = 5000
log_interval = 3600

# Start from a clean state so that re-running this cell does not accumulate
# the step counter or metrics from a previous run.
agent.train_step_counter.assign(0)

for metric in train_metrics:
    metric.reset()

for _ in tqdm_notebook(range(num_iterations)):
    # Collect one environment step with the collect policy; the driver's
    # observers push it into the replay buffer and update the metrics.
    time_step, _ = driver.run()

    # Sample a batch of experience from the replay buffer and train on it.
    experience, _ = next(iterator)
    train_loss = agent.train(experience)
    step = agent.train_step_counter.numpy()

    # Previously `log_interval`, `train_loss` and `step` were computed but
    # never used, so training gave no feedback besides the progress bar.
    if step % log_interval == 0:
        print(f'Шаг {step}: loss = {float(train_loss.loss.numpy()):.4f}')

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

## Сравнение со случайной моделью

In [350]:
import random

# Roll out one episode with the random policy and track the running balance,
# starting from 100 and adding each step's reward.
step = train_env.reset()
balances = [100.]

while not step.is_last():
    # `policy.action()` returns a PolicyStep (action, state, info); the
    # environment expects only the action. Passing the whole PolicyStep worked
    # only because state and info are empty for this policy.
    action = random_policy.action(step).action
    step = train_env.step(action)
    balances.append(step.reward.numpy().item() + balances[-1])

plt.plot(range(len(balances)), balances)
plt.xlabel('Шаг')
plt.ylabel('Баланс')
plt.show()