In [None]:
#| default_exp simple_agent.agent

In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from IPython.core.debugger import set_trace

# Simple Agent
>  Train a simple agent to bet on football games 

In [None]:
#| export

import d3rlpy
import pandas as pd
import torch
from betting_env.betting_env import BettingEnv
from betting_env.utils.data_extractor import *
from torch.optim import Adam

from betting_agent.config.localconfig import CONFIG, DB_HOSTS
from betting_agent.patching.monkey_patching import *
from betting_agent.patching.uncache import *

In [None]:
# Load the fixtures used by the demo training run further down.
# NOTE(review): `db_host="prod_atlas"` looks like a production database entry in
# `DB_HOSTS` — confirm that reading from it in a notebook is intended.
# `limit=4` presumably caps how many fixtures are fetched — verify against
# `data_aggregator`.
fixtures = data_aggregator(
    db_hosts=DB_HOSTS, config=CONFIG, db_host="prod_atlas", limit=4
)

### Apply Monkey-patching

In [None]:
#| export

from d3rlpy import torch_utility
from d3rlpy.online.buffers import ReplayBuffer

In [None]:
#| export

# Replace d3rlpy internals with the project's own implementations.
# `torch_api`, `append` and `add_last_step` are not imported by name in this
# notebook; presumably they come from the `betting_agent.patching.monkey_patching`
# star import — confirm.
torch_utility.torch_api = torch_api
ReplayBuffer.append = append
ReplayBuffer._add_last_step = add_last_step
# NOTE(review): `uncache` presumably drops the listed modules from the import
# cache so that later imports see the patched attributes — verify against
# `betting_agent.patching.uncache`. Keep this call after the assignments above.
uncache(["d3rlpy.torch_utility", "d3rlpy.online.buffers"])

## Agent 

In [None]:
#| export


from d3rlpy.algos import DQN
from d3rlpy.algos.base import AlgoBase
from d3rlpy.models.optimizers import OptimizerFactory
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.preprocessing.scalers import Scaler

# Override `AlgoBase.fit_online` with the project's `fit_online`, so every
# d3rlpy algorithm class derived from `AlgoBase` uses it.
# `fit_online` is not imported by name in this notebook; presumably it comes from
# the `betting_agent.patching.monkey_patching` star import — confirm.
AlgoBase.fit_online = fit_online

# Project modules are imported only after the patch above has been applied.
from betting_agent.simple_agent.network_architecture import *
from betting_agent.simple_agent.scaler import SimpleScaler

# NOTE(review): presumably evicts the patched d3rlpy modules from the import
# cache so the patches are picked up everywhere — verify against `uncache`.
uncache(["d3rlpy.torch_utility", "d3rlpy.online.buffers", "d3rlpy.algos.base", "d3rlpy.preprocessing.scalers"])

The function below prepares the `Reinforcement learning` algorithm before training. First, it initialises the `Betting environment` with the supplied data. Next, it sets up the `Scaler`, which transforms our observations into the specific features taken from the Database. Finally, it sets up the `Buffer`: `D3rlpy` supports both offline and online training, and in the online case used here the `Buffer` stores the experiences the agent collects while interacting with the environment, building up a useful dataset to learn from.

Furthermore, we supply an `Optimizer`, which updates the weights of the `Neural Network` to reduce its loss, and an `Explorer`, which handles the `exploration-exploitation` dilemma. The two must coexist: most of the time the `epsilon-greedy` strategy takes the action with the largest estimated reward (`exploitation`), while `exploration` lets the agent try new actions, which often contradict what it has already learned. With the default settings, the procedure starts with 100% `exploration` and gradually decreases to 10%.

We should note that the `D3rlpy` package has several `RL` algorithms; in our situation, we will choose the `DQN` algorithm (Deep Q-Network).

In [None]:
#| export


def rl_algo_preparation(
    fixtures: pd.DataFrame,  # All provided games.
    algo: d3rlpy.algos,  # D3rlpy RL algorithm.
    algo_batch_size=32,  #  Mini-batch size.
    algo_learning_rate=2.5e-4,  # Algo learning rate.
    algo_target_update_interval=100,  # Interval to update the target network.
    algo_scaler="simple",  # Scaler to apply: the name "simple", a scaler class, or a scaler instance.
    optimizer: torch.optim = Adam,  # Algo Optimizer.
    optimizer_weight_decay=1e-4,  # Optimizer weight decay.
    maxlen_buffer=1000000,  #  The maximum number of data length.
    explorer_start_epsilon=1.0,  # The beginning epsilon.
    explorer_end_epsilon=0.1,  # The end epsilon.
    explorer_duration=100000,  # The scheduling duration.
    **kwargs,  # extra arguments accepted by `SimpleEncoderFactory()`
):
    """Prepare RL algorithm components.

    Builds the betting environment from `fixtures`, resolves the observation
    scaler, and creates the replay buffer, the epsilon-greedy explorer and the
    RL algorithm itself.

    Returns the tuple `(env, buffer, explorer, rl_algo)`.

    Raises `ValueError` if `algo_scaler` is a string other than "simple".
    """
    # Init betting env.
    env = BettingEnv(fixtures)

    # Resolve the scaler. `algo_scaler` used to be accepted but silently ignored
    # (a `SimpleScaler()` was always created); it is now honoured. `None` and
    # the name "simple" keep the previous behaviour.
    if algo_scaler is None or (
        isinstance(algo_scaler, str) and algo_scaler == "simple"
    ):
        scaler = SimpleScaler()
    elif isinstance(algo_scaler, str):
        raise ValueError(
            f"Unsupported scaler name {algo_scaler!r}; use 'simple', "
            "a scaler class or a scaler instance."
        )
    elif isinstance(algo_scaler, type):
        # A scaler class (e.g. `SimpleScaler`, as passed by `launch_training`).
        scaler = algo_scaler()
    else:
        # Already a scaler instance.
        scaler = algo_scaler

    # Init Buffer.
    buffer = ReplayBuffer(env=env, maxlen=maxlen_buffer)

    # Init the epsilon-greedy explorer
    explorer = LinearDecayEpsilonGreedy(
        start_epsilon=explorer_start_epsilon,
        end_epsilon=explorer_end_epsilon,
        duration=explorer_duration,
    )

    # Init Optimizer.
    optim_factory = OptimizerFactory(optimizer, weight_decay=optimizer_weight_decay)

    # Init RL Algo.
    # The encoder's observation size is read from the scaler's `OBS` attribute,
    # so any scaler passed in must define it.
    rl_algo = algo(
        batch_size=algo_batch_size,
        learning_rate=algo_learning_rate,
        target_update_interval=algo_target_update_interval,
        optim_factory=optim_factory,
        scaler=scaler,
        encoder_factory=SimpleEncoderFactory(
            feature_size=env.action_space.n,
            observation_size=scaler.OBS,
            **kwargs,
        ),
    )

    return env, buffer, explorer, rl_algo

In [None]:
# | export


def launch_training(
    fixtures: pd.DataFrame,  # All provided games.
    algo: d3rlpy.algos,  # D3rlpy RL algorithm.
    training_steps: int = 100,  # The number of total steps to train.
    n_steps_per_epoch: int = 50,  # The number of steps per epoch.
    update_start_step: int = 50,  #  The steps before starting updates.
    algo_batch_size: int = 32,  #  Mini-batch size.
    algo_learning_rate: float = 2.5e-4,  # Algo learning rate.
    algo_target_update_interval: int = 100,  # Interval to update the target network.
    algo_scaler: Scaler = SimpleScaler,  # The scaler for data transformation.
    optimizer: torch.optim = Adam,  # Algo Optimizer.
    optimizer_weight_decay: float = 1e-4,  # Optimizer weight decay.
    maxlen_buffer: int = 1000000,  #  The maximum number of data length.
    explorer_start_epsilon: float = 1.0,  # The beginning epsilon.
    explorer_end_epsilon: float = 0.1,  # The end epsilon.
    explorer_duration: int = 100,  # The scheduling duration.
    eval_epsilon: float = 0.3,  # Greedy-epsilon for evaluation.
    show_progress: bool = True,  # Flag to show progress bar for iterations.
    save_metrics: bool = True,  # Flag to record metrics. If False, the log directory is not created and the model parameters are not saved.
):
    """Launch RL algorithm training and return the trained algorithm.

    Builds the environment, buffer, explorer and algorithm through
    `rl_algo_preparation`, creates a second `BettingEnv` on the same fixtures
    for evaluation, and runs `fit_online`.

    Returns the algorithm object that was trained, so the result stays usable
    even when `save_metrics=False` (in which case nothing is written to disk).
    """
    # Get algo params.
    env, buffer, explorer, rl_algo = rl_algo_preparation(
        fixtures=fixtures,
        algo=algo,
        algo_batch_size=algo_batch_size,
        algo_learning_rate=algo_learning_rate,
        algo_target_update_interval=algo_target_update_interval,
        algo_scaler=algo_scaler,
        optimizer=optimizer,
        optimizer_weight_decay=optimizer_weight_decay,
        maxlen_buffer=maxlen_buffer,
        explorer_start_epsilon=explorer_start_epsilon,
        explorer_end_epsilon=explorer_end_epsilon,
        explorer_duration=explorer_duration,
    )
    # Launch training.
    # Evaluation runs on its own environment instance so it does not share
    # state with the training environment.
    eval_env = BettingEnv(fixtures)
    rl_algo.fit_online(
        env,  # Gym environment.
        buffer,  # Buffer.
        explorer,  # Explorer.
        n_steps=training_steps,  # Train for 'training_steps' steps.
        n_steps_per_epoch=n_steps_per_epoch,  # Evaluation is performed every 'n_steps_per_epoch' steps.
        update_start_step=update_start_step,  # Parameter update starts after 'update_start_step' steps.
        save_metrics=save_metrics,  # Save metrics.
        show_progress=show_progress,  # Show progress.
        eval_env=eval_env,  # Environment for evaluation.
        eval_epsilon=eval_epsilon,  # Greedy-epsilon for evaluation.
    )

    # Hand the trained algorithm back to the caller (previously it was discarded).
    return rl_algo

In [None]:


# Short demo run on the fixtures loaded above: 100 training steps, an epoch
# every 20 steps, and parameter updates starting after the first 20 steps.
# `algo_scaler=SimpleScaler` and `optimizer=Adam` repeat the function defaults.
# `save_metrics=True` records metrics and saves model parameters.
launch_training(
    fixtures=fixtures,
    algo=DQN,
    algo_scaler=SimpleScaler,
    optimizer=Adam,
    explorer_duration=100,
    training_steps=100,
    n_steps_per_epoch=20,
    update_start_step=20,
    save_metrics=True,
)

2023-03-10 07:45.14 [info     ] Directory is created at d3rlpy_logs/DQN_online_20230310074514
2023-03-10 07:45.14 [debug    ] Fitting scaler...              scler=simple
2023-03-10 07:45.14 [debug    ] Building model...
2023-03-10 07:45.14 [debug    ] Model has been built.
2023-03-10 07:45.14 [info     ] Parameters are saved to d3rlpy_logs/DQN_online_20230310074514/params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'custom', 'params': {'feature_size': 16}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 0.00025, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'weight_decay': 0.0001}, 'q_func_factory': {'type': 'mean', 'params': {'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': {'type': 'simple', 'params': {}}, 'target_update_interval': 100, 'use_gpu': None, 'algorithm': 'DQN', 'observation_shape': (30,), 'action_size': 16}


  0%|          | 0/100 [00:00<?, ?it/s]

2023-03-10 07:45.23 [info     ] Model parameters are saved to d3rlpy_logs/DQN_online_20230310074514/model_20.pt
2023-03-10 07:45.23 [info     ] DQN_online_20230310074514: epoch=1 step=20 epoch=1 metrics={'time_inference': 0.1550007939338684, 'time_environment_step': 0.0003853321075439453, 'time_step': 0.267804217338562, 'rollout_return': -50.48, 'evaluation': 8.935000000000006} step=20
2023-03-10 07:45.30 [info     ] Model parameters are saved to d3rlpy_logs/DQN_online_20230310074514/model_40.pt
2023-03-10 07:45.30 [info     ] DQN_online_20230310074514: epoch=2 step=40 epoch=2 metrics={'time_inference': 0.10758460760116577, 'time_environment_step': 0.00040752887725830077, 'time_step': 0.21772069931030275, 'rollout_return': 20.270000000000003, 'time_sample_batch': 0.00010214533124651228, 'time_algorithm_update': 0.004025254930768695, 'loss': 54.871473039899556, 'evaluation': -42.795} step=40
2023-03-10 07:45.38 [info     ] Model parameters are saved to d3rlpy_logs/DQN_online_20230310074

In [None]:
#| hide
from nbdev import nbdev_export

# Run nbdev's export step; cells marked `#| export` are presumably written to
# the module named by the `#| default_exp` directive — see the nbdev docs.
nbdev_export()