In [None]:
#| default_exp Agent.agent

In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from IPython.core.debugger import set_trace

# D3rlpy Agent

>  Simulate a trading strategy using a custom football betting environment. 

In [None]:
#| export

import pandas as pd
import d3rlpy
import torch
from betting_agent.Utils.uncache import *
from betting_agent.Utils.monkey_patching import *
from d3rlpy.preprocessing.scalers import Scaler
from betting_agent.config.localconfig import CONFIG, DB_HOSTS
from betting_env.betting_env import BettingEnv
from betting_env.utils.data_extractor import *

## Load Data

In [None]:
# Pull a small sample of fixtures for the walkthrough.
# NOTE(review): `limit=4` presumably caps the number of returned games — the
# preview below shows four rows; confirm against `data_aggregator`.
fixtures = data_aggregator(
    db_hosts=DB_HOSTS,
    config=CONFIG,
    db_host="prod_atlas",
    limit=4,
)
fixtures.head()

Unnamed: 0,gameId,game_optaId,gameDate,homeTeamId,homeTeam_optaId,awayTeamId,awayTeam_optaId,tgt_gd,tgt_outcome,preGameOdds1,...,homeTeamLineupIds,homeTeamLineupSlots,homeTeamFormation,home_team_lineup_received_at,awayTeamName,awayTeamLineup,awayTeamLineupIds,awayTeamLineupSlots,awayTeamFormation,away_team_lineup_received_at
0,d0cc49c3230e300b529b270951b3b70b3224481add8354...,991007,2018-08-21 18:45:00,9ca1f9a87934693b07890de4b4528b0f3ae4065a67ec38...,80,38ca605bcd29a5a37697ca66e533ae817ced71b6bf275c...,2,0,1.0,3.13,...,"[40346, 28654, 49539, 169432, 214225, 116215, ...","[1, 3, 2, 9, 6, 11, 7, 5, 4, 8, 10]",4-2-3-1,2018-08-21 18:15:00,Leeds United,"{""Kalvin Phillips"": ""DMC"", ""Jamie Shackleton"":...","[155405, 221610, 98760, 57913, 220037, 38588, ...","[4, 2, 9, 3, 1, 5, 7, 8, 10, 11, 6]",4-1-4-1,2018-08-21 18:15:00
1,c0c48eee0b1a42e0d84cb0a947fe2c64f9e1aa7015922f...,990998,2018-08-21 18:45:00,bc9d5de208258f2f95282c59e9551310be9d319ebc6e4e...,24,4a625f945d8f58984be0aa7b2ac6409a23ed9cf48e4260...,40,2,0.0,2.05,...,"[106423, 12744, 184341, 57714, 103920, 17601, ...","[4, 9, 8, 3, 10, 1, 2, 5, 6, 11, 7]",4-3-3,2018-08-21 18:15:00,Ipswich Town,"{""Jonas Knudsen"": ""DL"", ""Janoi Donacien"": ""DR""...","[82187, 154936, 101881, 115557, 28530, 19910, ...","[3, 2, 10, 7, 1, 8, 9, 11, 4, 5, 6]",4-1-4-1,2018-08-21 18:15:00
2,58b1242154c8055252582229abfc4680460278834c4433...,991001,2018-08-21 18:45:00,58301066042bbdf19de8fe7d41afc53626b5aa79034712...,72,bbb63e4ea54b0d60b48a1f8440254d7e656dfbfcbef825...,88,-1,2.0,3.09,...,"[80246, 19152, 193576, 155529, 124120, 41753, ...","[7, 6, 2, 1, 4, 3, 11, 8, 5, 9, 10]",4-5-1,2018-08-21 18:15:00,Hull City,"{""Evandro Goebel"": ""AMC"", ""Jordy De Wijs"": ""DC...","[52287, 173549, 120449, 28541, 107692, 178186,...","[10, 6, 2, 9, 11, 7, 4, 5, 3, 1, 8]",4-4-1-1,2018-08-21 18:15:00
3,3a604f5616b39eb17fc8d1eed07d5248e387bf400294b2...,991000,2018-08-21 18:45:00,e2bfbb5453a7853e049b9434db74d4d06b8c5560ff7cf9...,52,d6fe4a4ffbf1e1a0ae9d4bbed16e94042d9bf01e57eb55...,113,-3,2.0,2.44,...,"[91068, 89184, 49083, 106606, 42996, 95767, 23...","[10, 1, 8, 7, 2, 4, 11, 3, 6, 5, 9]",4-3-3,2018-08-21 18:15:00,Bristol City,"{""Josh Brownhill"": ""DMR"", ""Jack Hunt"": ""DR"", ""...","[172782, 73716, 55563, 110735, 106257, 235530,...","[4, 2, 7, 6, 10, 3, 8, 5, 11, 1, 9]",4-4-2,2018-08-21 18:15:00


### Apply Monkey-patching

In [None]:
#| export

from d3rlpy import torch_utility
from d3rlpy.online.buffers import ReplayBuffer

In [None]:
#| export
# Replace selected d3rlpy internals with project-specific implementations.
# NOTE(review): `torch_api`, `append` and `add_last_step` are presumably provided by
# the star import from `betting_agent.Utils.monkey_patching` — confirm.
torch_utility.torch_api = torch_api
ReplayBuffer.append = append
ReplayBuffer._add_last_step = add_last_step
# NOTE(review): `uncache` presumably evicts the listed modules from the import cache so
# later imports see the patched attributes — confirm against `betting_agent.Utils.uncache`.
uncache(["d3rlpy.torch_utility","d3rlpy.online.buffers"])

## D3rlpy Agent

In [None]:
#| export

from betting_agent.Utils.scaler import CustomScaler
from betting_agent.Utils.network_architecture import *
from d3rlpy.algos import DQN
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.models.optimizers import OptimizerFactory
from d3rlpy.preprocessing.scalers import register_scaler
from torch.optim import Adam

We propose a function that prepares the `Reinforcement learning` algorithm prior to training. First, we initialise the `Betting environment` with the supplied data. Next, we set up the `Scaler`, which transforms our observations into particular features from the database. Finally, we set up the `Buffer`: `D3rlpy` supports both offline and online training, and here we train online, so the `Buffer` stores the experiences collected from the environment in order to build a useful dataset.

Furthermore, we supply an `Optimizer`, which updates the weights of the `Neural Network` to reduce its loss, and an `Explorer`, which handles the `exploration-exploitation` dilemma. The two must coexist: most of the time the `epsilon-greedy` strategy takes the action with the largest estimated reward (`exploitation`), while `exploration` lets us try new actions that may contradict what we have already learned. By default, the procedure starts with 100% `exploration` and then decays linearly to 10%.

We should note that the `D3rlpy` package has several `RL` algorithms; in our situation, we will choose the `DQN` algorithm (Deep Q-Network).

In [None]:
#| export


def rl_algo_preparation(
    fixtures: pd.DataFrame,  # All provided games.
    algo: d3rlpy.algos = DQN,  # D3rlpy RL algorithm.
    algo_batch_size: int = 32,  #  Mini-batch size.
    algo_learning_rate: float = 2.5e-4,  # Algo learning rate.
    algo_target_update_interval: int = 100,  # Interval to update the target network.
    algo_scaler: Scaler = CustomScaler,  # The scaler for data transformation.
    optimizer: torch.optim = Adam,  # Algo Optimizer.
    optimizer_weight_decay: float = 1e-4,  # Optimizer weight decay.
    maxlen_buffer: int = 1000000,  #  The maximum number of data length.
    explorer_start_epsilon: float = 1.0,  # The beginning epsilon.
    explorer_end_epsilon: float = 0.1,  # The end epsilon.
    explorer_duration: int = 100000,  # The scheduling duration.
):
    """Prepare RL algorithm components.

    Builds the betting environment from `fixtures`, a replay buffer bound to that
    environment, a linear-decay epsilon-greedy explorer and the RL algorithm itself,
    and returns them as the tuple `(env, buffer, explorer, rl_algo)`.

    The function can be called repeatedly in the same process: registering a scaler
    that is already registered is tolerated instead of aborting.
    """
    # Init betting env.
    env = BettingEnv(fixtures)

    # Init Scaler.
    # d3rlpy's `register_scaler` asserts that the scaler TYPE has not been registered
    # yet, so a second call with the same scaler class (e.g. re-running a training
    # cell) would raise. Registration only needs to happen once, and the instance
    # below is built directly from the class, so an "already registered" assertion
    # is safe to ignore here.
    try:
        register_scaler(algo_scaler)
    except AssertionError:
        pass
    custom_scaler = algo_scaler()

    # Init Buffer.
    buffer = ReplayBuffer(env=env, maxlen=maxlen_buffer)

    # Init the epsilon-greedy explorer
    explorer = LinearDecayEpsilonGreedy(
        start_epsilon=explorer_start_epsilon,
        end_epsilon=explorer_end_epsilon,
        duration=explorer_duration,
    )

    # Init Optimizer.
    optim_factory = OptimizerFactory(optimizer, weight_decay=optimizer_weight_decay)

    # Init RL Algo.
    rl_algo = algo(
        batch_size=algo_batch_size,
        learning_rate=algo_learning_rate,
        target_update_interval=algo_target_update_interval,
        optim_factory=optim_factory,
        scaler=custom_scaler,
        # The encoder's feature size is tied to the number of discrete actions.
        encoder_factory=CustomEncoderFactory(feature_size=env.action_space.n),
    )

    return env, buffer, explorer, rl_algo

In [None]:
#| export

from d3rlpy.algos.base import AlgoBase

In [None]:
#| export

# Replace d3rlpy's online training loop with the project's own `fit_online`.
# NOTE(review): `fit_online` is presumably provided by the star import from
# `betting_agent.Utils.monkey_patching` — confirm.
AlgoBase.fit_online = fit_online
# NOTE(review): `uncache` presumably evicts the listed modules from the import cache so
# the patched attributes are picked up — confirm against `betting_agent.Utils.uncache`.
uncache(["d3rlpy.torch_utility", "d3rlpy.online.buffers", "d3rlpy.algos.base"])

## Launch Training


In [None]:
# | export


def launch_training(
    fixtures: pd.DataFrame,  # All provided games.
    training_steps: int = 100,  # The number of total steps to train.
    n_steps_per_epoch: int = 50,  # The number of steps per epoch.
    update_start_step: int = 50,  #  The steps before starting updates.
    algo: d3rlpy.algos = DQN,  # D3rlpy RL algorithm.
    algo_batch_size: int = 32,  #  Mini-batch size.
    algo_learning_rate: float = 2.5e-4,  # Algo learning rate.
    algo_target_update_interval: int = 100,  # Interval to update the target network.
    algo_scaler: Scaler = CustomScaler,  # The scaler for data transformation.
    optimizer: torch.optim = Adam,  # Algo Optimizer.
    optimizer_weight_decay: float = 1e-4,  # Optimizer weight decay.
    maxlen_buffer: int = 1000000,  #  The maximum number of data length.
    explorer_start_epsilon: float = 1.0,  # The beginning epsilon.
    explorer_end_epsilon: float = 0.1,  # The end epsilon.
    explorer_duration: int = 100,  # The scheduling duration.
    eval_epsilon: float = 0.3,  # Greedy-epsilon for evaluation.
    show_progress: bool = True,  # Flag to show progress bar for iterations.
    save_metrics: bool = True,  # Flag to record metrics. If False, the log directory is not created and the model parameters are not saved.
):
    """Launch RL algorithm training.

    Builds the training components with `rl_algo_preparation`, creates a second
    `BettingEnv` on the same fixtures for evaluation, runs `fit_online`, and returns
    the algorithm instance that was trained so callers can keep using it (this is
    the only handle on the model when `save_metrics=False`).
    """
    # Get algo params.
    env, buffer, explorer, rl_algo = rl_algo_preparation(
        fixtures=fixtures,
        algo=algo,
        algo_batch_size=algo_batch_size,
        algo_learning_rate=algo_learning_rate,
        algo_target_update_interval=algo_target_update_interval,
        algo_scaler=algo_scaler,
        optimizer=optimizer,
        optimizer_weight_decay=optimizer_weight_decay,
        maxlen_buffer=maxlen_buffer,
        explorer_start_epsilon=explorer_start_epsilon,
        explorer_end_epsilon=explorer_end_epsilon,
        explorer_duration=explorer_duration,
    )
    # Launch training.
    # A separate environment instance is used for evaluation so that evaluation
    # episodes do not share state with the training environment object.
    eval_env = BettingEnv(fixtures)
    rl_algo.fit_online(
        env,  # Gym environment.
        buffer,  # Buffer.
        explorer,  # Explorer.
        n_steps=training_steps,  # Train for 'training_steps' steps.
        n_steps_per_epoch=n_steps_per_epoch,  # Evaluation is performed every 'n_steps_per_epoch' steps.
        update_start_step=update_start_step,  # Parameter update starts after 'update_start_step' steps.
        save_metrics=save_metrics,  # Save metrics.
        show_progress=show_progress,  # Show progress.
        eval_env=eval_env,  # Environment for evaluation.
        eval_epsilon=eval_epsilon,  # Greedy-epsilon for evaluation.
    )

    # Hand the trained algorithm back instead of discarding it.
    return rl_algo

In [None]:
# Short demonstration run on the fixtures loaded above.
launch_training(
    fixtures=fixtures,
    # Components (spelled out explicitly; these match the function defaults).
    algo=DQN,
    algo_scaler=CustomScaler,
    optimizer=Adam,
    # Schedule: 100 steps in total, 20 steps per epoch, updates start after step 20.
    training_steps=100,
    n_steps_per_epoch=20,
    update_start_step=20,
    explorer_duration=100,
    # Keep logs and model checkpoints on disk.
    save_metrics=True,
)

2023-03-07 13:50.32 [info     ] Directory is created at d3rlpy_logs/DQN_online_20230307135032
2023-03-07 13:50.32 [debug    ] Fitting scaler...              scler=none
2023-03-07 13:50.32 [debug    ] Building model...
2023-03-07 13:50.32 [debug    ] Model has been built.
2023-03-07 13:50.32 [info     ] Parameters are saved to d3rlpy_logs/DQN_online_20230307135032/params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'custom', 'params': {'feature_size': 16}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 0.00025, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'weight_decay': 0.0001}, 'q_func_factory': {'type': 'mean', 'params': {'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': {'type': 'none', 'params': {}}, 'target_update_interval': 100, 'use_gpu': None, 'algorithm': 'DQN', 'observation_shape': (30,), 'action_size': 16}


  0%|          | 0/100 [00:00<?, ?it/s]

2023-03-07 13:50.52 [info     ] Model parameters are saved to d3rlpy_logs/DQN_online_20230307135032/model_20.pt
2023-03-07 13:50.52 [info     ] DQN_online_20230307135032: epoch=1 step=20 epoch=1 metrics={'time_inference': 0.3187492609024048, 'time_environment_step': 0.0008553028106689453, 'time_step': 0.6009551525115967, 'rollout_return': -86.75, 'evaluation': -2.29500000000001} step=20
2023-03-07 13:51.13 [info     ] Model parameters are saved to d3rlpy_logs/DQN_online_20230307135032/model_40.pt
2023-03-07 13:51.13 [info     ] DQN_online_20230307135032: epoch=2 step=40 epoch=2 metrics={'time_inference': 0.3378914475440979, 'time_environment_step': 0.0008766531944274902, 'time_step': 0.6662898898124695, 'rollout_return': -11.75, 'time_sample_batch': 0.00020953587123325894, 'time_algorithm_update': 0.003481898988996233, 'loss': 125.31766183035714, 'evaluation': 2.2699999999999934} step=40
2023-03-07 13:51.34 [info     ] Model parameters are saved to d3rlpy_logs/DQN_online_20230307135032

In [None]:
#| hide
from nbdev import nbdev_export

# Run nbdev's export step for this notebook project.
nbdev_export()