In [1]:
import sys 
sys.path.append('..')

import os 
import torch 
import rlcard 
from rlcard.envs.leducholdem import LeducholdemEnv
from rlcard.agents import RandomAgent, DQNAgent, FoldingAgent, CallingAgent, RaisingAgent, CheckingAgent, LeducHoldemRuleAgentBluff, LeducHoldemRuleAgentCall
from rlcard.utils import (
    get_device,
    set_seed,
    tournament,
    reorganize,
    Logger,
    plot_curve,
)

In [2]:
from dataclasses import dataclass, field, asdict
from itertools import product


def dataclass_to_grid(dc):
    """
    Converts a dictionary of arguments into a list of dictionaries where each dictionary is a unique 
    combination of arguments.
    """
    d = asdict(dc)
    keys = d.keys()
    values = (d[key] if isinstance(d[key], list) else [d[key]] for key in keys)
    return [dict(zip(keys, combination)) for combination in product(*values)]


@dataclass 
class AgentParams:
    network: str = field(default='mlp')
    mlp_layers: list = field(default_factory=lambda: [128, 128])
    learning_rate: float = field(default=0.00005)
    replay_memory_size: int = field(default=2000)
    save_every: int = field(default=1000)
    discount_factor: float = field(default=0.99)


@dataclass
class ExperimentParams:
    load_checkpoint_path: str = ""
    log_dir: str = './'
    experiment_name: str = ""
    seed: int = 0


@dataclass 
class TrainingParams:
    num_steps: int = 1000
    num_eval_games: int = 100
    evaluate_every: int = 100
    switch_opponents_every: int | float = float('inf')


In [9]:
def create_env(opponent, experiment_params, agent=None, agent_params=None) -> LeducholdemEnv:
    """
    Creates the Leduc Hold'em environment, setting the selected random seed.
    """
    set_seed(experiment_params.seed)
    env = rlcard.make('limit-holdem', config={'seed': experiment_params.seed})
    if agent is None:
        agent = create_agent(env, experiment_params, agent_params)
    env.set_agents([agent, opponent])
    return env, agent


def create_agent(
    env: LeducholdemEnv, 
    experiment_params: ExperimentParams, 
    agent_params: AgentParams,
) -> DQNAgent:
    device = get_device()
    if experiment_params.load_checkpoint_path != "":
        agent = DQNAgent.from_checkpoint(checkpoint=torch.load(experiment_params.load_checkpoint_path))
    else:
        agent = DQNAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            save_path=os.path.join(experiment_params.log_dir, experiment_params.experiment_name),
            save_every=agent_params.save_every,
            estimator_network=agent_params.network,
            mlp_layers=agent_params.mlp_layers.copy(),
            replay_memory_size=agent_params.replay_memory_size,
            learning_rate=agent_params.learning_rate,
            discount_factor=agent_params.discount_factor,
            device=device,
        )
    return agent 


def train_vs_opponent(opponent, experiment_params: ExperimentParams, training_params: TrainingParams, initial_episode: int = 0, agent_params=None, agent: DQNAgent = None):
    env, agent = create_env(opponent, experiment_params, agent_params=agent_params, agent=agent)
    tournament_results = []
    with Logger(agent.save_path) as logger:
        episode, final_episode = initial_episode, initial_episode + training_params.switch_opponents_every
        while agent.total_t <= training_params.num_steps and (episode <= final_episode):
            # Run the game, parse transitions and feed them to memory replay buffer
            trajectories, payoffs = env.run(is_training=True)
            trajectories = reorganize(trajectories, payoffs)
            for ts in trajectories[0]:
                agent.feed(ts)

            # Maybe run evaluation tournament
            if episode % training_params.evaluate_every == 0:
                tournament_result = tournament(env, training_params.num_eval_games)[0]
                tournament_results.append({
                    'episode': episode,
                    'performance': tournament_result,
                    'opponent': opponent.__class__.__name__,
                })
                logger.log_performance(
                    episode,
                    tournament_result
                )
            episode += 1
    return tournament_results, agent, episode


from itertools import cycle


def train(opponents: list | object, experiment_params: ExperimentParams, training_params: TrainingParams, agent_params: AgentParams, agent: DQNAgent = None):
    if not isinstance(opponents, list):
        opponents = [opponents]
    opponents = cycle(opponents)
    
    # Start training
    results = []
    episode = 0
    while (agent is None) or (agent.total_t <= training_params.num_steps):
        opponent = next(opponents)
        print("Switched opponent to", opponent.__class__.__name__)
        opponent_results, agent, episode = train_vs_opponent(opponent, experiment_params, training_params, agent_params=agent_params, agent=agent, initial_episode=episode)
        results.extend(opponent_results)
    return results, agent

In [10]:
from plotly import graph_objects as go
import pandas as pd 


def plot_results(results):
    df = pd.DataFrame(results)

    # Create a color map for the opponents
    color_map = {'CallingAgent': 'red', 'RaisingAgent': 'green', 'FoldingAgent': 'blue', 'RandomAgent': 'yellow', 
                 'LeducHoldemRuleAgentCall': 'purple', 'LeducHoldemRuleAgentBluff': 'orange'}
    colors = df['opponent'].map(color_map)

    # Create the line trace with colored segments
    trace = go.Scatter(
        x=df['episode'],
        y=df['performance'],
        mode='lines+markers',
        line=dict(
            color='grey',
            width=4
        ),
        marker=dict(
            color=colors,
            size=10
        )
    )

    go.Figure(trace).show()

In [15]:
training_params = TrainingParams(switch_opponents_every=100, num_steps=50000)
agent_params = AgentParams()
experiment_params = ExperimentParams(log_dir='./multiple_agents/')
opponents = [RaisingAgent(), CheckingAgent(), FoldingAgent(), RandomAgent(4)]


opponents_to_results_raw = {}
for opponent in opponents:
    results, agent = train(opponent, experiment_params, training_params, agent_params)
    plot_results(results)
    opponents_to_results_raw[opponent.__class__.__name__] = {'results': results, 'agent': agent}


results, agent = train(opponents, experiment_params, training_params, agent_params)
opponents_to_results_raw['all'] = {'results': results, 'agent': agent} 
plot_results(results)

Switched opponent to RaisingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.91
----------------------------------------
INFO - Step 100, rl-loss: 3.5303938388824463
INFO - Copied model parameters to target network.
INFO - Step 270, rl-loss: 10.360330581665039
----------------------------------------
  episode      |  100
  reward       |  -0.68
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to RaisingAgent
INFO - Step 529, rl-loss: 7.9322690963745125
----------------------------------------
  episode      |  200
  reward       |  -0.075
----------------------------------------
INFO - Step 530, rl-loss: 2.9309959411621094
Logs saved in ./multiple_agents/
Switched opponent to RaisingAgent
INFO - Step 786, rl-loss: 2.0458080768585205
----------------------------------------
  episode      |  300
  reward       |  -0.01
----------------------------------------
INFO - Step 789, rl-lo

Switched opponent to CheckingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  -0.285
----------------------------------------
INFO - Step 100, rl-loss: 1.2304623126983643
INFO - Copied model parameters to target network.
INFO - Step 192, rl-loss: 1.7470304965972944
----------------------------------------
  episode      |  100
  reward       |  0.25
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to CheckingAgent
INFO - Step 356, rl-loss: 1.1470192670822144
----------------------------------------
  episode      |  200
  reward       |  0.21
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to CheckingAgent
INFO - Step 543, rl-loss: 1.5460480451583862
----------------------------------------
  episode      |  300
  reward       |  0.865
----------------------------------------
INFO - Step 544, rl-loss: 0.9372807145118713
Logs saved in ./mul

Switched opponent to FoldingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.745
----------------------------------------

----------------------------------------
  episode      |  100
  reward       |  0.745
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to FoldingAgent

----------------------------------------
  episode      |  200
  reward       |  0.705
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to FoldingAgent
INFO - Step 100, rl-loss: 0.8990653157234192
INFO - Copied model parameters to target network.
INFO - Step 147, rl-loss: 0.5901254415512085
----------------------------------------
  episode      |  300
  reward       |  0.525
----------------------------------------
INFO - Step 148, rl-loss: 0.6810370087623596
Logs saved in ./multiple_agents/
Switched opponent to FoldingAgent
INFO - Step 196, rl-loss: 0.56058633327484

Switched opponent to RandomAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.785
----------------------------------------
INFO - Step 100, rl-loss: 2.166247606277466
INFO - Copied model parameters to target network.
INFO - Step 138, rl-loss: 4.9260869026184085
----------------------------------------
  episode      |  100
  reward       |  0.305
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to RandomAgent
INFO - Step 309, rl-loss: 2.1720485687255869
----------------------------------------
  episode      |  200
  reward       |  0.695
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to RandomAgent
INFO - Step 464, rl-loss: 1.4218262434005737
----------------------------------------
  episode      |  300
  reward       |  0.93
----------------------------------------
INFO - Step 465, rl-loss: 1.6999485492706299
Logs saved in ./multiple_a

Switched opponent to RaisingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.91
----------------------------------------
INFO - Step 100, rl-loss: 3.5303938388824463
INFO - Copied model parameters to target network.
INFO - Step 270, rl-loss: 10.360330581665039
----------------------------------------
  episode      |  100
  reward       |  -0.68
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to CheckingAgent
INFO - Step 434, rl-loss: 2.6709415912628174
----------------------------------------
  episode      |  200
  reward       |  0.0
----------------------------------------
INFO - Step 435, rl-loss: 4.339032173156738
Logs saved in ./multiple_agents/
Switched opponent to FoldingAgent
INFO - Step 484, rl-loss: 0.7628552913665771
----------------------------------------
  episode      |  300
  reward       |  0.555
----------------------------------------
INFO - Step 485, rl-loss:

In [16]:
opponents_to_results = {opponent: value['results'] for opponent, value in opponents_to_results_raw.items()}
max_episode = min(value[-1]['episode'] for value in opponents_to_results.values())
opponents_to_results = {opponent: [result for result in value if result['episode'] <= max_episode] for opponent, value in opponents_to_results.items()}
data = [{'opponents': opponent, **result} for opponent, results in opponents_to_results.items() for result in results]
df = pd.DataFrame(data)

In [17]:
df.groupby(['opponents', 'opponent']).mean().drop(columns='episode')

Unnamed: 0_level_0,Unnamed: 1_level_0,performance
opponents,opponent,Unnamed: 2_level_1
CheckingAgent,CheckingAgent,1.766739
FoldingAgent,FoldingAgent,0.725174
RaisingAgent,RaisingAgent,6.112696
RandomAgent,RandomAgent,2.059609
all,CheckingAgent,1.705
all,FoldingAgent,0.727321
all,RaisingAgent,5.25371
all,RandomAgent,2.374464


In [36]:
data = []
for opponent, results in opponents_to_results.items():
    for result in results:
        data.append({**result, 'opponent': opponent})
        print({**result, 'opponent': opponent})

{'episode': 0, 'performance': 1.23, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 100, 'performance': 0.145, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 200, 'performance': 1.1, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 300, 'performance': 1.405, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 400, 'performance': 1.45, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 500, 'performance': 1.375, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 600, 'performance': 1.375, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 700, 'performance': 1.61, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 800, 'performance': 1.595, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 900, 'performance': 1.59, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 1000, 'performance': 2.08, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 1100, 'performance': 1.935, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 1200, 'performance': 1.98, 'opponen

In [39]:
df = pd.DataFrame(data)
df.groupby('opponent').median()

Unnamed: 0_level_0,episode,performance
opponent,Unnamed: 1_level_1,Unnamed: 2_level_1
FoldingAgent,3400.0,0.75
LeducHoldemRuleAgentBluff,3400.0,2.055
LeducHoldemRuleAgentCall,3400.0,0.895
all,3400.0,1.045


In [5]:
training_params = TrainingParams(switch_opponents_every=100, num_steps=100000)
agent_params = AgentParams()
experiment_params = ExperimentParams(log_dir='./multiple_agents/')
opponents = [CallingAgent(), RaisingAgent(), RandomAgent(4), FoldingAgent(), CheckingAgent()]

In [42]:
for opponent in opponents:
    results, agent = train(opponent, experiment_params, training_params, agent_params)
    plot_results(results)

results, agent = train(opponents, experiment_params, training_params, agent_params)
plot_results(results)

Switched opponent to CallingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.285
----------------------------------------
INFO - Step 100, rl-loss: 1.911059856414795
INFO - Copied model parameters to target network.
INFO - Step 165, rl-loss: 1.7441142797470093
----------------------------------------
  episode      |  100
  reward       |  0.13
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to CallingAgent
INFO - Step 322, rl-loss: 0.37976408004760746
----------------------------------------
  episode      |  200
  reward       |  0.63
----------------------------------------
INFO - Step 324, rl-loss: 0.40481364727020264
Logs saved in ./multiple_agents/
Switched opponent to CallingAgent
INFO - Step 480, rl-loss: 1.94605565071105967
----------------------------------------
  episode      |  300
  reward       |  0.42
----------------------------------------
INFO - Step 482, rl-los

Switched opponent to RaisingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  1.825
----------------------------------------
INFO - Step 100, rl-loss: 7.525012969970703
INFO - Copied model parameters to target network.
INFO - Step 195, rl-loss: 5.5315299034118654
----------------------------------------
  episode      |  100
  reward       |  0.15
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to RaisingAgent
INFO - Step 378, rl-loss: 4.5337605476379395
----------------------------------------
  episode      |  200
  reward       |  0.685
----------------------------------------
INFO - Step 379, rl-loss: 4.8599042892456055
Logs saved in ./multiple_agents/
Switched opponent to RaisingAgent
INFO - Step 561, rl-loss: 1.85765826702117925
----------------------------------------
  episode      |  300
  reward       |  1.425
----------------------------------------
INFO - Step 564, rl-los

Switched opponent to RandomAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.71
----------------------------------------
INFO - Step 100, rl-loss: 1.4091770648956299
INFO - Copied model parameters to target network.
INFO - Step 133, rl-loss: 2.2652642726898193
----------------------------------------
  episode      |  100
  reward       |  -0.04
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to RandomAgent
INFO - Step 266, rl-loss: 1.6030160188674927
----------------------------------------
  episode      |  200
  reward       |  1.095
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to RandomAgent
INFO - Step 396, rl-loss: 0.68101942539215094
----------------------------------------
  episode      |  300
  reward       |  1.14
----------------------------------------
INFO - Step 397, rl-loss: 1.8365583419799805
Logs saved in ./multiple_

Switched opponent to FoldingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.27
----------------------------------------

----------------------------------------
  episode      |  100
  reward       |  0.16
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to FoldingAgent

----------------------------------------
  episode      |  200
  reward       |  0.235
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to FoldingAgent
INFO - Step 100, rl-loss: 0.6226616501808167
INFO - Copied model parameters to target network.
INFO - Step 145, rl-loss: 0.35948494076728826
----------------------------------------
  episode      |  300
  reward       |  0.76
----------------------------------------
INFO - Step 146, rl-loss: 0.4063968062400818
Logs saved in ./multiple_agents/
Switched opponent to FoldingAgent
INFO - Step 191, rl-loss: 0.2497247159481048

Switched opponent to CheckingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.57
----------------------------------------
INFO - Step 100, rl-loss: 2.797032594680786
INFO - Copied model parameters to target network.
INFO - Step 152, rl-loss: 1.4550185203552246
----------------------------------------
  episode      |  100
  reward       |  -0.11
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to CheckingAgent
INFO - Step 272, rl-loss: 0.77120685577392584
----------------------------------------
  episode      |  200
  reward       |  0.04
----------------------------------------
INFO - Step 276, rl-loss: 0.29054778814315796
Logs saved in ./multiple_agents/
Switched opponent to CheckingAgent
INFO - Step 395, rl-loss: 0.85471981763839726
----------------------------------------
  episode      |  300
  reward       |  0.345
----------------------------------------
INFO - Step 396, rl

Switched opponent to CallingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.285
----------------------------------------
INFO - Step 100, rl-loss: 1.911059856414795
INFO - Copied model parameters to target network.
INFO - Step 165, rl-loss: 1.7441142797470093
----------------------------------------
  episode      |  100
  reward       |  0.13
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to RaisingAgent
INFO - Step 349, rl-loss: 1.79754519462585456
----------------------------------------
  episode      |  200
  reward       |  1.12
----------------------------------------
INFO - Step 350, rl-loss: 2.819002866744995
Logs saved in ./multiple_agents/
Switched opponent to RandomAgent
INFO - Step 480, rl-loss: 3.9415473937988285
----------------------------------------
  episode      |  300
  reward       |  0.945
----------------------------------------
INFO - Step 481, rl-loss: 

In [43]:
training_params = TrainingParams(switch_opponents_every=5000, num_steps=100000)
agent_params = AgentParams()
experiment_params = ExperimentParams(log_dir='./multiple_agents/5000/')
opponents = [CallingAgent(), RaisingAgent(), RandomAgent(4), FoldingAgent(), CheckingAgent()]

In [44]:
for opponent in opponents:
    results, agent = train(opponent, experiment_params, training_params, agent_params)
    plot_results(results)

results, agent = train(opponents, experiment_params, training_params, agent_params)
plot_results(results)

Switched opponent to CallingAgent


--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.285
----------------------------------------
INFO - Step 100, rl-loss: 1.911059856414795
INFO - Copied model parameters to target network.
INFO - Step 165, rl-loss: 1.7441142797470093
----------------------------------------
  episode      |  100
  reward       |  0.13
----------------------------------------
INFO - Step 330, rl-loss: 2.38386297225952156
----------------------------------------
  episode      |  200
  reward       |  0.51
----------------------------------------
INFO - Step 491, rl-loss: 2.69745635986328124
----------------------------------------
  episode      |  300
  reward       |  0.63
----------------------------------------
INFO - Step 658, rl-loss: 1.15135562419891367
----------------------------------------
  episode      |  400
  reward       |  0.46
----------------------------------------
INFO - Step 817, rl-loss: 3.69314694404602053
-------------------

Switched opponent to RaisingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  1.825
----------------------------------------
INFO - Step 100, rl-loss: 7.525012969970703
INFO - Copied model parameters to target network.
INFO - Step 195, rl-loss: 5.5315299034118654
----------------------------------------
  episode      |  100
  reward       |  0.15
----------------------------------------
INFO - Step 372, rl-loss: 2.17288494110107468
----------------------------------------
  episode      |  200
  reward       |  0.17
----------------------------------------
INFO - Step 545, rl-loss: 3.81930875778198246
----------------------------------------
  episode      |  300
  reward       |  0.98
----------------------------------------
INFO - Step 732, rl-loss: 0.17358103394508362
----------------------------------------
  episode      |  400
  reward       |  1.85
----------------------------------------
INFO - Step 904, rl-loss: 3.287

Switched opponent to RandomAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.71
----------------------------------------
INFO - Step 100, rl-loss: 1.4091770648956299
INFO - Copied model parameters to target network.
INFO - Step 133, rl-loss: 2.2652642726898193
----------------------------------------
  episode      |  100
  reward       |  -0.04
----------------------------------------
INFO - Step 252, rl-loss: 0.36430895328521733
----------------------------------------
  episode      |  200
  reward       |  0.725
----------------------------------------
INFO - Step 373, rl-loss: 1.22511029243469243
----------------------------------------
  episode      |  300
  reward       |  1.335
----------------------------------------
INFO - Step 507, rl-loss: 1.70922696590423586
----------------------------------------
  episode      |  400
  reward       |  0.865
----------------------------------------
INFO - Step 638, rl-loss: 3.

Switched opponent to FoldingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.27
----------------------------------------

----------------------------------------
  episode      |  100
  reward       |  0.16
----------------------------------------
INFO - Step 100, rl-loss: 0.7277150750160217
INFO - Copied model parameters to target network.
INFO - Step 105, rl-loss: 0.7217839956283569
----------------------------------------
  episode      |  200
  reward       |  0.13
----------------------------------------
INFO - Step 150, rl-loss: 0.39784502983093264
----------------------------------------
  episode      |  300
  reward       |  0.76
----------------------------------------
INFO - Step 205, rl-loss: 0.13889811933040624
----------------------------------------
  episode      |  400
  reward       |  0.745
----------------------------------------
INFO - Step 252, rl-loss: 0.08540394902229309
-----------------------------

Switched opponent to CheckingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.57
----------------------------------------
INFO - Step 100, rl-loss: 2.797032594680786
INFO - Copied model parameters to target network.
INFO - Step 152, rl-loss: 1.4550185203552246
----------------------------------------
  episode      |  100
  reward       |  -0.11
----------------------------------------
INFO - Step 285, rl-loss: 0.73994100093841552
----------------------------------------
  episode      |  200
  reward       |  0.395
----------------------------------------
INFO - Step 427, rl-loss: 0.90939629077911384
----------------------------------------
  episode      |  300
  reward       |  0.46
----------------------------------------
INFO - Step 566, rl-loss: 0.64119160175323496
----------------------------------------
  episode      |  400
  reward       |  0.635
----------------------------------------
INFO - Step 694, rl-loss: 0.

Switched opponent to CallingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.285
----------------------------------------
INFO - Step 100, rl-loss: 1.911059856414795
INFO - Copied model parameters to target network.
INFO - Step 165, rl-loss: 1.7441142797470093
----------------------------------------
  episode      |  100
  reward       |  0.13
----------------------------------------
INFO - Step 330, rl-loss: 2.38386297225952156
----------------------------------------
  episode      |  200
  reward       |  0.51
----------------------------------------
INFO - Step 491, rl-loss: 2.69745635986328124
----------------------------------------
  episode      |  300
  reward       |  0.63
----------------------------------------
INFO - Step 658, rl-loss: 1.15135562419891367
----------------------------------------
  episode      |  400
  reward       |  0.46
----------------------------------------
INFO - Step 817, rl-loss: 3.693