In [1]:
import sys 
sys.path.append('..')

import os 
import torch 
import rlcard 
from rlcard.envs.leducholdem import LeducholdemEnv
from rlcard.agents import RandomAgent, DQNAgent, FoldingAgent, CallingAgent, RaisingAgent, CheckingAgent, LeducHoldemRuleAgentBluff, LeducHoldemRuleAgentCall, NFSPAgent
from rlcard.utils import (
    get_device,
    set_seed,
    tournament,
    reorganize,
    Logger,
    plot_curve,
)

In [2]:
from dataclasses import dataclass, field, asdict
from itertools import product


def dataclass_to_grid(dc):
    """
    Converts a dictionary of arguments into a list of dictionaries where each dictionary is a unique 
    combination of arguments.
    """
    d = asdict(dc)
    keys = d.keys()
    values = (d[key] if isinstance(d[key], list) else [d[key]] for key in keys)
    return [dict(zip(keys, combination)) for combination in product(*values)]


@dataclass 
class AgentParams:
    name: str = 'dqn'
    network: str = field(default='mlp')
    mlp_layers: list = field(default_factory=lambda: [128, 128])
    learning_rate: float = field(default=0.00005)
    replay_memory_size: int = field(default=100000)
    save_every: int = field(default=1000)
    discount_factor: float = field(default=0.99)
    memory_sequence_length: int = field(default=128)
    d_model: int = 128
    num_layers: int = 2 
    rl_learning_rate: float = 0.000001
    sl_learning_rate: float = 0.0003
    anticipatory_param: float = 0.1

    def __post_init__(self):
        if self.name == 'dqn':
            self.mlp_layers = [128, 128]
            self.d_model = 128
        if self.name == 'nfsp':
            self.mlp_layers = [512, 512]
            self.d_model = 512
        self.q_mlp_layers: list[int] = self.mlp_layers
        self.q_replay_memory_size: int = self.replay_memory_size
        self.q_discount_factor: float = self.discount_factor


@dataclass
class ExperimentParams:
    load_checkpoint_path: str = ""
    log_dir: str = './'
    experiment_name: str = ""
    seed: int = 0


@dataclass 
class TrainingParams:
    num_steps: int = 1000
    num_eval_games: int = 100
    evaluate_every: int = 100
    switch_opponents_every: int | float = float('inf')


In [17]:
def create_env(opponent, experiment_params, agent=None, agent_params=None) -> LeducholdemEnv:
    """
    Creates the Leduc Hold'em environment, setting the selected random seed.
    """
    set_seed(experiment_params.seed)
    env = rlcard.make('leduc-holdem', config={'seed': experiment_params.seed})
    if agent is None:
        agent = create_agent(env, experiment_params, agent_params)
    env.set_agents([agent, opponent])
    return env, agent


def create_agent(
    env: LeducholdemEnv, 
    experiment_params: ExperimentParams, 
    agent_params: AgentParams,
) -> DQNAgent:
    device = get_device()
    print(agent_params.name)
    if agent_params.name == 'dqn':
        if experiment_params.load_checkpoint_path != "":
            agent = DQNAgent.from_checkpoint(checkpoint=torch.load(experiment_params.load_checkpoint_path))
            agent.save_path = os.path.join(experiment_params.log_dir, experiment_params.experiment_name)
        else:
            agent = DQNAgent(
                num_actions=env.num_actions,
                state_shape=env.state_shape[0],
                save_path=os.path.join(experiment_params.log_dir, experiment_params.experiment_name),
                save_every=agent_params.save_every,
                estimator_network=agent_params.network,
                mlp_layers=agent_params.mlp_layers.copy(),
                replay_memory_size=agent_params.replay_memory_size,
                learning_rate=agent_params.learning_rate,
                discount_factor=agent_params.discount_factor,
                device=device,
                memory_sequence_length=agent_params.memory_sequence_length,
            )
    elif agent_params.name == 'nfsp':
        if experiment_params.load_checkpoint_path != "":
            agent = NFSPAgent.from_checkpoint(checkpoint=torch.load(experiment_params.load_checkpoint_path))
            agent.save_path = os.path.join(experiment_params.log_dir, experiment_params.experiment_name)
        else:
            agent = NFSPAgent(
                num_actions=env.num_actions,
                state_shape=env.state_shape[0],
                device=device,
                save_path=os.path.join(experiment_params.log_dir, experiment_params.experiment_name),
                save_every=agent_params.save_every,
                estimator_network=agent_params.network,
                q_mlp_layers=agent_params.q_mlp_layers,
                q_replay_memory_size=agent_params.q_replay_memory_size,
                q_discount_factor=agent_params.q_discount_factor,
                rl_learning_rate=agent_params.rl_learning_rate,
                sl_learning_rate=agent_params.sl_learning_rate,
                anticipatory_param=agent_params.anticipatory_param,
                memory_sequence_length=agent_params.memory_sequence_length,
            )
    return agent 


def train_vs_opponent(opponent, experiment_params: ExperimentParams, training_params: TrainingParams, initial_episode: int = 0, agent_params=None, agent: DQNAgent = None):
    env, agent = create_env(opponent, experiment_params, agent_params=agent_params, agent=agent)
    tournament_results = []
    with Logger(agent.save_path) as logger:
        episode, final_episode = initial_episode, initial_episode + training_params.switch_opponents_every
        while agent.total_t <= training_params.num_steps and (episode <= final_episode):
            if isinstance(agent, NFSPAgent):
                agent.sample_episode_policy()

            # Run the game, parse transitions and feed them to memory replay buffer
            trajectories, payoffs = env.run(is_training=True)
            trajectories = reorganize(trajectories, payoffs)
            for ts in trajectories[0]:
                # print(f"Trajectories {ts}")
                agent.feed(ts)

            if episode % training_params.evaluate_every == 0:
                tournament_result = tournament(env, training_params.num_eval_games)[0]
                tournament_results.append({
                    'episode': episode,
                    'performance': tournament_result,
                    'opponent': opponent.__class__.__name__,
                })
                logger.log_performance(
                    episode,
                    tournament_result
                )
            episode += 1
    return tournament_results, agent, episode


from itertools import cycle


def train(opponents: list | object, experiment_params: ExperimentParams, training_params: TrainingParams, agent_params: AgentParams, agent: DQNAgent = None):
    if not isinstance(opponents, list):
        opponents = [opponents]
    opponents = cycle(opponents)
    
    # Start training
    results = []
    episode = 0
    while (agent is None) or (agent.total_t <= training_params.num_steps):
        opponent = next(opponents)
        print("Switched opponent to", opponent.__class__.__name__)
        opponent_results, agent, episode = train_vs_opponent(opponent, experiment_params, training_params, agent_params=agent_params, agent=agent, initial_episode=episode)
        results.extend(opponent_results)
    return results, agent

In [7]:
from plotly import graph_objects as go
import pandas as pd 


def plot_results(results):
    df = pd.DataFrame(results)

    # Create a color map for the opponents
    color_map = {'CallingAgent': 'red', 'RaisingAgent': 'green', 'FoldingAgent': 'blue', 'RandomAgent': 'yellow', 
                 'LeducHoldemRuleAgentCall': 'purple', 'LeducHoldemRuleAgentBluff': 'orange'}
    colors = df['opponent'].map(color_map)

    # Create the line trace with colored segments
    trace = go.Scatter(
        x=df['episode'],
        y=df['performance'],
        mode='lines+markers',
        line=dict(
            color='grey',
            width=4
        ),
        marker=dict(
            color=colors,
            size=10
        )
    )

    go.Figure(trace).show()

In [25]:
training_params = TrainingParams(switch_opponents_every=float('inf'), num_steps=25000, num_eval_games=100)
agent_params = AgentParams(name='dqn', network='transformer', memory_sequence_length=10, learning_rate=0.00005)
experiment_params = ExperimentParams(log_dir=f'./experiments/random_agent/{agent_params.name}/{agent_params.network}-memory_sequence_length=1')
opponents = [RandomAgent(4)]


opponents_to_results_raw = {}
for opponent in opponents:
    env, agent = create_env(opponent, experiment_params, agent_params=agent_params)
    results, agent = train(opponent, experiment_params, training_params, agent_params, agent=agent)
    plot_results(results)
    opponents_to_results_raw[opponent.__class__.__name__] = {'results': results, 'agent': agent}

plot_results(results)

--> Running on the CPU
dqn
Switched opponent to RandomAgent

----------------------------------------
  episode      |  0
  reward       |  0.51
----------------------------------------
INFO - Step 100, rl-loss: 3.0630455017089844
INFO - Copied model parameters to target network.
INFO - Step 140, rl-loss: 2.2785181999206543
----------------------------------------
  episode      |  100
  reward       |  1.1035
----------------------------------------
INFO - Step 273, rl-loss: 1.5917124748229984
----------------------------------------
  episode      |  200
  reward       |  1.19475
----------------------------------------
INFO - Step 400, rl-loss: 2.41566252708435066
----------------------------------------
  episode      |  300
  reward       |  1.059
----------------------------------------
INFO - Step 529, rl-loss: 3.45024013519287185
----------------------------------------
  episode      |  400
  reward       |  1.1625
----------------------------------------
INFO - Step 674, rl-l

KeyboardInterrupt: 

In [23]:
training_params = TrainingParams(switch_opponents_every=float('inf'), num_steps=25000, num_eval_games=2000)
agent_params = AgentParams(name='dqn', network='transformer', memory_sequence_length=1)
experiment_params = ExperimentParams(log_dir=f'./experiments/random_agent/{agent_params.name}/{agent_params.network}-memory_sequence_length=1')
opponents = [RandomAgent(4)]


opponents_to_results_raw = {}
for opponent in opponents:
    env, agent = create_env(opponent, experiment_params, agent_params=agent_params)
    results, agent = train(opponent, experiment_params, training_params, agent_params, agent=agent)
    plot_results(results)
    opponents_to_results_raw[opponent.__class__.__name__] = {'results': results, 'agent': agent}

plot_results(results)

--> Running on the CPU
dqn
Switched opponent to RandomAgent

----------------------------------------
  episode      |  0
  reward       |  0.51
----------------------------------------
INFO - Step 100, rl-loss: 3.2586095333099365
INFO - Copied model parameters to target network.
INFO - Step 140, rl-loss: 1.2629560232162476
----------------------------------------
  episode      |  100
  reward       |  0.92
----------------------------------------
INFO - Step 258, rl-loss: 1.2598267793655396
----------------------------------------
  episode      |  200
  reward       |  0.79225
----------------------------------------
INFO - Step 400, rl-loss: 3.1520383358001714
----------------------------------------
  episode      |  300
  reward       |  1.07325
----------------------------------------
INFO - Step 537, rl-loss: 2.1622893810272217
----------------------------------------
  episode      |  400
  reward       |  1.03425
----------------------------------------
INFO - Step 663, rl-lo

In [21]:
training_params = TrainingParams(switch_opponents_every=float('inf'), num_steps=50000, num_eval_games=100)
agent_params = AgentParams(name='dqn', network='transformer', memory_sequence_length=10)
experiment_params = ExperimentParams(log_dir=f'./experiments/random_agent/{agent_params.name}/{agent_params.network}-memory_sequence_length=128')
opponents = [RandomAgent(4)]


opponents_to_results_raw = {}
for opponent in opponents:
    env, agent = create_env(opponent, experiment_params, agent_params=agent_params)
    results, agent = train(opponent, experiment_params, training_params, agent_params, agent=agent)
    plot_results(results)
    opponents_to_results_raw[opponent.__class__.__name__] = {'results': results, 'agent': agent}

plot_results(results)

--> Running on the CPU
dqn
Switched opponent to RandomAgent

----------------------------------------
  episode      |  0
  reward       |  0.51
----------------------------------------
INFO - Step 100, rl-loss: 2.45314359664917
INFO - Copied model parameters to target network.
INFO - Step 140, rl-loss: 2.1811897754669197
----------------------------------------
  episode      |  100
  reward       |  1.08625
----------------------------------------
INFO - Step 265, rl-loss: 1.2861149311065674
----------------------------------------
  episode      |  200
  reward       |  1.229
----------------------------------------
INFO - Step 392, rl-loss: 2.0087811946868896
----------------------------------------
  episode      |  300
  reward       |  1.085
----------------------------------------
INFO - Step 532, rl-loss: 2.01113677024841313
----------------------------------------
  episode      |  400
  reward       |  1.191
----------------------------------------
INFO - Step 667, rl-loss: 

In [22]:
training_params = TrainingParams(switch_opponents_every=float('inf'), num_steps=25000, num_eval_games=2000)
agent_params = AgentParams(name='dqn', network='mlp', memory_sequence_length=10)
experiment_params = ExperimentParams(log_dir=f'./experiments/random_agent/{agent_params.name}/{agent_params.network}-memory_sequence_length=128')
opponents = [RandomAgent(4)]


opponents_to_results_raw = {}
for opponent in opponents:
    env, agent = create_env(opponent, experiment_params, agent_params=agent_params)
    results, agent = train(opponent, experiment_params, training_params, agent_params, agent=agent)
    plot_results(results)
    opponents_to_results_raw[opponent.__class__.__name__] = {'results': results, 'agent': agent}

plot_results(results)

--> Running on the CPU
dqn
Switched opponent to RandomAgent

----------------------------------------
  episode      |  0
  reward       |  0.38075
----------------------------------------
INFO - Step 100, rl-loss: 2.255739688873291
INFO - Copied model parameters to target network.
INFO - Step 128, rl-loss: 2.9989149570465093
----------------------------------------
  episode      |  100
  reward       |  0.38475
----------------------------------------
INFO - Step 240, rl-loss: 1.1527091264724731
----------------------------------------
  episode      |  200
  reward       |  0.90725
----------------------------------------
INFO - Step 394, rl-loss: 1.21662855148315436
----------------------------------------
  episode      |  300
  reward       |  0.85325
----------------------------------------
INFO - Step 521, rl-loss: 2.83578824996948245
----------------------------------------
  episode      |  400
  reward       |  0.98475
----------------------------------------
INFO - Step 640

In [34]:
training_params = TrainingParams(switch_opponents_every=float('inf'), num_steps=50000)
agent_params = AgentParams(name='nfsp', network='mlp')

experiment_params = ExperimentParams(log_dir=f'./experiments/random_agent/{agent_params.name}/{agent_params.network}')
opponents = [RandomAgent(4)]


opponents_to_results_raw = {}
for opponent in opponents:
    results, agent = train(opponent, experiment_params, training_params, agent_params)
    plot_results(results)
    opponents_to_results_raw[opponent.__class__.__name__] = {'results': results, 'agent': agent}

plot_results(results)

Switched opponent to RandomAgent
--> Running on the CPU
nfsp

----------------------------------------
  episode      |  0
  reward       |  -0.31
----------------------------------------
INFO - Step 100, rl-loss: 2.9751217365264893
INFO - Copied model parameters to target network.
INFO - Step 134, rl-loss: 1.9677889347076416
----------------------------------------
  episode      |  100
  reward       |  0.04
----------------------------------------
INFO - Step 264, rl-loss: 2.6966521739959717
----------------------------------------
  episode      |  200
  reward       |  -0.105
----------------------------------------
INFO - Step 394, rl-loss: 2.1696228981018066
----------------------------------------
  episode      |  300
  reward       |  -0.255
----------------------------------------
INFO - Step 518, rl-loss: 1.52176237106323245
----------------------------------------
  episode      |  400
  reward       |  -0.05
----------------------------------------
INFO - Step 636, rl-los

In [26]:
training_params = TrainingParams(switch_opponents_every=100, num_steps=50000)
agent_params = AgentParams(name='dqn', network='transformer', memory_sequence_length=10)
experiment_params = ExperimentParams(log_dir=f'./experiments/random_agent/{agent_params.name}/{agent_params.network}-memory_sequence_length=128')
opponents = [LeducHoldemRuleAgentBluff(), LeducHoldemRuleAgentCall()]

# opponents_to_results_raw = {}
# for opponent in opponents:
#     experiment_params = ExperimentParams(log_dir=f'./experiments/bluff_call/{opponent.__class__.__name__}/{agent_params.name}/{agent_params.network}')
#     results, agent = train(opponent, experiment_params, training_params, agent_params)
#     plot_results(results)
#     opponents_to_results_raw[opponent.__class__.__name__] = {'results': results, 'agent': agent}

experiment_params = ExperimentParams(log_dir=f'./experiments/bluff_call/all/{agent_params.name}/{agent_params.network}')
results, agent = train(opponents, experiment_params, training_params, agent_params)

plot_results(results)

Switched opponent to LeducHoldemRuleAgentBluff
--> Running on the CPU
dqn

----------------------------------------
  episode      |  0
  reward       |  0.79
----------------------------------------
INFO - Step 100, rl-loss: 3.1264395713806152
INFO - Copied model parameters to target network.
INFO - Step 152, rl-loss: 1.6103667020797736
----------------------------------------
  episode      |  100
  reward       |  1.56
----------------------------------------

Logs saved in ./experiments/bluff_call/all/dqn/transformer/
Switched opponent to LeducHoldemRuleAgentCall
INFO - Step 235, rl-loss: 1.4135994911193848
----------------------------------------
  episode      |  200
  reward       |  0.57
----------------------------------------

Logs saved in ./experiments/bluff_call/all/dqn/transformer/
Switched opponent to LeducHoldemRuleAgentBluff
INFO - Step 367, rl-loss: 2.4651386737823486
----------------------------------------
  episode      |  300
  reward       |  1.83
---------------

In [36]:
df = pd.DataFrame(results).drop(columns='opponent').rename(columns={'performance': 'reward'}).to_csv(f'./experiments/bluff_call/all/{agent_params.name}/{agent_params.network}/performance.csv', index=False)

In [34]:
training_params = TrainingParams(switch_opponents_every=100, num_steps=50000)
agent_params = AgentParams(name='nfsp', network='mlp', memory_sequence_length=10)
experiment_params = ExperimentParams(log_dir=f'./experiments/bluff_call/all/{agent_params.name}/{agent_params.network}-memory_sequence_length=128')
opponents = [LeducHoldemRuleAgentBluff(), LeducHoldemRuleAgentCall()]

# opponents_to_results_raw = {}
# for opponent in opponents:
#     experiment_params = ExperimentParams(log_dir=f'./experiments/bluff_call/{opponent.__class__.__name__}/{agent_params.name}/{agent_params.network}')
#     results, agent = train(opponent, experiment_params, training_params, agent_params)
#     plot_results(results)
#     opponents_to_results_raw[opponent.__class__.__name__] = {'results': results, 'agent': agent}

experiment_params = ExperimentParams(log_dir=f'./experiments/bluff_call/all/{agent_params.name}/{agent_params.network}')
results, agent = train(opponents, experiment_params, training_params, agent_params)

plot_results(results)

Switched opponent to LeducHoldemRuleAgentBluff
--> Running on the CPU
nfsp

----------------------------------------
  episode      |  0
  reward       |  -0.115
----------------------------------------
INFO - Step 100, rl-loss: 3.0089609622955322
INFO - Copied model parameters to target network.
INFO - Step 133, rl-loss: 3.1791982650756836
----------------------------------------
  episode      |  100
  reward       |  -0.035
----------------------------------------

Logs saved in ./experiments/bluff_call/all/nfsp/mlp/
Switched opponent to LeducHoldemRuleAgentCall
INFO - Step 227, rl-loss: 1.8643547296524048
----------------------------------------
  episode      |  200
  reward       |  0.095
----------------------------------------
INFO - Step 228, rl-loss: 0.9850258827209473
Logs saved in ./experiments/bluff_call/all/nfsp/mlp/
Switched opponent to LeducHoldemRuleAgentBluff
INFO - Step 362, rl-loss: 0.9610283374786377
----------------------------------------
  episode      |  300
  

In [38]:
training_params = TrainingParams(switch_opponents_every=100, num_steps=50000)
agent_params = AgentParams(name='dqn', network='mlp', memory_sequence_length=10)
experiment_dir = f"./experiments/bluff_call/all/{agent_params.name}/{agent_params.network}"
performance_path = os.path.join(experiment_dir, 'performance.csv')
experiment_params = ExperimentParams(log_dir=experiment_dir)
opponents = [LeducHoldemRuleAgentBluff(), LeducHoldemRuleAgentCall()]

experiment_params = ExperimentParams(log_dir=f'./experiments/bluff_call/all/{agent_params.name}/{agent_params.network}')
results, agent = train(opponents, experiment_params, training_params, agent_params)
plot_results(results)

pd.DataFrame(results).rename(columns={'performance': 'reward'}).to_csv(performance_path, index=False)

Switched opponent to LeducHoldemRuleAgentBluff
--> Running on the CPU
dqn

----------------------------------------
  episode      |  0
  reward       |  1.23
----------------------------------------
INFO - Step 100, rl-loss: 2.9780619144439697
INFO - Copied model parameters to target network.
INFO - Step 145, rl-loss: 2.4218301773071298
----------------------------------------
  episode      |  100
  reward       |  0.145
----------------------------------------

Logs saved in ./experiments/bluff_call/all/dqn/mlp/
Switched opponent to LeducHoldemRuleAgentCall
INFO - Step 248, rl-loss: 1.0711170434951782
----------------------------------------
  episode      |  200
  reward       |  0.585
----------------------------------------
INFO - Step 250, rl-loss: 1.6387248039245605
Logs saved in ./experiments/bluff_call/all/dqn/mlp/
Switched opponent to LeducHoldemRuleAgentBluff
INFO - Step 394, rl-loss: 3.60998749732971216
----------------------------------------
  episode      |  300
  rewar

KeyboardInterrupt: 

In [40]:
training_params = TrainingParams(switch_opponents_every=100, num_steps=50000)
agent_params = AgentParams(name='dqn', network='mlp', memory_sequence_length=10)
experiment_dir = f'./experiments/bluff_call/all/{agent_params.name}/{agent_params.network}'
performance_path = os.path.join(experiment_dir, 'performance.csv')

experiment_params = ExperimentParams(log_dir=experiment_dir)
opponents = [LeducHoldemRuleAgentBluff(), LeducHoldemRuleAgentCall()]
results, agent = train(opponents, experiment_params, training_params, agent_params)
plot_results(results)

pd.DataFrame(results).rename(columns={'performance': 'reward'}).to_csv(performance_path, index=False)


Switched opponent to LeducHoldemRuleAgentBluff
--> Running on the CPU
dqn

----------------------------------------
  episode      |  0
  reward       |  1.23
----------------------------------------
INFO - Step 100, rl-loss: 2.9780619144439697
INFO - Copied model parameters to target network.
INFO - Step 145, rl-loss: 2.4218301773071298
----------------------------------------
  episode      |  100
  reward       |  0.145
----------------------------------------

Logs saved in ./experiments/bluff_call/all/dqn/mlp/
Switched opponent to LeducHoldemRuleAgentCall
INFO - Step 248, rl-loss: 1.0711170434951782
----------------------------------------
  episode      |  200
  reward       |  0.585
----------------------------------------
INFO - Step 250, rl-loss: 1.6387248039245605
Logs saved in ./experiments/bluff_call/all/dqn/mlp/
Switched opponent to LeducHoldemRuleAgentBluff
INFO - Step 394, rl-loss: 3.60998749732971216
----------------------------------------
  episode      |  300
  rewar

In [41]:
training_params = TrainingParams(switch_opponents_every=100, num_steps=50000)
agent_params = AgentParams(name='dqn', network='transformer', memory_sequence_length=10)
experiment_dir = f'./experiments/bluff_call/all/{agent_params.name}/{agent_params.network}'
performance_path = os.path.join(experiment_dir, 'performance.csv')

experiment_params = ExperimentParams(log_dir=experiment_dir)
opponents = [LeducHoldemRuleAgentBluff(), LeducHoldemRuleAgentCall()]
results, agent = train(opponents, experiment_params, training_params, agent_params)
plot_results(results)

pd.DataFrame(results).rename(columns={'performance': 'reward'}).to_csv(performance_path, index=False)


Switched opponent to LeducHoldemRuleAgentBluff
--> Running on the CPU
dqn

----------------------------------------
  episode      |  0
  reward       |  0.79
----------------------------------------
INFO - Step 100, rl-loss: 3.1264395713806152
INFO - Copied model parameters to target network.
INFO - Step 152, rl-loss: 1.6103667020797736
----------------------------------------
  episode      |  100
  reward       |  1.56
----------------------------------------

Logs saved in ./experiments/bluff_call/all/dqn/transformer/
Switched opponent to LeducHoldemRuleAgentCall
INFO - Step 235, rl-loss: 1.4135994911193848
----------------------------------------
  episode      |  200
  reward       |  0.57
----------------------------------------

Logs saved in ./experiments/bluff_call/all/dqn/transformer/
Switched opponent to LeducHoldemRuleAgentBluff
INFO - Step 367, rl-loss: 2.4651386737823486
----------------------------------------
  episode      |  300
  reward       |  1.83
---------------

In [42]:
training_params = TrainingParams(switch_opponents_every=100, num_steps=50000)
agent_params = AgentParams(name='nfsp', network='transformer', memory_sequence_length=10)
experiment_dir = f'./experiments/bluff_call/all/{agent_params.name}/{agent_params.network}'
performance_path = os.path.join(experiment_dir, 'performance.csv')

experiment_params = ExperimentParams(log_dir=experiment_dir)
opponents = [LeducHoldemRuleAgentBluff(), LeducHoldemRuleAgentCall()]
results, agent = train(opponents, experiment_params, training_params, agent_params)
plot_results(results)

pd.DataFrame(results).rename(columns={'performance': 'reward'}).to_csv(performance_path, index=False)


Switched opponent to LeducHoldemRuleAgentBluff
--> Running on the CPU
nfsp

----------------------------------------
  episode      |  0
  reward       |  0.39
----------------------------------------
INFO - Step 100, rl-loss: 1.89412260055542
INFO - Copied model parameters to target network.
INFO - Step 159, rl-loss: 7.5173120498657235
----------------------------------------
  episode      |  100
  reward       |  0.2
----------------------------------------

Logs saved in ./experiments/bluff_call/all/nfsp/transformer/
Switched opponent to LeducHoldemRuleAgentCall
INFO - Step 245, rl-loss: 3.1759548187255864
----------------------------------------
  episode      |  200
  reward       |  0.185
----------------------------------------
INFO - Step 246, rl-loss: 3.6275336742401123
Logs saved in ./experiments/bluff_call/all/nfsp/transformer/
Switched opponent to LeducHoldemRuleAgentBluff
INFO - Step 374, rl-loss: 3.1991114616394043
----------------------------------------
  episode      

In [43]:
training_params = TrainingParams(switch_opponents_every=100, num_steps=50000)
agent_params = AgentParams(name='nfsp', network='mlp', memory_sequence_length=10)
experiment_dir = f'./experiments/bluff_call/all/{agent_params.name}/{agent_params.network}'
performance_path = os.path.join(experiment_dir, 'performance.csv')

experiment_params = ExperimentParams(log_dir=experiment_dir)
opponents = [LeducHoldemRuleAgentBluff(), LeducHoldemRuleAgentCall()]
results, agent = train(opponents, experiment_params, training_params, agent_params)
plot_results(results)

pd.DataFrame(results).rename(columns={'performance': 'reward'}).to_csv(performance_path, index=False)


Switched opponent to LeducHoldemRuleAgentBluff
--> Running on the CPU
nfsp

----------------------------------------
  episode      |  0
  reward       |  -0.115
----------------------------------------
INFO - Step 100, rl-loss: 3.0089609622955322
INFO - Copied model parameters to target network.
INFO - Step 133, rl-loss: 3.1791982650756836
----------------------------------------
  episode      |  100
  reward       |  -0.035
----------------------------------------

Logs saved in ./experiments/bluff_call/all/nfsp/mlp/
Switched opponent to LeducHoldemRuleAgentCall
INFO - Step 227, rl-loss: 1.8643547296524048
----------------------------------------
  episode      |  200
  reward       |  0.095
----------------------------------------
INFO - Step 228, rl-loss: 0.9850258827209473
Logs saved in ./experiments/bluff_call/all/nfsp/mlp/
Switched opponent to LeducHoldemRuleAgentBluff
INFO - Step 362, rl-loss: 0.9610283374786377
----------------------------------------
  episode      |  300
  

In [33]:
pd.DataFrame(results).drop(columns='opponent').to_csv(f'./experiments/bluff_call/all/{agent_params.name}/{agent_params.network}/performance.csv', index=False)

In [12]:
agent_params = AgentParams(name='dqn', network='transformer', memory_sequence_length=128)
experiment_params = ExperimentParams(log_dir=f'./')
env, agent = create_env(RandomAgent(4), experiment_params, agent_params=agent_params)

--> Running on the CPU
dqn


In [17]:
agent.memory.max_sequence_length

10

In [8]:
training_params = TrainingParams(switch_opponents_every=float('inf'), num_steps=50000)
agent_params = AgentParams(name='dqn', network='mlp', memory_sequence_length=128)
experiment_params = ExperimentParams(log_dir=f'./experiments/bluff_call/{agent_params.name}/{agent_params.network}')
opponents = [LeducHoldemRuleAgentBluff(), LeducHoldemRuleAgentCall()]


opponents_to_results_raw = {}
for opponent in [LeducHoldemRuleAgentBluff()]:
    experiment_params = ExperimentParams(log_dir=f'./experiments/bluff_call/{opponent.__class__.__name__}/{agent_params.name}/{agent_params.network}')
    results, agent = train(opponent, experiment_params, training_params, agent_params)
    plot_results(results)
    opponents_to_results_raw[opponent.__class__.__name__] = {'results': results, 'agent': agent}

experiment_params = ExperimentParams(log_dir=f'./experiments/bluff_call/all/{agent_params.name}/{agent_params.network}')
results, agent = train(opponents, experiment_params, training_params, agent_params)

plot_results(results)

Switched opponent to LeducHoldemRuleAgentBluff
--> Running on the CPU
dqn

----------------------------------------
  episode      |  0
  reward       |  1.23
----------------------------------------
INFO - Step 100, rl-loss: 2.9780619144439697
INFO - Copied model parameters to target network.
INFO - Step 145, rl-loss: 2.4218301773071298
----------------------------------------
  episode      |  100
  reward       |  0.145
----------------------------------------
INFO - Step 267, rl-loss: 1.1350542306900024
----------------------------------------
  episode      |  200
  reward       |  1.305
----------------------------------------
INFO - Step 396, rl-loss: 1.22061085700988775
----------------------------------------
  episode      |  300
  reward       |  1.715
----------------------------------------
INFO - Step 535, rl-loss: 0.82336914539337166
----------------------------------------
  episode      |  400
  reward       |  1.45
----------------------------------------
INFO - Step 

Switched opponent to LeducHoldemRuleAgentBluff
--> Running on the CPU
dqn

----------------------------------------
  episode      |  0
  reward       |  1.23
----------------------------------------
INFO - Step 100, rl-loss: 2.9780619144439697
INFO - Copied model parameters to target network.
INFO - Step 145, rl-loss: 2.4218301773071298
----------------------------------------
  episode      |  100
  reward       |  0.145
----------------------------------------
INFO - Step 267, rl-loss: 1.1350542306900024
----------------------------------------
  episode      |  200
  reward       |  1.305
----------------------------------------
INFO - Step 396, rl-loss: 1.22061085700988775
----------------------------------------
  episode      |  300
  reward       |  1.715
----------------------------------------
INFO - Step 535, rl-loss: 0.82336914539337166
----------------------------------------
  episode      |  400
  reward       |  1.45
----------------------------------------
INFO - Step 

In [38]:
training_params = TrainingParams(switch_opponents_every=float('inf'), num_steps=50000)
agent_params = AgentParams(name='nfsp', network='transformer', memory_sequence_length=128)
experiment_params = ExperimentParams(log_dir=f'./experiments/bluff_call/{agent_params.name}/{agent_params.network}')
opponents = [LeducHoldemRuleAgentBluff(), LeducHoldemRuleAgentCall()]


opponents_to_results_raw = {}
for opponent in opponents:
    results, agent = train(opponent, experiment_params, training_params, agent_params)
    plot_results(results)
    opponents_to_results_raw[opponent.__class__.__name__] = {'results': results, 'agent': agent}

plot_results(results)

Switched opponent to LeducHoldemRuleAgentBluff
--> Running on the CPU
nfsp

----------------------------------------
  episode      |  0
  reward       |  0.39
----------------------------------------
INFO - Step 100, rl-loss: 3.3637237548828125
INFO - Copied model parameters to target network.
INFO - Step 159, rl-loss: 7.6285638809204147
----------------------------------------
  episode      |  100
  reward       |  0.2
----------------------------------------
INFO - Step 302, rl-loss: 2.9802639484405518
----------------------------------------
  episode      |  200
  reward       |  0.25
----------------------------------------
INFO - Step 452, rl-loss: 2.4895014762878425
----------------------------------------
  episode      |  300
  reward       |  0.005
----------------------------------------
INFO - Step 595, rl-loss: 1.0768429040908813
----------------------------------------
  episode      |  400
  reward       |  0.18
----------------------------------------
INFO - Step 735,

Switched opponent to LeducHoldemRuleAgentCall
--> Running on the CPU
nfsp

----------------------------------------
  episode      |  0
  reward       |  0.42
----------------------------------------
INFO - Step 100, rl-loss: 1.1133671998977661
INFO - Copied model parameters to target network.
INFO - Step 112, rl-loss: 2.3707506656646734
----------------------------------------
  episode      |  100
  reward       |  0.095
----------------------------------------
INFO - Step 209, rl-loss: 2.61794137954711943
----------------------------------------
  episode      |  200
  reward       |  0.01
----------------------------------------
INFO - Step 308, rl-loss: 4.63950920104980575
----------------------------------------
  episode      |  300
  reward       |  0.25
----------------------------------------
INFO - Step 421, rl-loss: 1.67451715469360355
----------------------------------------
  episode      |  400
  reward       |  0.365
----------------------------------------
INFO - Step 

In [39]:
training_params = TrainingParams(switch_opponents_every=float('inf'), num_steps=50000)
agent_params = AgentParams(name='nfsp', network='mlp', memory_sequence_length=128)
experiment_params = ExperimentParams(log_dir=f'./experiments/bluff_call/{agent_params.name}/{agent_params.network}')
opponents = [LeducHoldemRuleAgentBluff(), LeducHoldemRuleAgentCall()]


opponents_to_results_raw = {}
for opponent in opponents:
    results, agent = train(opponent, experiment_params, training_params, agent_params)
    plot_results(results)
    opponents_to_results_raw[opponent.__class__.__name__] = {'results': results, 'agent': agent}

plot_results(results)

Switched opponent to LeducHoldemRuleAgentBluff
--> Running on the CPU
nfsp

----------------------------------------
  episode      |  0
  reward       |  -0.115
----------------------------------------
INFO - Step 100, rl-loss: 3.0089609622955322
INFO - Copied model parameters to target network.
INFO - Step 133, rl-loss: 3.1791982650756836
----------------------------------------
  episode      |  100
  reward       |  -0.035
----------------------------------------
INFO - Step 270, rl-loss: 1.6340906620025635
----------------------------------------
  episode      |  200
  reward       |  0.14
----------------------------------------
INFO - Step 415, rl-loss: 2.1545839309692383
----------------------------------------
  episode      |  300
  reward       |  0.095
----------------------------------------
INFO - Step 548, rl-loss: 3.6204886436462402
----------------------------------------
  episode      |  400
  reward       |  -0.085
----------------------------------------
INFO - St

Switched opponent to LeducHoldemRuleAgentCall
--> Running on the CPU
nfsp

----------------------------------------
  episode      |  0
  reward       |  0.32
----------------------------------------
INFO - Step 100, rl-loss: 1.3408849239349365
INFO - Copied model parameters to target network.
INFO - Step 104, rl-loss: 1.7550823688507087
----------------------------------------
  episode      |  100
  reward       |  0.31
----------------------------------------
INFO - Step 195, rl-loss: 1.8384375572204598
----------------------------------------
  episode      |  200
  reward       |  0.225
----------------------------------------
INFO - Step 291, rl-loss: 1.96434545516967774
----------------------------------------
  episode      |  300
  reward       |  0.095
----------------------------------------
INFO - Step 383, rl-loss: 1.30744946002960244
----------------------------------------
  episode      |  400
  reward       |  0.0
----------------------------------------
INFO - Step 48

In [50]:
opponents_to_results = {opponent: value['results'] for opponent, value in opponents_to_results_raw.items()}
max_episode = min(value[-1]['episode'] for value in opponents_to_results.values())
opponents_to_results = {opponent: [result for result in value if result['episode'] <= max_episode] for opponent, value in opponents_to_results.items()}
data = [{'opponents': opponent, **result} for opponent, results in opponents_to_results.items() for result in results]
df = pd.DataFrame(data)

In [54]:
df.groupby(['opponents', 'opponent']).mean().drop(columns='episode')

Unnamed: 0_level_0,Unnamed: 1_level_0,performance
opponents,opponent,Unnamed: 2_level_1
FoldingAgent,FoldingAgent,0.746466
LeducHoldemRuleAgentBluff,LeducHoldemRuleAgentBluff,2.085975
LeducHoldemRuleAgentCall,LeducHoldemRuleAgentCall,0.996305
RandomAgent,RandomAgent,1.419424
all,FoldingAgent,0.749349
all,LeducHoldemRuleAgentBluff,2.09875
all,LeducHoldemRuleAgentCall,1.032808
all,RandomAgent,1.541404


In [36]:
data = []
for opponent, results in opponents_to_results.items():
    for result in results:
        data.append({**result, 'opponent': opponent})
        print({**result, 'opponent': opponent})

{'episode': 0, 'performance': 1.23, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 100, 'performance': 0.145, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 200, 'performance': 1.1, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 300, 'performance': 1.405, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 400, 'performance': 1.45, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 500, 'performance': 1.375, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 600, 'performance': 1.375, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 700, 'performance': 1.61, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 800, 'performance': 1.595, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 900, 'performance': 1.59, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 1000, 'performance': 2.08, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 1100, 'performance': 1.935, 'opponent': 'LeducHoldemRuleAgentBluff'}
{'episode': 1200, 'performance': 1.98, 'opponen

In [39]:
df = pd.DataFrame(data)
df.groupby('opponent').median()

Unnamed: 0_level_0,episode,performance
opponent,Unnamed: 1_level_1,Unnamed: 2_level_1
FoldingAgent,3400.0,0.75
LeducHoldemRuleAgentBluff,3400.0,2.055
LeducHoldemRuleAgentCall,3400.0,0.895
all,3400.0,1.045


In [5]:
training_params = TrainingParams(switch_opponents_every=100, num_steps=100000)
agent_params = AgentParams()
experiment_params = ExperimentParams(log_dir='./multiple_agents/')
opponents = [CallingAgent(), RaisingAgent(), RandomAgent(4), FoldingAgent(), CheckingAgent()]

In [42]:
for opponent in opponents:
    results, agent = train(opponent, experiment_params, training_params, agent_params)
    plot_results(results)

results, agent = train(opponents, experiment_params, training_params, agent_params)
plot_results(results)

Switched opponent to CallingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.285
----------------------------------------
INFO - Step 100, rl-loss: 1.911059856414795
INFO - Copied model parameters to target network.
INFO - Step 165, rl-loss: 1.7441142797470093
----------------------------------------
  episode      |  100
  reward       |  0.13
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to CallingAgent
INFO - Step 322, rl-loss: 0.37976408004760746
----------------------------------------
  episode      |  200
  reward       |  0.63
----------------------------------------
INFO - Step 324, rl-loss: 0.40481364727020264
Logs saved in ./multiple_agents/
Switched opponent to CallingAgent
INFO - Step 480, rl-loss: 1.94605565071105967
----------------------------------------
  episode      |  300
  reward       |  0.42
----------------------------------------
INFO - Step 482, rl-los

Switched opponent to RaisingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  1.825
----------------------------------------
INFO - Step 100, rl-loss: 7.525012969970703
INFO - Copied model parameters to target network.
INFO - Step 195, rl-loss: 5.5315299034118654
----------------------------------------
  episode      |  100
  reward       |  0.15
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to RaisingAgent
INFO - Step 378, rl-loss: 4.5337605476379395
----------------------------------------
  episode      |  200
  reward       |  0.685
----------------------------------------
INFO - Step 379, rl-loss: 4.8599042892456055
Logs saved in ./multiple_agents/
Switched opponent to RaisingAgent
INFO - Step 561, rl-loss: 1.85765826702117925
----------------------------------------
  episode      |  300
  reward       |  1.425
----------------------------------------
INFO - Step 564, rl-los

Switched opponent to RandomAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.71
----------------------------------------
INFO - Step 100, rl-loss: 1.4091770648956299
INFO - Copied model parameters to target network.
INFO - Step 133, rl-loss: 2.2652642726898193
----------------------------------------
  episode      |  100
  reward       |  -0.04
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to RandomAgent
INFO - Step 266, rl-loss: 1.6030160188674927
----------------------------------------
  episode      |  200
  reward       |  1.095
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to RandomAgent
INFO - Step 396, rl-loss: 0.68101942539215094
----------------------------------------
  episode      |  300
  reward       |  1.14
----------------------------------------
INFO - Step 397, rl-loss: 1.8365583419799805
Logs saved in ./multiple_

Switched opponent to FoldingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.27
----------------------------------------

----------------------------------------
  episode      |  100
  reward       |  0.16
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to FoldingAgent

----------------------------------------
  episode      |  200
  reward       |  0.235
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to FoldingAgent
INFO - Step 100, rl-loss: 0.6226616501808167
INFO - Copied model parameters to target network.
INFO - Step 145, rl-loss: 0.35948494076728826
----------------------------------------
  episode      |  300
  reward       |  0.76
----------------------------------------
INFO - Step 146, rl-loss: 0.4063968062400818
Logs saved in ./multiple_agents/
Switched opponent to FoldingAgent
INFO - Step 191, rl-loss: 0.2497247159481048

Switched opponent to CheckingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.57
----------------------------------------
INFO - Step 100, rl-loss: 2.797032594680786
INFO - Copied model parameters to target network.
INFO - Step 152, rl-loss: 1.4550185203552246
----------------------------------------
  episode      |  100
  reward       |  -0.11
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to CheckingAgent
INFO - Step 272, rl-loss: 0.77120685577392584
----------------------------------------
  episode      |  200
  reward       |  0.04
----------------------------------------
INFO - Step 276, rl-loss: 0.29054778814315796
Logs saved in ./multiple_agents/
Switched opponent to CheckingAgent
INFO - Step 395, rl-loss: 0.85471981763839726
----------------------------------------
  episode      |  300
  reward       |  0.345
----------------------------------------
INFO - Step 396, rl

Switched opponent to CallingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.285
----------------------------------------
INFO - Step 100, rl-loss: 1.911059856414795
INFO - Copied model parameters to target network.
INFO - Step 165, rl-loss: 1.7441142797470093
----------------------------------------
  episode      |  100
  reward       |  0.13
----------------------------------------

Logs saved in ./multiple_agents/
Switched opponent to RaisingAgent
INFO - Step 349, rl-loss: 1.79754519462585456
----------------------------------------
  episode      |  200
  reward       |  1.12
----------------------------------------
INFO - Step 350, rl-loss: 2.819002866744995
Logs saved in ./multiple_agents/
Switched opponent to RandomAgent
INFO - Step 480, rl-loss: 3.9415473937988285
----------------------------------------
  episode      |  300
  reward       |  0.945
----------------------------------------
INFO - Step 481, rl-loss: 

In [43]:
training_params = TrainingParams(switch_opponents_every=5000, num_steps=100000)
agent_params = AgentParams()
experiment_params = ExperimentParams(log_dir='./multiple_agents/5000/')
opponents = [CallingAgent(), RaisingAgent(), RandomAgent(4), FoldingAgent(), CheckingAgent()]

In [44]:
for opponent in opponents:
    results, agent = train(opponent, experiment_params, training_params, agent_params)
    plot_results(results)

results, agent = train(opponents, experiment_params, training_params, agent_params)
plot_results(results)

Switched opponent to CallingAgent


--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.285
----------------------------------------
INFO - Step 100, rl-loss: 1.911059856414795
INFO - Copied model parameters to target network.
INFO - Step 165, rl-loss: 1.7441142797470093
----------------------------------------
  episode      |  100
  reward       |  0.13
----------------------------------------
INFO - Step 330, rl-loss: 2.38386297225952156
----------------------------------------
  episode      |  200
  reward       |  0.51
----------------------------------------
INFO - Step 491, rl-loss: 2.69745635986328124
----------------------------------------
  episode      |  300
  reward       |  0.63
----------------------------------------
INFO - Step 658, rl-loss: 1.15135562419891367
----------------------------------------
  episode      |  400
  reward       |  0.46
----------------------------------------
INFO - Step 817, rl-loss: 3.69314694404602053
-------------------

Switched opponent to RaisingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  1.825
----------------------------------------
INFO - Step 100, rl-loss: 7.525012969970703
INFO - Copied model parameters to target network.
INFO - Step 195, rl-loss: 5.5315299034118654
----------------------------------------
  episode      |  100
  reward       |  0.15
----------------------------------------
INFO - Step 372, rl-loss: 2.17288494110107468
----------------------------------------
  episode      |  200
  reward       |  0.17
----------------------------------------
INFO - Step 545, rl-loss: 3.81930875778198246
----------------------------------------
  episode      |  300
  reward       |  0.98
----------------------------------------
INFO - Step 732, rl-loss: 0.17358103394508362
----------------------------------------
  episode      |  400
  reward       |  1.85
----------------------------------------
INFO - Step 904, rl-loss: 3.287

Switched opponent to RandomAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.71
----------------------------------------
INFO - Step 100, rl-loss: 1.4091770648956299
INFO - Copied model parameters to target network.
INFO - Step 133, rl-loss: 2.2652642726898193
----------------------------------------
  episode      |  100
  reward       |  -0.04
----------------------------------------
INFO - Step 252, rl-loss: 0.36430895328521733
----------------------------------------
  episode      |  200
  reward       |  0.725
----------------------------------------
INFO - Step 373, rl-loss: 1.22511029243469243
----------------------------------------
  episode      |  300
  reward       |  1.335
----------------------------------------
INFO - Step 507, rl-loss: 1.70922696590423586
----------------------------------------
  episode      |  400
  reward       |  0.865
----------------------------------------
INFO - Step 638, rl-loss: 3.

Switched opponent to FoldingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.27
----------------------------------------

----------------------------------------
  episode      |  100
  reward       |  0.16
----------------------------------------
INFO - Step 100, rl-loss: 0.7277150750160217
INFO - Copied model parameters to target network.
INFO - Step 105, rl-loss: 0.7217839956283569
----------------------------------------
  episode      |  200
  reward       |  0.13
----------------------------------------
INFO - Step 150, rl-loss: 0.39784502983093264
----------------------------------------
  episode      |  300
  reward       |  0.76
----------------------------------------
INFO - Step 205, rl-loss: 0.13889811933040624
----------------------------------------
  episode      |  400
  reward       |  0.745
----------------------------------------
INFO - Step 252, rl-loss: 0.08540394902229309
-----------------------------

Switched opponent to CheckingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.57
----------------------------------------
INFO - Step 100, rl-loss: 2.797032594680786
INFO - Copied model parameters to target network.
INFO - Step 152, rl-loss: 1.4550185203552246
----------------------------------------
  episode      |  100
  reward       |  -0.11
----------------------------------------
INFO - Step 285, rl-loss: 0.73994100093841552
----------------------------------------
  episode      |  200
  reward       |  0.395
----------------------------------------
INFO - Step 427, rl-loss: 0.90939629077911384
----------------------------------------
  episode      |  300
  reward       |  0.46
----------------------------------------
INFO - Step 566, rl-loss: 0.64119160175323496
----------------------------------------
  episode      |  400
  reward       |  0.635
----------------------------------------
INFO - Step 694, rl-loss: 0.

Switched opponent to CallingAgent
--> Running on the CPU

----------------------------------------
  episode      |  0
  reward       |  0.285
----------------------------------------
INFO - Step 100, rl-loss: 1.911059856414795
INFO - Copied model parameters to target network.
INFO - Step 165, rl-loss: 1.7441142797470093
----------------------------------------
  episode      |  100
  reward       |  0.13
----------------------------------------
INFO - Step 330, rl-loss: 2.38386297225952156
----------------------------------------
  episode      |  200
  reward       |  0.51
----------------------------------------
INFO - Step 491, rl-loss: 2.69745635986328124
----------------------------------------
  episode      |  300
  reward       |  0.63
----------------------------------------
INFO - Step 658, rl-loss: 1.15135562419891367
----------------------------------------
  episode      |  400
  reward       |  0.46
----------------------------------------
INFO - Step 817, rl-loss: 3.693