In [1]:
from dlgo.agent.pg import load_policy_agent
from dlgo.encoders.alphago import AlphaGoEncoder
from dlgo.networks.alphago import AlphaGoValueResNet
from dlgo.rl.simulate import experience_simulation
from dlgo.rl.value import ValueAgent

In [6]:
agent1 = load_policy_agent(type='SL', version='v1')
agent2 = load_policy_agent(type='SL', version='v1')

encoder = AlphaGoEncoder(use_player_plane=False)
model = AlphaGoValueResNet()
agent3 = ValueAgent(model, encoder)
agent4 = ValueAgent(model, encoder)

In [3]:
num_games = 1
winning_p_experiences, losing_p_experiences, winning_v_experiences, losing_v_experiences = experience_simulation(
    num_games, alphago_rl_agent, opponent, dummy_agent, dummy_agent
)

Simulating game 1/1...
B+25.5


ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 4 dimension(s) and the array at index 1 has 1 dimension(s)

In [4]:
from dlgo import rl
from dlgo import scoring
from dlgo import goboard_fast as goboard
from dlgo.gotypes import Player

from collections import namedtuple

In [7]:
collector1 = rl.ExperienceCollector()
collector2 = rl.ExperienceCollector()
collector3 = rl.ExperienceCollector()
collector4 = rl.ExperienceCollector()

color1 = Player.black
for i in range(num_games):
    print('Simulating game %d/%d...' % (i + 1, num_games))
    collector1.begin_episode()
    agent1.set_collector(collector1)
    collector2.begin_episode()
    agent2.set_collector(collector2)
    collector3.begin_episode()
    agent3.set_collector(collector3)
    collector4.begin_episode()
    agent4.set_collector(collector4)

Simulating game 1/1...


In [10]:
class GameRecord(namedtuple('GameRecord', 'moves winner margin')):
    pass

def simulate_game(black_player, white_player, black_value_agent, white_value_agent):
    moves = []
    game = goboard.GameState.new_game(19)
    agents = {
        Player.black: black_player,
        Player.white: white_player,
    }
    while not game.is_over():
        next_move = agents[game.next_player].select_move(game)
        moves.append(next_move)

        if game.next_player == Player.black:
            black_value_agent.collector.record_decision(
                state=black_value_agent.encoder.encode(game),
                # action=black_value_agent.encoder.encode_point(next_move.point),
            )
        elif game.next_player == Player.white:
            white_value_agent.collector.record_decision(
                state=white_value_agent.encoder.encode(game),
                # action=white_value_agent.encoder.encode_point(next_move.point),
            )
        game = game.apply_move(next_move)
    
    game_result = scoring.compute_game_result(game)
    print(game_result)

    return GameRecord(
        moves=moves,
        winner=game_result.winner,
        margin=game_result.winning_margin,
    )

In [42]:
if color1 == Player.black:
    # black_player, white_player = agent1, agent2
    game_record = simulate_game(agent1, agent2, agent3, agent4)
else:
    # white_player, black_player = agent1, agent2
    game_record = simulate_game(agent2, agent1, agent4, agent3)

B+59.5


In [43]:
agent4.collector._current_episode_states

[array([[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.]],
 
        ...,
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],


In [44]:
if game_record.winner == color1:
    collector1.complete_episode(reward=1)
    collector2.complete_episode(reward=-1)
    collector3.complete_episode(reward=1)
    collector4.complete_episode(reward=-1)
else:
    collector2.complete_episode(reward=1)
    collector1.complete_episode(reward=-1)
    collector4.complete_episode(reward=1)
    collector3.complete_episode(reward=-1)

In [46]:
len(collector4.w_states)

242

In [47]:
rl.combine_experience([collector1, collector2])

(<dlgo.rl.experience.ExperienceBuffer at 0x26bb6058a50>,
 <dlgo.rl.experience.ExperienceBuffer at 0x26bb5ffdd10>)

In [49]:
a = rl.combine_experience([collector3, collector4])

In [61]:
a[0].states.shape

(1003, 48, 19, 19)

In [59]:
b = rl.combine_experience([collector1, collector2])

In [60]:
b[0].states.shape

(999, 48, 19, 19)