In [1]:
from tictactoe_3p_env import TicTacToe3PlayerEnv, print_board, string_to_action, action_to_string
import random

In [2]:
# Environment instance used by the exploratory cells of this notebook
# (new_state, valid_actions, serialize/deserialize, state_to_observation).
e = TicTacToe3PlayerEnv()

In [3]:
# new_state() returns a pair: the opening game state and the list of players
# whose turn it is to act.
s, current_players = e.new_state()

In [4]:
# Render the freshly created state as text; no moves have been played yet.
print_board(s)


   |   |   |   |   
-------------------
   |   |   |   |   
-------------------
   |   |   |   |   



In [5]:
# Ask the environment which moves the first acting player may make.
# Actions are returned as strings (they look like "(row, col)" cell coordinates).
e.valid_actions(s, current_players[0])

['(0, 0)',
 '(0, 1)',
 '(0, 2)',
 '(0, 3)',
 '(0, 4)',
 '(1, 0)',
 '(1, 1)',
 '(1, 2)',
 '(1, 3)',
 '(1, 4)',
 '(2, 0)',
 '(2, 1)',
 '(2, 2)',
 '(2, 3)',
 '(2, 4)']

In [6]:
def rl_agent_choose_action(valid_actions, observation):
    """Placeholder policy: pick one of the legal moves at random.

    Args:
        valid_actions: Non-empty sequence of candidate actions. The game loop
            passes the action strings reported by the environment.
        observation: Accepted so the signature matches a real agent; this
            stand-in never looks at it.

    Returns:
        A single element drawn from ``valid_actions`` via ``random.choice``.
    """
    chosen_action = random.choice(valid_actions)
    return chosen_action

# Create the 3-player tic-tac-toe environment. A single instance is enough;
# it can be reused for as many games as you like.
env = TicTacToe3PlayerEnv()

# Begin a new game. We receive the opening state plus the list of players who
# act next. The loop below only ever uses the first entry of that list.
current_state, current_players = env.new_state()

# Main game loop: keep stepping until the environment reports a terminal state.
terminal = False
while not terminal:
    print_board(current_state)

    # Legal moves for the acting player. In this API actions are passed around
    # as strings; action_to_string / string_to_action convert between the
    # string form and its components.
    valid_actions = env.valid_actions(
        state=current_state, player=current_players[0])

    # A state is the environment's own representation. An agent should instead
    # be given an observation, which is always rendered from one specific
    # player's perspective: the viewing player sees themselves as player 0
    # (out of 0, 1, 2). That way an agent only ever has to learn how to make
    # player 0 win.
    observation = env.state_to_observation(
        state=current_state, player=current_players[0])

    # Our "RL agent" chooses an action based on the observation and the
    # valid actions.
    action = rl_agent_choose_action(valid_actions, observation)

    # Advance the game by one step.
    next_state, next_players, rewards, terminal, winners = env.next_state(
        state=current_state, players=current_players, actions=[action])

    # The environment object does not remember the game's state for us, so we
    # carry the new state and the next player(s) to act into the next iteration.
    current_state = next_state
    current_players = next_players

# The game is over: report which player(s) won, then show the final board.
print("Winners: {}".format(winners))
print_board(current_state)


   |   |   |   |   
-------------------
   |   |   |   |   
-------------------
   |   |   |   |   


   |   |   |   | X 
-------------------
   |   |   |   |   
-------------------
   |   |   |   |   


   |   |   |   | X 
-------------------
   |   |   |   |   
-------------------
 O |   |   |   |   


   |   |   |   | X 
-------------------
   |   |   |   |   
-------------------
 O |   |   | Y |   


   |   |   |   | X 
-------------------
   |   | X |   |   
-------------------
 O |   |   | Y |   


   |   | O |   | X 
-------------------
   |   | X |   |   
-------------------
 O |   |   | Y |   


   | Y | O |   | X 
-------------------
   |   | X |   |   
-------------------
 O |   |   | Y |   


   | Y | O |   | X 
-------------------
   |   | X |   | X 
-------------------
 O |   |   | Y |   

Winners: [1]

Winner: 1
   | Y | O |   | X 
-------------------
   | O | X |   | X 
-------------------
 O |   |   | Y |   



In [7]:
# Show the finished game again; for a decided game print_board also prints
# a "Winner: ..." line above the board.
print_board(current_state)


Winner: 1
   | Y | O |   | X 
-------------------
   | O | X |   | X 
-------------------
 O |   |   | Y |   



In [8]:
# Inspect the raw state object. This is the environment's internal
# representation and is not meant to be fed to an agent directly.
current_state

(3, (array([[-1,  2,  1, -1,  0],
         [-1,  1,  0, -1,  0],
         [ 1, -1, -1,  2, -1]], dtype=int8), 1))

In [9]:
# Convert the final state into its serialized form (restored again in the next cell).
ser = e.serialize_state(current_state)

In [10]:
# Round trip: rebuild a state from the serialized form and display it so it can
# be compared with the original current_state.
# NOTE: this rebinds `s`, which earlier held the empty opening state.
s = e.deserialize_state(ser)
s


(3, (array([[-1,  2,  1, -1,  0],
         [-1,  1,  0, -1,  0],
         [ 1, -1, -1,  2, -1]], dtype=int8), 1))

In [13]:
# Observation of the restored state from player 2's point of view. Player ids
# on the board are relabelled so that the viewing player shows up as 0.
e.state_to_observation(s, 2)

{'board': array([[-1,  0,  2, -1,  1],
        [-1,  2,  1, -1,  1],
        [ 2, -1, -1,  0, -1]], dtype=int8)}