In [3]:
import random
import numpy as np
from collections import defaultdict
import dill

In [4]:
from pettingzoo.classic import tictactoe_v3

In [5]:
# Create the Tic-Tac-Toe environment from PettingZoo's classic suite.
env = tictactoe_v3.env()

In [36]:
# Reset the environment so that a new game can start.
env.reset()

In [37]:
# List the agents taking part in the current game.
env.agents

['player_1', 'player_2']

In [38]:
# After a reset, check which agent is selected to move first.
env.reset()
env.agent_selection

'player_1'

In [39]:
# env.step(action) applies the selected agent's action and then switches
# control to the next agent (see help(env.step) in the next cell).
# No step has been taken since the reset, so the selection is unchanged here.
env.agent_selection

'player_1'

In [40]:
# Show the documentation of env.step to confirm how turns are advanced.
help(env.step)

Help on method step in module pettingzoo.utils.wrappers.order_enforcing:

step(action) method of pettingzoo.utils.wrappers.order_enforcing.OrderEnforcingWrapper instance
    Accepts and executes the action of the current agent_selection
    in the environment, automatically switches control to the next agent.



In [41]:
# player_1 plays square 1; control then passes to player_2
env.step(1)
env.agent_selection

'player_2'

In [43]:
# Fetch the latest observation, reward, done flag and info from the environment.
# NOTE(review): per the PettingZoo API these belong to the currently selected agent — confirm.
observation, reward, done, info = env.last()

In [48]:
# The mask has one entry per square: 1 = still playable, 0 = already taken
# (square 1 was played in the step above).
observation['action_mask']

array([1, 0, 1, 1, 1, 1, 1, 1, 1], dtype=int8)

In [55]:
# Show how the action mask shrinks as squares get taken.
env.reset()
print("No actions taken")
observation, reward, done, info = env.last()
print(observation['action_mask'])
# player 1 takes square 1
env.step(1)
print("One action taken")
observation, reward, done, info = env.last()
print(observation['action_mask'])

# player 2 takes square 0 (a different, still-legal square)
env.step(0)

print("Last action taken")
observation, reward, done, info = env.last()
print(observation['action_mask'])

No actions taken
[1 1 1 1 1 1 1 1 1]
One action taken
[1 0 1 1 1 1 1 1 1]
Last action taken
[0 0 1 1 1 1 1 1 1]


In [68]:
# Play a scripted game: player_1 takes squares 0, 1, 2 and player_2 takes 3, 4.
env.reset()
env.step(0)  # player_1
env.step(3)  # player_2
env.step(1)  # player_1
env.step(4)  # player_2
env.step(2)  # player_1 completes the line
env.render()

# Read the result as seen by the agent whose turn comes next.
observation, reward, done, info = env.last()

# These prints produce the "done is:" / "Reward is:" lines recorded in the
# cell output; without them the cell cannot reproduce that output.
print("done is:", done)
print("Reward is:", reward)

# A finished agent steps with None instead of a move.
env.step(None)

     |     |     
  X  |  O  |  -  
_____|_____|_____
     |     |     
  X  |  O  |  -  
_____|_____|_____
     |     |     
  X  |  -  |  -  
     |     |     
done is: True
Reward is: -1


In [69]:
# After the None step above only one agent is still listed.
env.agents

['player_1']

In [70]:
# Fetch the result of the finished game for the remaining agent.
observation, reward, done, info = env.last()

print("done is:", done)
print("Reward is:", reward)

# Step with None for this finished agent as well.
env.step(None)

done is: True
Reward is: 1


In [71]:
# No active agents are left; env.reset() is needed to start a new game.
env.agents

[]

In [73]:
# `observation` still holds the value returned by the last env.last() call,
# i.e. the mask of the finished game.
observation["action_mask"]

array([0, 0, 0, 0, 0, 1, 1, 1, 1], dtype=int8)

In [74]:
def policy(observation, agent):
    """Pick a uniformly random legal action.

    Args:
        observation: Mapping with an ``'action_mask'`` entry; non-zero entries
            mark the actions that are currently allowed.
        agent: Name of the acting agent. Not used by this random policy.

    Returns:
        int: Index of the chosen action.

    Raises:
        IndexError: If the mask allows no action at all.
    """
    # Draw from a plain list rather than the NumPy array: random.choice() on
    # an ndarray raises ValueError on early Python 3.11 releases (it truth-tests
    # the sequence), and .tolist() yields a built-in int instead of numpy.int64.
    legal_actions = np.flatnonzero(observation['action_mask']).tolist()
    return random.choice(legal_actions)

In [114]:
# Play one random-vs-random game, logging agent, done flag and reward per turn.
env.reset()
for agent in env.agent_iter():
    print(agent)
    observation, reward, done, info = env.last()
    print(done, reward, sep="\n")
    if done:
        # A finished agent steps with None instead of a move.
        action = None
    else:
        action = policy(observation, agent)
    env.step(action)
    # Uncomment the next line to visualize the game move by move.
    # env.render()

player_1
False
0
player_2
False
0
player_1
False
0
player_2
False
0
player_1
False
0
player_2
False
0
player_1
False
0
player_2
False
0
player_1
False
0
player_2
True
-1
player_1
True
1


In [115]:
# Win / draw / loss tallies for each player over the simulated games.
p1_w = p1_d = p1_l = 0
p2_w = p2_d = p2_l = 0

for _ in range(10_000):
    env.reset()

    for agent in env.agent_iter():
        observation, reward, done, info = env.last()
        # A finished agent steps with None instead of a move.
        env.step(None if done else policy(observation, agent))

        if not done:
            continue

        # The reward read for a finished agent records its result:
        # 1 = win, 0 = draw, -1 = loss.
        if agent == 'player_1':
            p1_w += int(reward == 1)
            p1_d += int(reward == 0)
            p1_l += int(reward == -1)
        elif agent == 'player_2':
            p2_w += int(reward == 1)
            p2_d += int(reward == 0)
            p2_l += int(reward == -1)

In [118]:
# Report the tallies gathered by the simulation, player by player.
for label, count in (
    ("p1 wins:", p1_w),
    ("p1 draws:", p1_d),
    ("p1 losses:", p1_l),
    ("p2 wins:", p2_w),
    ("p2 draws:", p2_d),
    ("p2 losses:", p2_l),
):
    print(label, count)

p1 wins: 5865
p1 draws: 1264
p1 losses: 2871
p2 wins: 2871
p2 draws: 1264
p2 losses: 5865


In [120]:
# Sanity check: player_1's outcomes should add up to the number of games played.
p1_w + p1_d + p1_l

10000

In [121]:
# Sanity check: player_2's outcomes should add up to the number of games played.
p2_w + p2_d + p2_l

10000

In [122]:
# Inspect the raw board encoding right after a reset (no marks placed yet).
env.reset()
observation, reward, done, info = env.last()


observation['observation']

array([[[0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]]], dtype=int8)

In [123]:
# Text rendering of the same empty board.
env.render()

     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
     |     |     


In [126]:
# Dimensions of the board encoding.
observation['observation'].shape

(3, 3, 2)

In [134]:
# First attempt at a state key: the built-in hash of the array's text form.
# Caveat: Python salts str hashes per interpreter process (PYTHONHASHSEED),
# so this value changes between kernel restarts and is unsuitable as a
# persistent key.
state = hash(str(observation['observation']))

state


-8751529290983810411

In [135]:
import hashlib

def encode_state(observation):
    """Turn an observation into a fixed-length hexadecimal state key.

    The observation's ``str()`` form is UTF-8 encoded and digested with MD5,
    so equal text forms always map to the same key.

    Args:
        observation: Any object with a ``str()`` representation.

    Returns:
        str: 32-character hexadecimal MD5 digest of the text form.
    """
    text = str(observation)
    return hashlib.md5(text.encode('utf-8')).hexdigest()

# Key for the empty board fetched above.
encode_state(observation['observation'])

'65ea394aefe804468cc42b20ecc8b606'

In [136]:
# Fresh environment: the first mover takes the centre square (4), then we look
# at the board from player_1's point of view.
env = tictactoe_v3.env()
env.reset()
env.step(4)
env.observe('player_1')['observation']

array([[[0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [1, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]]], dtype=int8)

In [137]:
# Same position seen by player_2: compare which of the two planes holds the
# centre mark with player_1's view above.
env.observe('player_2')['observation']

array([[[0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 1],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0]]], dtype=int8)

In [138]:
# Scripted game on a fresh environment: the first mover plays 0, 1, 2 and the
# second mover plays 6, 5. Then inspect player_1's view of the final board.
env = tictactoe_v3.env()
env.reset()
env.step(0)
env.step(6)
env.step(1)
env.step(5)
env.step(2)

env.observe("player_1")['observation']

array([[[1, 0],
        [1, 0],
        [1, 0]],

       [[0, 0],
        [0, 0],
        [0, 1]],

       [[0, 1],
        [0, 0],
        [0, 0]]], dtype=int8)

In [139]:
# Step with None (no move) for the agent whose turn it is, then read
# player_2's view of the final board for comparison with player_1's.
env.step(None)
env.observe("player_2")['observation']

array([[[0, 1],
        [0, 1],
        [0, 1]],

       [[0, 0],
        [0, 0],
        [1, 0]],

       [[1, 0],
        [0, 0],
        [0, 0]]], dtype=int8)

In [140]:
# Alternative state key: hash the ANSI text rendering of the board instead of
# a per-agent observation array.
state = encode_state(env.render(mode = 'ansi'))

state


'6c2e7780c70c674fc2c99fb84f81de15'

In [141]:
from collections import defaultdict

# Start from a fresh game before setting up the Q-table.
env.reset()

# Missing keys are created lazily by calling the lambda.
# NOTE(review): `nA` (presumably the number of actions) is not defined in the
# visible cells; it must exist before the first lookup of a missing key,
# otherwise the factory raises NameError — confirm.
Q = defaultdict(lambda: np.zeros(nA)) 

# Reminder about how defaultdict works: explicit assignment stores the given
# value as-is and never calls the default factory.

Q['32433'] = 0
Q['-5323'] = 0
Q['2397887'] = 0

Q

defaultdict(<function __main__.<lambda>()>,
            {'32433': 0, '-5323': 0, '2397887': 0})