https://medium.com/analytics-vidhya/how-to-create-a-custom-gym-environment-with-multiple-agents-f368d13582ee  
https://medium.com/analytics-vidhya/custom-gym-environment-with-agents-that-collaborate-4f96ef898a2a

In [10]:
import gym
from gym import spaces
import numpy as np

In [108]:
class MazeEnv(gym.Env):
    """Single-agent grid maze: walk to the exit without stepping on a trap.

    Cell codes stored in ``self.world``: 0 = empty, 1 = player, 2 = exit,
    3 = trap.  Actions: 0 = up, 1 = right, 2 = down, 3 = left.  A move that
    would leave the grid is ignored but still counts as a step.

    Rewards: +200 for reaching the exit, -200 for stepping on a trap or for
    using up ``max_step`` steps, -1 for every other step.
    """

    def __init__(self, width=10, height=12):
        """Store the grid size and declare the action/observation spaces."""
        self.width = width
        self.height = height
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Box(low=0,
                                            high=3,
                                            shape=(height, width),
                                            dtype=np.int16)
        self.reward_range = (-200, 200)
        self.current_episode = 0
        self.success_episode = []

    def _random_empty_cell(self):
        """Return ``[row, col]`` of a uniformly chosen cell whose value is 0.

        Raises:
            ValueError: if the grid has no empty cell left.
        """
        rows, cols = np.where(self.world == 0)
        if len(rows) == 0:
            raise ValueError("no empty cell left in the maze")
        pick = np.random.randint(len(rows))
        # Plain ints keep the location lists comparable with exit_location.
        return [int(rows[pick]), int(cols[pick])]

    def reset(self):
        """Start a new episode and return the freshly generated grid.

        The exit is placed in the bottom-right corner, then five traps and
        the player are dropped on distinct, randomly chosen empty cells.
        """
        self.current_step = 0
        self.max_step = 30

        # "P" = playing, "W" = won, "L" = lost.
        self.state = "P"

        # NOTE(review): the grid uses the default int dtype while
        # observation_space declares np.int16 - confirm whether callers care.
        self.world = np.zeros((self.height, self.width), dtype=int)

        # Set exit
        self.exit_location = [self.height - 1, self.width - 1]
        self.world[self.exit_location[0], self.exit_location[1]] = 2

        # Set 5 traps; each one is written to the grid before the next draw
        # so that no two traps share a cell.
        self.trap_locations = []
        for _ in range(5):
            trap_location = self._random_empty_cell()
            self.trap_locations.append(trap_location)
            self.world[trap_location[0], trap_location[1]] = 3

        # Set player
        self.player_location = self._random_empty_cell()
        self.world[self.player_location[0], self.player_location[1]] = 1

        return self.world

    def step(self, action):
        """Apply one action and return ``(obs, reward, done, info)``.

        Calling ``step`` again after the episode has ended leaves the grid
        untouched and returns ``(obs, 0, True, {})``.
        """
        if self.state != "P":
            # Episode already over: without this guard a further call could
            # reach the return statement with reward/done never assigned.
            return self.world, 0, True, {}

        row, col = self.player_location
        if action == 0:      # up
            row -= 1
        elif action == 1:    # right
            col += 1
        elif action == 2:    # down
            row += 1
        elif action == 3:    # left
            col -= 1

        # Moves that would leave the grid are ignored.
        if 0 <= row < self.height and 0 <= col < self.width:
            self.world[self.player_location[0], self.player_location[1]] = 0
            self.player_location[:] = [row, col]
            self.world[row, col] = 1

        self.current_step += 1

        if self.player_location == self.exit_location:
            self.state = "W"
            reward = 200
            done = True
        elif self.player_location in self.trap_locations:
            self.state = "L"
            reward = -200
            done = True
        elif self.current_step >= self.max_step:
            self.state = "L"
            reward = -200
            done = True
        else:
            reward = -1
            done = False

        obs = self.world
        return obs, reward, done, {}

    def render(self, mode="human"):
        """Print the grid followed by a blank line; ``mode`` is accepted but unused."""
        print(self.world)
        print()
        

In [109]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

# Build a one-environment DummyVecEnv around MazeEnv and train PPO2 on it.
env = DummyVecEnv([MazeEnv])
model = PPO2(policy=MlpPolicy, env=env, learning_rate=0.01, verbose=1)
model.learn(total_timesteps=10_000, log_interval=10)

-------------------------------------
| approxkl           | 0.049079027  |
| clipfrac           | 0.5234375    |
| explained_variance | 0.00144      |
| fps                | 369          |
| n_updates          | 1            |
| policy_entropy     | 1.3408334    |
| policy_loss        | -0.052190557 |
| serial_timesteps   | 128          |
| time_elapsed       | 0            |
| total_timesteps    | 128          |
| value_loss         | 9128.413     |
-------------------------------------
--------------------------------------
| approxkl           | 0.019682202   |
| clipfrac           | 0.21484375    |
| explained_variance | -0.00135      |
| fps                | 1662          |
| n_updates          | 10            |
| policy_entropy     | 0.93388456    |
| policy_loss        | 0.00067575416 |
| serial_timesteps   | 1280          |
| time_elapsed       | 1.08          |
| total_timesteps    | 1280          |
| value_loss         | 6125.9043     |
--------------------------------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x24b60fb4088>

In [None]:
def check_pygame_exit():
    """Drain the pygame event queue and shut pygame down on a QUIT event.

    Always returns True.

    NOTE(review): `pygame` is not imported in the visible cells - it must
    already be in scope when this is called.
    """
    pending_events = pygame.event.get()
    for evt in pending_events:
        if evt.type != pygame.QUIT:
            continue
        pygame.quit()
    return True

In [121]:
# Run the trained policy, printing the grid after every step, until the
# vectorised env reports done.
obs = env.reset()
episode_over = False
while not episode_over:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
    episode_over = bool(dones)

[[0 0 0 0 0 0 0 3 3 0]
 [0 0 0 3 0 0 0 0 3 1]
 [0 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 2]]

[[0 0 0 0 0 0 0 3 3 0]
 [0 0 0 3 0 0 0 0 3 0]
 [0 3 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 2]]

[[0 0 0 0 0 0 0 3 3 0]
 [0 0 0 3 0 0 0 0 3 0]
 [0 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 2]]

[[0 0 0 0 0 0 0 3 3 0]
 [0 0 0 3 0 0 0 0 3 0]
 [0 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0

In [65]:
import numpy as np

In [73]:
# Scratch 5x6 integer grid used in the following cells to try out the
# coordinate-to-index conversion.
a = np.zeros([5, 6], dtype=int); a

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [None]:
def convert_coord_to_matrix(point, matrix):
    """Translate a bottom-up ``point`` into a ``(row, col)`` index for ``matrix``.

    ``point[0]`` is used unchanged as the column; ``point[1]`` is counted from
    the last row of ``matrix`` upwards, so ``point[1] == 0`` maps to the
    bottom row.
    """
    n_rows = matrix.shape[0]
    height_from_bottom = point[1]
    column = point[0]
    row = n_rows - height_from_bottom - 1
    return (row, column)

In [74]:
# Sample point; presumably [x, y] with y counted from the bottom row
# (that is how the next cell indexes it).
point = [3, 1]

In [75]:
# Mark the point in the grid: the row is counted from the bottom
# (shape[0] - point[1] - 1) and the column is point[0].
a[(a.shape[0] - point[1] -1, point[0])] = 1

In [76]:
# Display the grid after marking the point.
a

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [53]:
class MazeMultiAgentEnv(gym.Env):
    """Two-player variant of the grid maze (no traps).

    Cell codes stored in ``self.world``: 0 = empty, 1 = player, 2 = exit.
    Actions: 0 = up, 1 = right, 2 = down, 3 = left.  A move that would leave
    the grid is ignored but still counts as a step.

    The action space is a single ``Discrete(4)``, so ``step`` drives player
    one only (``player_location`` is an alias of ``player_one_location``);
    player two is placed on the grid and stays where it was spawned.

    Rewards: +200 when player one reaches the exit, -200 when ``max_step``
    steps are used up, -1 for every other step.
    """

    def __init__(self, width=10, height=12):
        """Store the grid size and declare the action/observation spaces."""
        self.width = width
        self.height = height
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Box(low=0,
                                            high=2,
                                            shape=(height, width),
                                            dtype=np.int16)
        self.reward_range = (-200, 200)
        self.current_episode = 0
        self.success_episode = []

    def _spawn_player(self):
        """Mark a random empty cell as a player and return its ``[row, col]``.

        Cells are drawn from the first ``height - 2`` rows and ``width - 2``
        columns, so a player never spawns on the exit.
        """
        while True:
            cell = [np.random.randint(self.height - 2),
                    np.random.randint(self.width - 2)]
            if self.world[cell[0], cell[1]] == 0:
                self.world[cell[0], cell[1]] = 1
                return cell

    def reset(self):
        """Start a new episode and return the freshly generated grid.

        Raises:
            ValueError: if the spawn area cannot hold two distinct players.
        """
        self.current_step = 0
        self.max_step = 30

        # "P" = playing, "W" = won, "L" = lost.
        self.state = "P"

        self.world = np.zeros((self.height, self.width), dtype=int)

        spawn_rows = self.height - 2
        spawn_cols = self.width - 2
        if spawn_rows < 1 or spawn_cols < 1 or spawn_rows * spawn_cols < 2:
            # Without this guard the second spawn would retry forever.
            raise ValueError("grid too small to place two players")

        # Each player is written to the grid before the next draw so the
        # second player cannot land on the first one.
        self.player_one_location = self._spawn_player()
        self.player_two_location = self._spawn_player()

        # step() moves `player_location`; share the list object with player
        # one so both names always describe the same position.
        self.player_location = self.player_one_location

        self.exit_location = [self.height - 1, self.width - 1]
        self.world[self.exit_location[0], self.exit_location[1]] = 2

        return self.world

    def step(self, action):
        """Move player one and return ``(obs, reward, done, info)``.

        Calling ``step`` again after the episode has ended leaves the grid
        untouched and returns ``(obs, 0, True, {})``.
        """
        if self.state != "P":
            # Episode already over: without this guard a further call could
            # reach the return statement with reward/done never assigned.
            return self.world, 0, True, {}

        row, col = self.player_location
        if action == 0:      # up
            row -= 1
        elif action == 1:    # right
            col += 1
        elif action == 2:    # down
            row += 1
        elif action == 3:    # left
            col -= 1

        # Moves that would leave the grid are ignored.
        if 0 <= row < self.height and 0 <= col < self.width:
            self.world[self.player_location[0], self.player_location[1]] = 0
            self.player_location[:] = [row, col]
            self.world[row, col] = 1

        # Player one may have walked across player two's cell and cleared
        # its marker on the way out; put the marker back.
        self.world[self.player_two_location[0], self.player_two_location[1]] = 1

        self.current_step += 1

        if self.player_location == self.exit_location:
            self.state = "W"
            print('You won!')
            reward = 200
            done = True
        elif self.current_step >= self.max_step:
            self.state = "L"
            print('You lost')
            reward = -200
            done = True
        else:
            reward = -1
            done = False

        obs = self.world
        return obs, reward, done, {}

    def render(self, mode="human"):
        """Print the grid; ``mode`` is accepted but unused."""
        print(self.world)
        

In [54]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2


# Longer PPO2 training run with a smaller learning rate.
# NOTE(review): this trains on MazeEnv even though MazeMultiAgentEnv is
# defined in the preceding cell - confirm which environment is intended.
env = DummyVecEnv([MazeEnv])
model = PPO2(policy=MlpPolicy, env=env, learning_rate=0.001)
model.learn(total_timesteps=500_000)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
[[1 0 0 2]
 [0 0 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[1 0 0 2]
 [0 0 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 1 0 2]
 [0 0 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 1 2 0]
 [0 0 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 1 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 1 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 0 1 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 0 1 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 0 1 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 0 1 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 0 1 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 2 0 0]
 [0 0 1 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 2 0 0]
 [0 1 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 1 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 1 2 0]
 [0 0 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 1 2 0]
 [0 0 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[1 0 2 0]
 [0 0 0 0]

FileNotFoundError: [Errno 2] No such file or directory: 'render/render.txt'

In [82]:
# Scratch arithmetic; evaluates to 0.001.
10 / 10_000

0.001

In [95]:
# Draw one random float from NumPy's global random generator.
np.random.random()

0.9913651379178319