https://medium.com/analytics-vidhya/how-to-create-a-custom-gym-environment-with-multiple-agents-f368d13582ee  
https://medium.com/analytics-vidhya/custom-gym-environment-with-agents-that-collaborate-4f96ef898a2a

In [10]:
import gym
from gym import spaces
import numpy as np

In [108]:
class MazeEnv(gym.Env):
    """Single-agent grid maze: walk to the exit without stepping on a trap.

    Cell codes stored in ``self.world``: 0 empty, 1 player, 2 exit, 3 trap.
    Action codes: 0 up, 1 right, 2 down, 3 left.

    Rewards: +200 for reaching the exit, -200 for stepping on a trap or
    running out of steps, -1 for every other step.
    """

    # (row delta, column delta) applied to the player for each action code.
    _MOVES = {0: (-1, 0), 1: (0, 1), 2: (1, 0), 3: (0, -1)}

    def __init__(self, width=10, height=12):
        """Create a maze of ``height`` rows by ``width`` columns."""
        self.width = width
        self.height = height
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Box(low=0,
                                            high=3,
                                            shape=(height, width),
                                            dtype=np.int16)
        self.reward_range = (-200, 200)
        self.current_episode = 0
        self.success_episode = []

    def _random_empty_cell(self):
        """Return ``[row, col]`` of a uniformly chosen cell whose code is 0."""
        rows, cols = np.where(self.world == 0)
        if len(rows) == 0:
            raise ValueError("maze has no empty cell left to place an object on")
        pick = np.random.randint(len(rows))
        # Plain ints keep the list comparisons in step() simple.
        return [int(rows[pick]), int(cols[pick])]

    def _move_player(self, action):
        """Apply ``action`` to the player; moves off the grid are ignored."""
        for code, (d_row, d_col) in self._MOVES.items():
            if action == code:
                row = self.player_location[0] + d_row
                col = self.player_location[1] + d_col
                if 0 <= row < self.height and 0 <= col < self.width:
                    self.world[self.player_location[0], self.player_location[1]] = 0
                    self.player_location[0] = row
                    self.player_location[1] = col
                    self.world[row, col] = 1
                return

    def reset(self):
        """Start a new episode and return the freshly generated world grid."""
        self.current_step = 0
        self.max_step = 30

        # "P" = playing, "W" = won, "L" = lost.
        self.state = "P"

        self.world = np.zeros((self.height, self.width), dtype=int)

        # Exit sits in the bottom-right corner.
        self.exit_location = [self.height - 1, self.width - 1]
        self.world[self.exit_location[0], self.exit_location[1]] = 2

        # Five traps on distinct empty cells (each placement marks its cell,
        # so the next draw cannot pick it again).
        self.trap_locations = []
        for _ in range(5):
            trap_location = self._random_empty_cell()
            self.trap_locations.append(trap_location)
            self.world[trap_location[0], trap_location[1]] = 3

        # Player starts on any remaining empty cell.
        self.player_location = self._random_empty_cell()
        self.world[self.player_location[0], self.player_location[1]] = 1

        return self.world

    def step(self, action):
        """Advance one step.

        Returns the gym 4-tuple ``(obs, reward, done, info)`` where ``obs`` is
        the world grid (the same array object that is mutated in place).
        """
        self._move_player(action)
        self.current_step += 1

        if self.player_location == self.exit_location:
            self.state = "W"
            reward = 200
            done = True
        elif self.player_location in self.trap_locations:
            self.state = "L"
            reward = -200
            done = True
        elif self.current_step >= self.max_step:
            # ">=" so an overshoot of the step budget still ends the episode.
            self.state = "L"
            reward = -200
            done = True
        else:
            # Always bind reward/done so step() can never fail with an
            # UnboundLocalError.
            reward = -1
            done = False

        obs = self.world
        return obs, reward, done, {}

    def render(self, mode="human"):
        """Print the world grid followed by a blank line.

        ``mode`` is accepted for compatibility with ``gym.Env.render`` and
        wrappers that forward it; it is otherwise unused.
        """
        print(self.world)
        print()
        

In [109]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

# DummyVecEnv expects a list of zero-argument factories; the class itself
# serves as one because MazeEnv() needs no arguments.
env = DummyVecEnv([MazeEnv])
model = PPO2(MlpPolicy, env, learning_rate=0.01, verbose=1)
# Train for 10k timesteps, logging every 10th update.
model.learn(total_timesteps=10_000, log_interval=10)

-------------------------------------
| approxkl           | 0.049079027  |
| clipfrac           | 0.5234375    |
| explained_variance | 0.00144      |
| fps                | 369          |
| n_updates          | 1            |
| policy_entropy     | 1.3408334    |
| policy_loss        | -0.052190557 |
| serial_timesteps   | 128          |
| time_elapsed       | 0            |
| total_timesteps    | 128          |
| value_loss         | 9128.413     |
-------------------------------------
--------------------------------------
| approxkl           | 0.019682202   |
| clipfrac           | 0.21484375    |
| explained_variance | -0.00135      |
| fps                | 1662          |
| n_updates          | 10            |
| policy_entropy     | 0.93388456    |
| policy_loss        | 0.00067575416 |
| serial_timesteps   | 1280          |
| time_elapsed       | 1.08          |
| total_timesteps    | 1280          |
| value_loss         | 6125.9043     |
--------------------------------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x24b60fb4088>

In [None]:
def check_pygame_exit():
    """Drain the pygame event queue and shut pygame down on a QUIT event.

    NOTE(review): the return value is True in every case, including after
    pygame.quit() has been called -- confirm callers do not expect False
    when the window was closed.
    """
    for pending in pygame.event.get():
        if pending.type != pygame.QUIT:
            continue
        pygame.quit()
    return True

In [121]:
# Play one episode with the trained policy, printing the grid after each step.
obs = env.reset()
dones = False
while not dones:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

[[0 0 0 0 0 0 0 3 3 0]
 [0 0 0 3 0 0 0 0 3 1]
 [0 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 2]]

[[0 0 0 0 0 0 0 3 3 0]
 [0 0 0 3 0 0 0 0 3 0]
 [0 3 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 2]]

[[0 0 0 0 0 0 0 3 3 0]
 [0 0 0 3 0 0 0 0 3 0]
 [0 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 2]]

[[0 0 0 0 0 0 0 3 3 0]
 [0 0 0 3 0 0 0 0 3 0]
 [0 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0

In [65]:
import numpy as np

In [73]:
a = np.zeros([5, 6], dtype=int); a

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [None]:
def convert_coord_to_matrix(point, matrix):
    """Map an (x, y) point with y counted from the bottom to a (row, col) index.

    The row is ``matrix.shape[0] - y - 1`` and the column is ``x``.
    """
    row = matrix.shape[0] - point[1] - 1
    col = point[0]
    return (row, col)

In [74]:
point = [3, 1]

In [75]:
a[(a.shape[0] - point[1] -1, point[0])] = 1

In [76]:
a

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [53]:
class MazeMultiAgentEnv(gym.Env):
    """Grid maze with two players and an exit in the bottom-right corner.

    Cell codes stored in ``self.world``: 0 empty, 1 player, 2 exit.
    Action codes: 0 up, 1 right, 2 down, 3 left.

    NOTE(review): the action space is a single ``Discrete(4)``, so ``step``
    can only drive one player. ``self.player_location`` is aliased to player
    one; player two is placed on the grid but never moves. Confirm this is
    the intended interim behaviour.
    """

    # (row delta, column delta) applied to the player for each action code.
    _MOVES = {0: (-1, 0), 1: (0, 1), 2: (1, 0), 3: (0, -1)}

    def __init__(self, width=10, height=12):
        """Create a maze of ``height`` rows by ``width`` columns."""
        self.width = width
        self.height = height
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Box(low=0,
                                            high=2,
                                            shape=(height, width),
                                            dtype=np.int16)
        self.reward_range = (-200, 200)
        self.current_episode = 0
        self.success_episode = []

    def _place_player(self):
        """Draw random spawn cells until an empty one is found, mark it, return it."""
        while True:
            location = [np.random.randint(self.height - 2), np.random.randint(self.width - 2)]
            if self.world[location[0], location[1]] == 0:
                # Assignment (not comparison) so the next player cannot
                # spawn on the same cell.
                self.world[location[0], location[1]] = 1
                return location

    def _move_player(self, action):
        """Apply ``action`` to the controlled player; off-grid moves are ignored."""
        for code, (d_row, d_col) in self._MOVES.items():
            if action == code:
                row = self.player_location[0] + d_row
                col = self.player_location[1] + d_col
                if 0 <= row < self.height and 0 <= col < self.width:
                    self.world[self.player_location[0], self.player_location[1]] = 0
                    # Mutate in place: player_location shares its list with
                    # player_one_location, so both stay in sync.
                    self.player_location[0] = row
                    self.player_location[1] = col
                    self.world[row, col] = 1
                return

    def reset(self):
        """Start a new episode and return the freshly generated world grid.

        Raises:
            ValueError: if the spawn area (the grid minus its last two rows
                and columns) cannot hold two distinct players.
        """
        self.current_step = 0
        self.max_step = 30

        # "P" = playing, "W" = won, "L" = lost.
        self.state = "P"

        self.world = np.zeros((self.height, self.width), dtype=int)

        spawn_rows = self.height - 2
        spawn_cols = self.width - 2
        if spawn_rows < 1 or spawn_cols < 1 or spawn_rows * spawn_cols < 2:
            # Without this guard the second placement loop would never end.
            raise ValueError("maze is too small to place two players")

        self.player_one_location = self._place_player()
        self.player_two_location = self._place_player()

        # step() controls this player (see the class docstring).
        self.player_location = self.player_one_location

        self.exit_location = [self.height - 1, self.width - 1]
        self.world[self.exit_location[0], self.exit_location[1]] = 2

        return self.world

    def step(self, action):
        """Advance one step.

        Returns the gym 4-tuple ``(obs, reward, done, info)`` where ``obs`` is
        the world grid (the same array object that is mutated in place).
        """
        self._move_player(action)
        self.current_step += 1

        if self.player_location == self.exit_location:
            self.state = "W"
            print(f'You won!')
            reward = 200
            done = True
        elif self.current_step >= self.max_step:
            # ">=" so an overshoot of the step budget still ends the episode.
            self.state = "L"
            print(f'You lost')
            reward = -200
            done = True
        else:
            # Always bind reward/done so step() can never fail with an
            # UnboundLocalError.
            reward = -1
            done = False

        obs = self.world
        return obs, reward, done, {}

    def render(self, mode="human"):
        """Print the world grid.

        ``mode`` is accepted for compatibility with ``gym.Env.render`` and
        wrappers that forward it; it is otherwise unused.
        """
        print(self.world)
        

In [54]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2


# DummyVecEnv expects a list of zero-argument factories; the class itself
# serves as one because MazeEnv() needs no arguments.
env = DummyVecEnv([MazeEnv])
model = PPO2(MlpPolicy, env, learning_rate=0.001)
# Long training run: half a million timesteps.
model.learn(total_timesteps=500000)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
[[1 0 0 2]
 [0 0 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[1 0 0 2]
 [0 0 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 1 0 2]
 [0 0 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 1 2 0]
 [0 0 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 1 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 1 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 0 1 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 0 1 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 0 1 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 0 1 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 0 1 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 2 0 0]
 [0 0 1 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 2 0 0]
 [0 1 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 0 2 0]
 [0 1 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 1 2 0]
 [0 0 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[0 1 2 0]
 [0 0 0 0]
 [0 3 4 3]
 [0 4 0 0]]
[[1 0 2 0]
 [0 0 0 0]

FileNotFoundError: [Errno 2] No such file or directory: 'render/render.txt'

In [82]:
10 / 10_000

0.001

In [95]:
np.random.random()

0.9913651379178319

### Test FOV

In [12]:
import numpy as np

In [13]:
def update_fov(some_grid, point):
    """Flag ``point`` with a 1 in a grid centred on a player fixed at (2, 2).

    The x offset from the player selects the column, and the y offset is
    flipped to select the row. The grid is modified in place and returned.
    """
    player_x, player_y = 2, 2
    dx = point[0] - player_x
    dy = player_y - point[1]
    some_grid[2 + dy, 2 + dx] = 1
    return some_grid

In [15]:
# Put the player marker (9) at the centre of a 5x5 grid, flag the point
# (x=1, y=2) via update_fov, then display the result.
grid = np.zeros((5, 5))
grid[2, 2] = 9
grid = update_fov(grid, (1, 2))
grid

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 1., 9., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [35]:
grid = np.zeros([2, 5, 5]); grid

array([[[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]])

In [40]:
grid[0][-1] = 1

In [41]:
grid

array([[[1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1.]],

       [[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]])

In [38]:
for i in range(5):
    grid[0][i, 0] = 1
grid

array([[[1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]])

In [26]:
grid.flatten()

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 9., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0.])

In [27]:
point = (2, 2)

In [51]:
import itertools  # imported here so the cell does not depend on a later one

# Every (row, col) index pair of the 2-D grid. The second range uses the
# column count (shape[1]) so non-square grids are enumerated correctly.
c = list(itertools.product(range(grid.shape[0]), range(grid.shape[1])))
c

[(0, 0),
 (0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (1, 0),
 (1, 1),
 (1, 2),
 (1, 3),
 (1, 4),
 (2, 0),
 (2, 1),
 (2, 2),
 (2, 3),
 (2, 4),
 (3, 0),
 (3, 1),
 (3, 2),
 (3, 3),
 (3, 4),
 (4, 0),
 (4, 1),
 (4, 2),
 (4, 3),
 (4, 4)]

In [55]:
import itertools

# Collect the cells of a 5x5 grid that lie within Manhattan distance 2 of
# the player, excluding the player's own cell.
player = (2, 2)
list1 = [0, 1, 2, 3, 4]
c = list(itertools.product(list1, repeat=2))
d = [
    cell
    for cell in c
    if cell != (2, 2)
    and abs(cell[0] - player[0]) + abs(cell[1] - player[1]) <= 2
]

# fov = [grid[i] for i in d]

In [56]:
grid

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 1., 9., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [63]:
# Cells of a 5x5 grid within Manhattan distance 2 of the player, minus the
# centre cell (2, 2).
selected_coordinates = []
for row in range(5):
    for col in range(5):
        if abs(row - player[0]) + abs(col - player[1]) <= 2:
            selected_coordinates.append((row, col))
selected_coordinates.remove((2, 2))

In [67]:
np.float64

numpy.float64

In [8]:
c = (2, 2)
t1 = (2, 4)

In [9]:
(t1[0] - c[0], c[1] - t1[1])

(0, -2)

In [99]:
grid = np.zeros([5,5]); grid


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [100]:
player = (2, 2)
point = (0, 0)

In [11]:
grid[2+-2, 2+0] = 1; grid

array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 9., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [68]:
import numpy as np


class Point:
    """A 2-D position that carries a value.

    Attributes set on every instance: ``x``, ``y``, ``value`` and
    ``coordinates`` (an integer numpy array holding ``[x, y]``).
    """

    def __init__(self, x, y, value):
        self.value = value
        self.update(x, y)

    def update(self, x, y):
        """Move the point to ``(x, y)`` and rebuild ``coordinates``."""
        self.x, self.y = x, y
        self.coordinates = np.array((x, y), dtype=int)

In [69]:
food = Point(1, 1, 4)

In [70]:
test = food.value / 2

In [73]:
type(test)

float

In [74]:
from gym import spaces

In [77]:
test = spaces.Box(low=-1, high=1, shape=(4, ), dtype="float32")

In [2]:
from stable_baselines.common import make_vec_env
from Field.EnvironmentTest import Environment as MazeEnv

env = make_vec_env(MazeEnv, n_envs=4)



pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


In [7]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines.common import make_vec_env

from stable_baselines import PPO2, DQN, SAC
from Field.EnvironmentTest import Environment as MazeEnv


# Four copies of the 10x12 maze stepped together as one vectorised env.
maze_kwargs = dict(width=10, height=12)
env = make_vec_env(MazeEnv, n_envs=4, env_kwargs=maze_kwargs)

# model = DQN(MlpPolicyDQN, env, learning_rate=0.0001, verbose=1, tensorboard_log="logging")
model = PPO2(MlpPolicy, env, learning_rate=0.001, verbose=1, tensorboard_log="logging")

# Train for 50k timesteps, logging every 100th update.
model.learn(total_timesteps=50_000, log_interval=100)

# obs = env.reset()
# while True:
#     action, _states = model.predict(obs)
#     obs, rewards, dones, info = env.step(action)
#     if not env.render():
#         break

Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
--------------------------------------
| approxkl           | 0.00027311133 |
| clipfrac           | 0.0           |
| ep_len_mean        | 21            |
| ep_reward_mean     | -400          |
| explained_variance | -7.8e-05      |
| fps                | 1279          |
| n_updates          | 1             |
| policy_entropy     | 1.385964      |
| policy_loss        | -0.0016631782 |
| serial_timesteps   | 128           |
| time_elapsed       | 0             |
| total_timesteps    | 512           |
| value_loss         | 33101.582     |
--------------------------------------


<stable_baselines.ppo2.ppo2.PPO2 at 0x20814c41c88>

In [10]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.deepq.policies import MlpPolicy as MlpPolicyDQN
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines.common import make_vec_env

from stable_baselines import PPO2, DQN
from Field.EnvironmentTest import Environment as MazeEnv

# Train PPO2 on four copies of the 10x12 maze stepped together.
# env = DummyVecEnv([lambda: MazeEnv(width=10, height=12)])
env = make_vec_env(MazeEnv, n_envs=4, env_kwargs=dict(width=10, height=12))

# model = DQN(MlpPolicyDQN, env, learning_rate=0.0001, verbose=1, tensorboard_log="logging")
model = PPO2(MlpPolicy, env, learning_rate=0.001, verbose=1)

model.learn(5_000, log_interval=1)


# Watch the trained policy on a single, un-vectorised environment.
env = MazeEnv(width=10, height=12)
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    # Presumably render() returns a falsy value once the window is closed --
    # TODO confirm against Field.EnvironmentTest.Environment.
    if not env.render():
        break
    if dones:
        # A raw gym env is not reset automatically (unlike a VecEnv), so
        # start a new episode instead of stepping a finished one.
        obs = env.reset()

--------------------------------------
| approxkl           | 0.00023086624 |
| clipfrac           | 0.0           |
| ep_len_mean        | 22            |
| ep_reward_mean     | -329          |
| explained_variance | -8.07e-05     |
| fps                | 1422          |
| n_updates          | 1             |
| policy_entropy     | 1.386037      |
| policy_loss        | -0.0015233436 |
| serial_timesteps   | 128           |
| time_elapsed       | 0             |
| total_timesteps    | 512           |
| value_loss         | 32989.25      |
--------------------------------------
--------------------------------------
| approxkl           | 0.00020940232 |
| clipfrac           | 0.0           |
| ep_len_mean        | 21.7          |
| ep_reward_mean     | -347          |
| explained_variance | -1.9e-05      |
| fps                | 4063          |
| n_updates          | 2             |
| policy_entropy     | 1.3840895     |
| policy_loss        | -2.491707e-05 |
| serial_timesteps   | 25

In [9]:
env.render()

Render not defined for <stable_baselines.common.vec_env.dummy_vec_env.DummyVecEnv object at 0x00000208186DD308>


In [1]:
import numpy as np

In [4]:
thing = np.random.randint(0, 4, (12, 10)); thing

array([[1, 3, 0, 1, 1, 1, 1, 1, 2, 3],
       [0, 0, 2, 1, 3, 2, 1, 1, 1, 3],
       [1, 3, 1, 1, 3, 1, 0, 0, 3, 0],
       [3, 3, 0, 1, 1, 2, 1, 1, 1, 3],
       [2, 1, 0, 2, 0, 2, 2, 2, 3, 0],
       [0, 0, 3, 1, 3, 3, 3, 0, 2, 1],
       [0, 3, 0, 2, 1, 2, 3, 0, 1, 2],
       [0, 2, 2, 0, 0, 2, 3, 0, 0, 2],
       [3, 1, 1, 2, 2, 2, 2, 0, 0, 2],
       [0, 0, 0, 1, 0, 0, 3, 1, 2, 2],
       [3, 3, 3, 3, 3, 3, 0, 1, 3, 1],
       [3, 2, 1, 3, 2, 2, 0, 1, 3, 0]])