In [None]:
import gym
import tensorflow as tf
import numpy as np

In [2]:
def make_model(input_shape, outputs, neurons, out_activation):
    return tf.keras.Sequential([
        tf.keras.layers.Input(input_shape),
        tf.keras.layers.Dense(neurons, activation='tanh'),
        tf.keras.layers.Dense(neurons, activation='tanh'),
        tf.keras.layers.Dense(outputs, activation=out_activation)
    ])

In [3]:
class Critic:
    def __init__(self, input_shape, neurons, lr):
        self.model = make_model(input_shape, 1, neurons, None)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    
    @tf.function
    def advantage(self, obs, obs_last, rewards, done, discount):
        v = self.model(obs)
        v_last = self.model(tf.reshape(obs_last, (1, -1))) * (1 - tf.cast(done, tf.float32))
        v = v[:,0]
        v_last = v_last[:,0]
        vn = tf.concat([v[1:], v_last], axis=0)

        adv = rewards + tf.stop_gradient(discount * vn) - v

        return adv

    def minimize(self, loss, tape):
        self.optimizer.minimize(loss, self.model.trainable_variables, tape=tape)

In [4]:
class DiscreteActor:
    def __init__(self, input_shape, actions, neurons, lr):
        self.model = make_model(input_shape, actions, neurons, 'softmax')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    @tf.function
    def prob(self, obs, actions):
        return self.probs_prob(obs, actions)[1]
    
    @tf.function
    def probs_prob(self, obs, actions):
        outs = self.model(obs)
        actions = tf.reshape(actions, (-1, 1))
        return outs, tf.gather_nd(outs, actions, batch_dims=1)

    def act(self, obs, explore):
        out = self.model(obs.reshape((1, -1))).numpy()[0]
        if explore:
            return np.random.choice(out.shape[0], p=out)
        else:
            return np.argmax(out)

    def minimize(self, loss, tape):
        self.optimizer.minimize(loss, self.model.trainable_variables, tape=tape)

In [None]:
def test_actor(actor, test_env, num, show=False):
    rewards_sum = 0
    for _ in range(num):
        obs = test_env.reset()
        done = False
        
        if show:
            test_env.render()

        while not done:
            action = actor.act(obs, explore=False)
            obs, reward, done, _ = test_env.step(action)

            if show:
                test_env.render()

            rewards_sum += reward
    return rewards_sum / num

In [None]:
env = gym.make('CartPole-v1')

In [None]:
env.observation_space

In [None]:
env.action_space

In [None]:
env.reset()

In [None]:
env.render()

In [None]:
train_env = gym.make('CartPole-v1')
test_env = gym.make('CartPole-v1')

In [None]:
def train_agent_aac(train_env, test_env, max_size, T, neurons, lr, discount):
    actor = DiscreteActor(train_env.observation_space.shape, train_env.action_space.n, neurons, lr)
    critic = Critic(train_env.observation_space.shape, neurons, lr)
    
    data = []
    
    total_critic_loss = 0
    
    rewards = test_actor(actor, test_env, 5)
    print(0, rewards, sep='\t')

    obs = train_env.reset()
    
    for step in range(max_size):
        action = actor.act(obs, explore=True)
        next_obs, reward, done, info = train_env.step(action)
        
        data.append({'obs': obs, 'action': action, 'reward': reward})
        
        if len(data) == T or done:
            actions = np.array([d['action'] for d in data])
            observations = np.array([d['obs'] for d in data], np.float32)
            rewards = np.array([d['reward'] for d in data], np.float32)
            
            is_done = done and not info.get('TimeLimit.truncated', False)
            
            with tf.GradientTape(persistent=True) as tape:
                advantage = critic.advantage(observations, next_obs, rewards, is_done, discount)
                prob = actor.prob(observations, actions)
                
                critic_loss = tf.reduce_mean(advantage ** 2)
                actor_loss = tf.reduce_mean(-tf.math.log(prob) * advantage)

            actor.minimize(actor_loss, tape)
            critic.minimize(critic_loss, tape)
            
            total_critic_loss += critic_loss.numpy()
            data = []

        if done:
            obs = train_env.reset()
        else:
            obs = next_obs

        if (step + 1) % 1000 == 0:
            rewards = test_actor(actor, test_env, 5)
            print(step, rewards, total_critic_loss / 1000, sep='\t')
            total_critic_loss = 0

    return actor

In [None]:
actor = train_agent_aac(train_env, test_env, 100000, 25, 64, 0.001, 0.6)

In [None]:
test_actor(actor, env, 5, show=True)

In [None]:
env.close()

In [6]:
from pettingzoo.mpe import simple_spread_v2

In [None]:
env = simple_spread_v2.env()

In [None]:
env.reset()

In [None]:
env.agents

In [None]:
env.observation_space(env.agents[2])

In [None]:
env.render()

In [None]:
env.reset()
env.render()

for agent in env.agent_iter():
    obs, reward, done, _  = env.last()
    action = env.action_space(agent).sample()
    env.step(None if done else action)
    env.render()

In [7]:
def test_multiagent(actors, env, n, show=False):
    rewards = 0
    for _ in range(n):
        env.reset()
        if show:
            env.render()

        for agent in env.agent_iter():
            obs, reward, done, _  = env.last()
            action = actors[agent].act(obs, explore=False)
            env.step(None if done else action)
            if show:
                env.render()
            rewards += reward
    return rewards / n

In [None]:
def train_agent_iaac(train_env, test_env, max_size, T, neurons, lr, discount):
    train_env.reset()
    agents = train_env.agents
    actors = {
        agent: DiscreteActor(train_env.observation_space(agent).shape, train_env.action_space(agent).n, neurons, lr)
        for agent in agents
    }
    critics = {
        agent: Critic(train_env.observation_space(agent).shape, neurons, lr)
        for agent in agents
    }
    
    data = []
    
    total_critic_loss = 0
    
    rewards = test_multiagent(actors, test_env, 5)
    print(0, rewards, sep='\t')

    obs = {agent: train_env.observe(agent) for agent in agents}
    
    for step in range(max_size):
        actions = {}
        for _ in range(len(agents)):
            agent = train_env.agent_selection
            action = actors[agent].act(obs[agent], explore=True)
            train_env.step(action)
            actions[agent] = action

        next_obs = {agent: train_env.observe(agent) for agent in agents}
        rewards = train_env.rewards
        dones = train_env.dones
        
        data.append({'obs': obs, 'actions': actions, 'rewards': rewards})
        
        if len(data) == T or all(dones.values()):
            for agent in agents:
                actor = actors[agent]
                critic = critics[agent]
                
                actions = np.array([d['actions'][agent] for d in data])
                observations = np.array([d['obs'][agent] for d in data], np.float32)
                rewards = np.array([d['rewards'][agent] for d in data], np.float32)
                next_observations = next_obs[agent]

                is_done = False

                with tf.GradientTape(persistent=True) as tape:
                    advantage = critic.advantage(observations, next_observations, rewards, is_done, discount)
                    prob = actor.prob(observations, actions)

                    critic_loss = tf.reduce_mean(advantage ** 2)
                    actor_loss = tf.reduce_mean(-tf.math.log(prob) * advantage)

                actor.minimize(actor_loss, tape)
                critic.minimize(critic_loss, tape)
            
                total_critic_loss += critic_loss.numpy()
            data = []

        if all(dones.values()):
            train_env.reset()
            obs = {agent: train_env.observe(agent) for agent in agents}
        else:
            obs = next_obs

        if (step + 1) % 1000 == 0:
            rewards = test_multiagent(actors, test_env, 5)
            print(step, rewards, total_critic_loss / 1000, sep='\t')
            total_critic_loss = 0

    return actors

In [8]:
train_env = simple_spread_v2.env()
test_env = simple_spread_v2.env()

In [None]:
actors = train_agent_iaac(train_env, test_env, 50000, 25, 32, 0.001, 0.6)

In [None]:
done

In [None]:
def train_agent_caac(train_env, test_env, max_size, T, neurons, lr, discount):
    train_env.reset()
    agents = train_env.agents
    actors = {
        agent: DiscreteActor(train_env.observation_space(agent).shape, train_env.action_space(agent).n, neurons, lr)
        for agent in agents
    }
    critic = Critic((sum(train_env.observation_space(agent).shape[0] for agent in agents),), neurons, lr)
    
    data = []
    
    total_critic_loss = 0
    
    rewards = test_multiagent(actors, test_env, 5)
    print(0, rewards, sep='\t')

    obs = {agent: train_env.observe(agent) for agent in agents}
    
    for step in range(max_size):
        actions = {}
        for _ in range(len(agents)):
            agent = train_env.agent_selection
            action = actors[agent].act(obs[agent], explore=True)
            train_env.step(action)
            actions[agent] = action

        next_obs = {agent: train_env.observe(agent) for agent in agents}
        rewards = train_env.rewards
        dones = train_env.dones
        
        data.append({'obs': obs, 'actions': actions, 'rewards': rewards})
        
        if len(data) == T or all(dones.values()):
            join_observations = np.array([
                np.concatenate([d['obs'][agent] for agent in agents])
                for d in data], np.float32
            )
            join_next_observations = np.concatenate([obs[agent] for agent in agents])
            rewards = np.array([sum(d['rewards'][agent] for agent in agents) for d in data], np.float32)
            is_done = False

            with tf.GradientTape() as tape:
                advantage = critic.advantage(join_observations, join_next_observations, rewards, is_done, discount)
                critic_loss = tf.reduce_mean(advantage ** 2)

            critic.minimize(critic_loss, tape)
            total_critic_loss += critic_loss.numpy()
            
            for agent in agents:
                actor = actors[agent]
                
                actions = np.array([d['actions'][agent] for d in data])
                observations = np.array([d['obs'][agent] for d in data], np.float32)

                with tf.GradientTape(persistent=True) as tape:
                    prob = actor.prob(observations, actions)
                    actor_loss = tf.reduce_mean(-tf.math.log(prob) * advantage)

                actor.minimize(actor_loss, tape)
    
            data = []

        if all(dones.values()):
            train_env.reset()
            obs = {agent: train_env.observe(agent) for agent in agents}
        else:
            obs = next_obs

        if (step + 1) % 1000 == 0:
            rewards = test_multiagent(actors, test_env, 5)
            print(step, rewards, total_critic_loss / 1000, sep='\t')
            total_critic_loss = 0

    return actors

In [None]:
actors = train_agent_caac(train_env, test_env, 30000, 25, 32, 0.001, 0.6)

In [None]:
test_multiagent(actors, env, 5, True)

In [19]:
def train_agent_maac(train_env, test_env, max_size, T, neurons, lr, discount, alpha, beta):
    train_env.reset()
    agents = train_env.agents
    actors = {
        agent: DiscreteActor(train_env.observation_space(agent).shape, train_env.action_space(agent).n, neurons, lr)
        for agent in agents
    }
    critics = {
        agent: Critic((
            sum(train_env.observation_space(agent).shape[0] for agent in agents),
        ), neurons, lr)
        for agent in agents
    }
    
    data = []
    
    total_critic_loss = 0
    
    rewards = test_multiagent(actors, test_env, 5)
    print(0, rewards, sep='\t')

    obs = {agent: train_env.observe(agent) for agent in agents}
    
    for step in range(max_size):
        actions = {}
        for _ in range(len(agents)):
            agent = train_env.agent_selection
            action = actors[agent].act(obs[agent], explore=True)
            train_env.step(action)
            actions[agent] = action

        next_obs = {agent: train_env.observe(agent) for agent in agents}
        rewards = train_env.rewards
        dones = train_env.dones
        
        data.append({'obs': obs, 'actions': actions, 'rewards': rewards})
        
        if len(data) == T or all(dones.values()):
            for agent in agents:
                weights = np.array([1 if a == agent else alpha for a in agents])
                
                actor = actors[agent]
                critic = critics[agent]
                
                actions = np.array([d['actions'][agent] for d in data])
                join_observations = np.array([
                    np.concatenate([d['obs'][a] * w for a, w in zip(agents, weights)]) for d in data
                ], np.float32)
                observations = np.array([d['obs'][agent] for d in data], np.float32)

                rewards = np.array([
                    sum(d['rewards'][a] * w for a, w in zip(agents, weights)) for d in data
                ], np.float32)

                next_observations = np.concatenate([next_obs[a] * w for a, w in zip(agents, weights)])
                
                is_done = False

                with tf.GradientTape(persistent=True) as tape:
                    advantage = critic.advantage(join_observations, next_observations, rewards, is_done, discount)
                    probs, prob = actor.probs_prob(observations, actions)

                    critic_loss = tf.reduce_mean(advantage ** 2)
                    entropy_reward = beta * tf.reduce_sum(probs * tf.math.log(probs))
                    actor_loss = tf.reduce_mean(-tf.math.log(prob) * advantage) + tf.reduce_mean(entropy_reward)

                actor.minimize(actor_loss, tape)
                critic.minimize(critic_loss, tape)
            
                total_critic_loss += critic_loss.numpy()
            data = []

        if all(dones.values()):
            train_env.reset()
            obs = {agent: train_env.observe(agent) for agent in agents}
        else:
            obs = next_obs

        if (step + 1) % 1000 == 0:
            rewards = test_multiagent(actors, test_env, 5)
            print(step, rewards, total_critic_loss / 1000, sep='\t')
            total_critic_loss = 0

    return actors

In [14]:
actors = train_agent_maac(train_env, test_env, 30000, 25, 32, 0.001, 0.6, 0.5)

0	-134.8897738898681
999	-129.13553212949284	1.2905029108524322
1999	-128.98563809983168	1.4688297519683837
2999	-143.09220432865473	0.990796396613121
3999	-159.68138847413545	0.9376717278957367
4999	-124.61196408413836	0.5388937526494264
5999	-135.32937215010855	0.4867990537881851
6999	-148.6854266184112	0.4052819287488237
7999	-120.15791518577217	0.1562084420444444
8999	-122.16176772873166	0.06673483439628035
9999	-113.89447278377565	0.08275051973224617
10999	-128.32040651694618	0.06665869240323082
11999	-112.32564362998139	0.07799159152293578
12999	-119.91842157797491	0.09450711764302104
13999	-111.93774535607285	0.05881499941088259
14999	-121.31306001617388	0.05705398432444781
15999	-129.8663460574944	0.036655323027167466
16999	-107.3982910533615	0.05569755196385086
17999	-123.82905779740133	0.03605788265122101
18999	-113.90873900679318	0.03762549691088497
19999	-113.21296514046512	0.037043443861417474
20999	-114.95416567078514	0.044506456528324634
21999	-115.40773785526122	0.04588

In [17]:
test_multiagent(actors, test_env, 5, True)

-94.62816200230067

In [21]:
actors = train_agent_maac(train_env, test_env, 50000, 25, 32, 0.001, 0.6, 0.5, 0.01)

0	-168.55922004422385
999	-135.77024016232767	1.7445443429946899
1999	-113.6593074224512	1.2962639441490174
2999	-157.97933790876465	0.9959262154698372
3999	-149.08666353911335	0.8351412367224693
4999	-142.59356224381554	0.6728708162903786
5999	-133.11243207695796	0.36074586003646253
6999	-145.16797760445826	0.30684090907638895
7999	-158.6709397320401	0.13365346635971218
8999	-117.5462293117617	0.14543445369135588
9999	-116.74002527252955	0.09946269047586247
10999	-143.0268657272883	0.05241031979955733
11999	-127.25677439194615	0.09075423493329436
12999	-110.44828184685218	0.06247028256300837
13999	-109.40661180612801	0.06855511072138325
14999	-129.34704121854003	0.08645762720936909
15999	-113.89474610017248	0.06476798332203179
16999	-108.45280691605039	0.08005596605082974
17999	-104.26223581172681	0.1163931976344902
18999	-115.76644997196999	0.06108395947841928
19999	-119.24109556268736	0.05816329188272357
20999	-121.86490348728906	0.056493203070713205
21999	-123.82078853419151	0.0767

In [22]:
test_multiagent(actors, test_env, 5, True)

-106.09781848248687