In [1]:
import numpy as np 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input,Dense
from tensorflow.keras.models import Model
import gym

In [2]:
# Make float32 the Keras default float type for layers created after this cell.
# NOTE(review): get_actor()/get_critic() later switch this default to float64
# while they build their models, so this setting does not apply to them.
tf.keras.backend.set_floatx('float32')

In [3]:
from tensorflow.keras import layers

In [4]:
import pettingzoo
from pettingzoo.mpe import simple_spread_v1

In [21]:
# Build the simple_spread environment through its parallel API: the training
# loop resets it to get a dict of per-agent observations and steps it with a
# dict of per-agent actions.
# NOTE(review): max_frames=100 presumably caps the episode length, but the
# recorded run reports 102 steps per episode -- confirm the exact semantics.
env =simple_spread_v1.parallel_env(max_frames=100)




In [22]:

# Module-level action limits. Agent stores its own copies of the same values
# and nothing else in the visible cells reads these globals.
lower_bound, upper_bound = -1, 1

In [6]:
class OUActionNoise:
    """Callable noise source whose samples follow an Ornstein-Uhlenbeck update.

    Every call produces a new sample from the previous one, so consecutive
    samples are correlated. The previous sample is kept in ``x_prev``.

    Args:
        mean: array the process is pulled towards; its shape is the shape of
            every sample.
        std_deviation: scale applied to the Gaussian increment.
        theta: strength of the pull towards ``mean``.
        dt: time step used by the update.
        x_initial: optional starting value; zeros shaped like ``mean`` are
            used when it is ``None``.
    """

    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.mean = mean
        self.std_dev = std_deviation
        self.theta = theta
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def reset(self):
        """Put the process back at ``x_initial`` (or at zeros when it is unset)."""
        self.x_prev = (
            np.zeros_like(self.mean) if self.x_initial is None else self.x_initial
        )

    def __call__(self):
        """Advance the process by one step and return the new sample."""
        # Update rule from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process.
        previous = self.x_prev
        drift = self.theta * (self.mean - previous) * self.dt
        diffusion = (
            self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        )
        sample = previous + drift + diffusion
        # Remember the sample so the next call continues from it.
        self.x_prev = sample
        return sample

In [7]:
# Exploration noise shared by every call to Agent.policy().
# The actors built by get_actor() emit 2-component actions, so the process is
# created with one dimension per action component. A length-1 process would be
# broadcast when added to the action, giving both components the identical
# perturbation at every step.
std_dev = 0.2
ou_noise = OUActionNoise(mean=np.zeros(2), std_deviation=float(std_dev) * np.ones(2))

In [41]:
class Agent:
    """Joint learner for the three simple_spread agents.

    Holds one actor (plus a target copy) per agent and a single critic (plus a
    target copy) that receives every agent's observation and action. Also owns
    the replay buffers, one set per agent, all written at the same index so a
    row describes one joint transition.

    The class relies on the notebook-level factories ``get_actor`` and
    ``get_critic`` and on their hard-coded sizes (18-dim observations,
    2-dim actions, three agents).

    Args:
        buffer_capacity: number of transitions kept per agent before the
            oldest rows are overwritten.
        batch_size: number of transitions sampled for every learning step.
    """

    # Agent names in the order the critic expects its per-agent inputs.
    AGENT_IDS = ('agent_0', 'agent_1', 'agent_2')
    # Must match the input/output sizes hard-coded in get_actor/get_critic.
    STATE_DIM = 18
    ACTION_DIM = 2

    def __init__(self, buffer_capacity=5, batch_size=3):
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        # Own list of agent names, so nothing here depends on the notebook
        # global ``env`` (whose ``agents`` list is only known to be populated
        # while an episode is running).
        self.agent_ids = list(self.AGENT_IDS)

        # Total number of record() calls so far.
        self.buffer_counter = 0

        def zero_buffers(width):
            # One independent (capacity, width) array per agent.
            return {
                agent_id: np.zeros((self.buffer_capacity, width))
                for agent_id in self.agent_ids
            }

        self.state_buffers = zero_buffers(self.STATE_DIM)
        self.action_buffers = zero_buffers(self.ACTION_DIM)
        self.next_state_buffers = zero_buffers(self.STATE_DIM)
        self.reward_buffers = zero_buffers(1)

        # Placeholders; learn() replaces every entry with a sampled tensor.
        self.state_batches = {agent_id: 0 for agent_id in self.agent_ids}
        self.action_batches = {agent_id: 0 for agent_id in self.agent_ids}
        self.next_state_batches = {agent_id: 0 for agent_id in self.agent_ids}
        self.reward_batches = {agent_id: 0 for agent_id in self.agent_ids}

        self.target_actor_models = {
            agent_id: get_actor() for agent_id in self.agent_ids
        }

        self.target_critic = get_critic()
        self.critic = get_critic()
        self.target_critic.set_weights(self.critic.get_weights())

        self.actor_models = {agent_id: get_actor() for agent_id in self.agent_ids}

        # Numbered attributes kept for code that still refers to them; they
        # are the same objects as the dictionary entries above.
        for number, agent_id in enumerate(self.agent_ids, start=1):
            suffix = str(number)
            setattr(self, 'state_buffer' + suffix, self.state_buffers[agent_id])
            setattr(self, 'action_buffer' + suffix, self.action_buffers[agent_id])
            setattr(self, 'next_state_buffer' + suffix, self.next_state_buffers[agent_id])
            setattr(self, 'reward_buffer' + suffix, self.reward_buffers[agent_id])
            setattr(self, 'target_actor' + suffix, self.target_actor_models[agent_id])
            setattr(self, 'actor' + suffix, self.actor_models[agent_id])

        self.critic_lr = 0.002
        self.actor_lr = 0.001
        self.gamma = 0.99
        self.tau = 0.005
        self.epsilon = 1
        self.episode_steps = 0
        self.sum_rewards = {agent_id: 0 for agent_id in self.agent_ids}
        self.upper_bound = 1
        self.lower_bound = -1

        self.critic_optimizer = tf.keras.optimizers.Adam(self.critic_lr)
        # One optimizer per actor: each actor has its own variable set, and a
        # single optimizer instance is not guaranteed to accept several
        # disjoint variable lists (it would also share its step counter).
        self.actor_optimizers = {
            agent_id: tf.keras.optimizers.Adam(self.actor_lr)
            for agent_id in self.agent_ids
        }
        # Backward-compatible alias for the former single shared optimizer.
        self.actor_optimizer = self.actor_optimizers[self.agent_ids[0]]

        # Start every target actor from its online actor's weights.
        for agent_id in self.agent_ids:
            self.target_actor_models[agent_id].set_weights(
                self.actor_models[agent_id].get_weights()
            )

    def record(self, obs_tuple):
        """Store one joint transition, overwriting the oldest row when full.

        Args:
            obs_tuple: sequence whose items 0..3 are the states, actions,
                rewards and next states, each indexable by agent name.
        """
        index = self.buffer_counter % self.buffer_capacity
        for agent_id in self.agent_ids:
            self.state_buffers[agent_id][index] = obs_tuple[0][agent_id]
            self.action_buffers[agent_id][index] = obs_tuple[1][agent_id]
            self.reward_buffers[agent_id][index] = obs_tuple[2][agent_id]
            self.next_state_buffers[agent_id][index] = obs_tuple[3][agent_id]
        self.buffer_counter += 1

    def learn(self):
        """Sample a batch (with replacement) and run one update() step.

        Does nothing when no transition has been recorded yet.
        """
        record_range = min(self.buffer_counter, self.buffer_capacity)
        if record_range == 0:
            # np.random.choice(0, ...) would raise; there is nothing to learn from.
            return

        # The same row indices are used for every agent so the batch stays a
        # batch of joint transitions.
        batch_indices = np.random.choice(record_range, self.batch_size)
        for agent_id in self.agent_ids:
            self.state_batches[agent_id] = tf.convert_to_tensor(
                self.state_buffers[agent_id][batch_indices], dtype=tf.float32)
            self.action_batches[agent_id] = tf.convert_to_tensor(
                self.action_buffers[agent_id][batch_indices], dtype=tf.float32)
            self.next_state_batches[agent_id] = tf.convert_to_tensor(
                self.next_state_buffers[agent_id][batch_indices], dtype=tf.float32)
            self.reward_batches[agent_id] = tf.convert_to_tensor(
                self.reward_buffers[agent_id][batch_indices], dtype=tf.float32)

        self.update()

    def update(self):
        """Update the critic, then every actor, from the current batches."""
        states = [self.state_batches[agent_id] for agent_id in self.agent_ids]
        actions = [self.action_batches[agent_id] for agent_id in self.agent_ids]
        next_states = [self.next_state_batches[agent_id] for agent_id in self.agent_ids]
        rewards = [self.reward_batches[agent_id] for agent_id in self.agent_ids]

        # --- critic target (no gradient is needed through the target nets) ---
        target_actions = [
            tf.reshape(
                self.target_actor_models[agent_id](
                    self.next_state_batches[agent_id], training=True),
                (self.batch_size, -1),
            )
            for agent_id in self.agent_ids
        ]
        # The critic is defined with six flat inputs: three states, three actions.
        target_q = self.target_critic(next_states + target_actions, training=True)
        # Stack the per-agent rewards to (n_agents, batch, 1) and broadcast
        # them against the critic output, so the loss below averages the
        # squared error over agents as well as over samples. The cast keeps
        # the arithmetic in the critic's output dtype.
        reward_stack = tf.cast(tf.stack(rewards, axis=0), target_q.dtype)
        y = reward_stack + self.gamma * target_q

        # --- critic step ---
        with tf.GradientTape() as tape:
            critic_value = self.critic(states + actions, training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
        critic_variables = self.critic.trainable_variables
        critic_grad = tape.gradient(critic_loss, critic_variables)
        self.critic_optimizer.apply_gradients(zip(critic_grad, critic_variables))

        # --- actor steps (use the critic as just updated) ---
        # Persistent because the same loss is differentiated once per actor.
        with tf.GradientTape(persistent=True) as tape:
            action_batch = [
                self.actor_models[agent_id](self.state_batches[agent_id], training=True)
                for agent_id in self.agent_ids
            ]
            critic_value = self.critic(states + action_batch, training=True)
            # Maximising the critic value == minimising its negative.
            actor_loss = -tf.math.reduce_mean(critic_value)

        for agent_id in self.agent_ids:
            actor_variables = self.actor_models[agent_id].trainable_variables
            actor_grad = tape.gradient(actor_loss, actor_variables)
            self.actor_optimizers[agent_id].apply_gradients(
                zip(actor_grad, actor_variables))
        # Release the resources held by the persistent tape.
        del tape

    def policy(self, agent, state, noise_object):
        """Return the named agent's action for one observation, with noise.

        Args:
            agent: agent name, a key of ``actor_models``.
            state: a single observation (no batch dimension).
            noise_object: zero-argument callable returning the noise to add.

        Returns:
            The noisy action clipped to ``[lower_bound, upper_bound]``.
        """
        state = tf.expand_dims(tf.convert_to_tensor(state), 0)
        sampled_actions = tf.squeeze(self.actor_models[agent](state))
        noise = noise_object()
        sampled_actions = sampled_actions.numpy() + noise
        sampled_actions = np.clip(sampled_actions, self.lower_bound, self.upper_bound)
        return sampled_actions

    @tf.function
    def update_target(self, target_weights, weights):
        """Move each target variable a fraction ``tau`` towards its online variable."""
        for (a, b) in zip(target_weights, weights):
            a.assign(b * self.tau + a * (1 - self.tau))
    

In [42]:
def get_critic():
    """Build the shared critic network.

    The model takes six inputs, in this order: three 18-dim state vectors
    followed by three 2-dim action vectors (one of each per agent). States and
    actions are concatenated separately, each passed through a 32-unit ReLU
    layer, joined, and mapped through two 256-unit ReLU layers to a single
    linear output.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    # Side effect kept from the original: this changes the *global* Keras
    # default float type, so the layers below (and anything created later,
    # until it is changed again) use float64.
    tf.keras.backend.set_floatx('float64')

    # ``shape`` must be a tuple: ``(18)`` is just the integer 18.
    state_inputs = [layers.Input(shape=(18,)) for _ in range(3)]
    action_inputs = [layers.Input(shape=(2,)) for _ in range(3)]

    state_input = layers.Concatenate()(state_inputs)
    state_output = layers.Dense(32, activation='relu')(state_input)

    action_input = layers.Concatenate()(action_inputs)
    action_output = layers.Dense(32, activation='relu')(action_input)

    concat = layers.Concatenate()([state_output, action_output])
    x = layers.Dense(256, activation='relu')(concat)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dense(1, activation='linear')(x)

    model = tf.keras.Model(state_inputs + action_inputs, x)
    return model

In [43]:
def get_actor():
    """Build one actor network: 18-dim observation -> 2-dim action in [-1, 1].

    Two 256-unit ReLU layers feed a 2-unit ``tanh`` output layer.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    # Side effect kept from the original: this changes the *global* Keras
    # default float type, so the layers below (and anything created later,
    # until it is changed again) use float64.
    tf.keras.backend.set_floatx('float64')

    # Small initial output weights keep the initial tanh pre-activations near
    # zero. The initializer was previously created but never passed to the layer.
    last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)

    # ``shape`` must be a tuple: ``(18)`` is just the integer 18.
    i = layers.Input(shape=(18,))
    x = layers.Dense(256, activation='relu')(i)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dense(2, activation='tanh', kernel_initializer=last_init)(x)
    model = tf.keras.Model(i, x)
    return model

In [44]:
# The single learner object used by the training loop: it keeps 50000
# transitions per agent and samples 64 of them per learning step.
agent1 = Agent(buffer_capacity=50000, batch_size=64)

In [45]:
# get_actor()/get_critic() set the Keras default float type to float64 while
# building the models for agent1; switch the default back to float32 for
# anything created after this cell.
tf.keras.backend.set_floatx('float32')

In [47]:
# Per-episode history: the cumulative reward dict and the step count.
ep_reward_list = []
episode_steps = []

for episode in range(1000):
    # Start a new episode and clear the per-episode counters kept on agent1.
    last_states = env.reset()
    agent1.sum_rewards = dict.fromkeys(('agent_0', 'agent_1', 'agent_2'), 0)
    agent1.episode_steps = 0

    while True:
        # Every agent acts on its own latest observation, with exploration noise.
        actions = {
            name: agent1.policy(name, last_states[name], ou_noise)
            for name in env.agents
        }
        next_states, rewards, done, info = env.step(actions)

        agent1.episode_steps += 1
        agent1.sum_rewards = {
            name: agent1.sum_rewards[name] + step_reward
            for name, step_reward in rewards.items()
        }

        # Store the joint transition, then take one learning step.
        agent1.record((last_states, actions, rewards, next_states))
        agent1.learn()

        # Move every target network slightly towards its online network:
        # the actors first, then the critic.
        for agent in rewards:
            agent1.update_target(
                agent1.target_actor_models[agent].variables,
                agent1.actor_models[agent].variables,
            )
        agent1.update_target(agent1.target_critic.variables, agent1.critic.variables)

        # The episode ends as soon as agent_0 is reported done.
        if done['agent_0']:
            break
        last_states = next_states

    print("EPISODE ",episode," STEPS = ",agent1.episode_steps)
    print("REWARD ",agent1.sum_rewards)

    ep_reward_list.append(agent1.sum_rewards)
    episode_steps.append(agent1.episode_steps)
        

EPISODE  0  STEPS =  102
REWARD  {'agent_0': -3439.9526617900597, 'agent_1': -3440.4526617900597, 'agent_2': -3439.9526617900597}
EPISODE  1  STEPS =  102
REWARD  {'agent_0': -2037.0331480191853, 'agent_1': -2037.0331480191853, 'agent_2': -2037.0331480191853}
EPISODE  2  STEPS =  102
REWARD  {'agent_0': -2062.8346428814857, 'agent_1': -2062.8346428814857, 'agent_2': -2062.8346428814857}


KeyboardInterrupt: 