In [1]:
import matplotlib.pyplot as plt
import numpy as np 
import tensorflow as tf 

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel

 

Create the environment object with `file_name=None`. After you run this cell, click the Play button in the Unity Editor so that the editor connects to this notebook and the agent in the environment can be controlled from here.

In [2]:

# Create the side channel used to send engine settings to Unity, then create
# the environment object.
# NOTE(review): the original comment called this a non-blocking call that only
# loads the environment; with file_name=None it presumably waits for the
# Unity Editor to connect — confirm.
channel = EngineConfigurationChannel()

env = UnityEnvironment(file_name=None,  seed=1, side_channels=[channel])

channel.set_configuration_parameters(time_scale = 4.0) # Speeds up the simulation (time_scale is the speed-up factor)
# Start interacting with the environment.
env.reset()
# Keys of the behavior specs, i.e. the behavior names known after the reset.
behavior_names = env.behavior_specs.keys()



In [3]:
# Iterating the behavior-spec mapping yields its keys (the behavior names).
list(env.behavior_specs)

['CARAI?team=0']

The behavior name is needed to assign actions to an agent.

In [None]:
# Only the first behavior reported by the environment is used in this notebook.
behavior_name = list(env.behavior_specs)[0] 
print(f"Name of the behavior : {behavior_name}")
# Look up the spec of that behavior; later cells read its observation shapes
# and action description.
spec = env.behavior_specs[behavior_name]
# Last expression of the cell, so the notebook displays the spec.
spec

In [5]:
# How many separate observation arrays does each agent receive?
print("Number of observations : ", len(spec.observation_shapes))

# A visual observation has three dimensions (height, width, channels), so
# check whether any observation shape has exactly three entries.
vis_obs = 3 in map(len, spec.observation_shapes)
print("Is there a visual observation ?", vis_obs)

Number of observations :  2
Is there a visual observation ? False


In [6]:
# Report the kind of action space (continuous and/or discrete), checking the
# two predicates in the same order as before.
for is_kind, message in (
    (spec.is_action_continuous, "The action is continuous"),
    (spec.is_action_discrete, "The action is discrete"),
):
    if is_kind():
        print(message)

# Number of actions the behavior expects.
print(f"There are {spec.action_size} action(s)")

# Discrete actions only: list how many options each branch offers.
if spec.is_action_discrete():
    branches = spec.discrete_action_branches
    for action, branch_size in enumerate(branches):
        print(f"Action number {action} has {branch_size} different options")
    


The action is continuous
There are 2 action(s)


In [7]:
""" 
This cell only shows the raw output. For a single agent there are two observation arrays:
the first holds the information provided by the raycast and the second holds the inputs we provide in the script.
That is why we later use numpy.hstack() to merge them into a single observation.
"""
# Reset the environment and fetch the current step data for our behavior.
env.reset()
decision_steps, terminal_steps = env.get_steps(behavior_name)
# Display the list of observation arrays alongside the terminal-steps object.
decision_steps.obs, terminal_steps

([array([[1.       , 1.       , 0.       , 0.2348219, 0.       , 0.2639548]],
        dtype=float32),
  array([[-6.1960000e-01,  0.0000000e+00,  2.1900000e-01, -8.8757544e-05,
          -3.3263044e-04]], dtype=float32)],
 <mlagents_envs.base_env.TerminalSteps at 0x1b5bea10808>)

The next cell is only a demo: it runs 50 episodes with random actions and prints the total reward of each episode.

In [None]:
# Demo: play 50 episodes with random actions and report the reward collected
# by one tracked agent in each episode.
for episode in range(50):
    env.reset()
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    print(len(decision_steps), len(terminal_steps))

    # Per-episode state of the single agent we follow.
    tracked_agent = -1  # -1 means no agent has been picked yet
    episode_rewards = 0
    done = False
    while not done:
        # Lock onto the first agent that requests a decision.
        # len(decision_steps) is the number of agents requesting a decision.
        if tracked_agent == -1 and len(decision_steps) > 0:
            tracked_agent = decision_steps.agent_id[0]
        print(list(decision_steps), list(terminal_steps), tracked_agent)

        # One random action per agent that requested a decision.
        action = spec.create_random_action(len(decision_steps))
        env.set_actions(behavior_name, action)

        # Advance the simulation and read back the new step data.
        env.step()
        decision_steps, terminal_steps = env.get_steps(behavior_name)

        # Reward received while the agent is still requesting decisions.
        if tracked_agent in decision_steps:
            episode_rewards += decision_steps[tracked_agent].reward
        # The episode is over once the tracked agent shows up as terminated.
        done = tracked_agent in terminal_steps
        if done:
            episode_rewards += terminal_steps[tracked_agent].reward
    print(f"Total rewards for episode {episode} is {episode_rewards}")


Our TensorFlow models: an actor for continuous actions that outputs the mean and standard deviation of a normal distribution, and a critic with a single linear output.

In [25]:
class Continuous(tf.keras.Model):
    """Actor network that outputs the parameters of a normal distribution.

    Two ReLU hidden layers feed two heads: ``mu`` (tanh activation) and
    ``sig`` (softplus activation, plus a small constant so the scale is never
    exactly zero).

    Args:
        action_size: Number of units in each head, i.e. the number of action
            dimensions. Defaults to 2, the value that was previously hard-coded.
        hidden_units: Width of each hidden layer. Defaults to 128.
        min_sigma: Constant added to the softplus output. Defaults to 0.00001.
    """

    def __init__(self, action_size=2, hidden_units=128, min_sigma=0.00001):
        super().__init__()
        self.min_sigma = min_sigma
        self.dense1 = tf.keras.layers.Dense(hidden_units, activation="relu")
        self.dense2 = tf.keras.layers.Dense(hidden_units, activation="relu")
        self.mu = tf.keras.layers.Dense(action_size, activation="tanh")
        self.sig = tf.keras.layers.Dense(action_size, activation="softplus")

    def call(self, inputs):
        """Run the network on a batch of observations.

        Args:
            inputs: Batch of observations fed to the first dense layer.

        Returns:
            A single tensor stacking ``[mu, sig]`` along a new leading axis, so
            index 0 is the mean and index 1 is the standard deviation.
        """
        hidden = self.dense2(self.dense1(inputs))
        mu = self.mu(hidden)
        sig = self.sig(hidden)

        # tf.squeeze drops EVERY size-1 axis: a batch of one observation loses
        # its batch axis, and with action_size == 1 the action axis would be
        # dropped as well, so callers must not rely on a fixed rank.
        mu = tf.squeeze(mu)
        sig = tf.squeeze(sig) + self.min_sigma
        return tf.convert_to_tensor([mu, sig])


# Actor: the policy network defined by the Continuous class.
modelA = Continuous()

# Critic: two ReLU hidden layers followed by a single linear output,
# trained with mean-squared error.
modelV = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(1, activation="linear"),
])
modelV.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(0.003))


Run the next cell only if you want to resume training from a previously saved model.

In [None]:
# Optional: replace the freshly built actor and critic with the models stored
# under 'Car/actor' and 'Car/critic' (the paths written by the save cell at the
# end of this notebook). Running this cell overwrites modelA and modelV.
modelA = tf.keras.models.load_model('Car/actor')
modelV = tf.keras.models.load_model('Car/critic')

In [10]:


class ActorCritic:
    """One-step actor-critic training loop for a Unity ML-Agents behavior.

    The actor returns the mean and standard deviation of a normal
    distribution (index 0 and 1 of its output); an action is sampled from it,
    clipped to [-1, 1] and sent to the environment. After each environment
    step the critic is fitted to the one-step target
    ``reward + gamma * V(next_observation)`` and the actor is updated with the
    advantage-weighted log-probability of the action it took.

    Args:
        state: Stored on the instance as ``self.state``; not read by this class.
        env: Environment exposing ``reset``, ``get_steps``, ``set_actions``,
            ``step`` and ``behavior_specs``.
        modelActor: Actor model; ``modelActor(obs)[0]`` is used as the mean
            and ``modelActor(obs)[1]`` as the scale of the action distribution.
        modelV: Compiled Keras critic model.
        episodes: Number of training episodes run by ``record``.
        gamma: Discount factor of the one-step target.
        behavior_name: Behavior to control. When ``None`` (default) the first
            behavior listed in ``env.behavior_specs`` after a reset is used.
    """

    def __init__(self, state, env, modelActor, modelV, episodes=300, gamma=0.99,
                 behavior_name=None):
        self.state = state
        self.env = env
        self.episodes = episodes
        self.modelActor = modelActor
        self.modelV = modelV
        self.gamma = gamma
        self.behavior_name = behavior_name
        self.lr = 0.0007
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)

    def _decay_learning_rate(self):
        """Multiply the learning rate by 0.99 and rebuild both optimizers."""
        self.lr = self.lr * 0.99
        # NOTE(review): building new Adam instances discards their accumulated
        # state, and recompiling the critic replaces whatever learning rate it
        # was compiled with by the actor's rate. This schedule is kept exactly
        # as it was; confirm that both effects are intended.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)
        self.modelV.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(self.lr))

    def record(self):
        """Train for ``self.episodes`` episodes, then plot reward per episode.

        Prints the cumulative reward of the tracked agent after every episode.
        """
        scores, episodes = [], []
        for i in range(self.episodes):
            # Decay every 20 episodes; this also fires on the very first one.
            if i % 20 == 0:
                self._decay_learning_rate()

            # Use the environment and critic handed to __init__ rather than
            # whatever notebook globals happen to carry the same names.
            self.env.reset()
            behavior_name = self.behavior_name
            if behavior_name is None:
                behavior_name = list(self.env.behavior_specs)[0]
            decision_steps, terminal_steps = self.env.get_steps(behavior_name)

            tracked_agent = -1  # -1 indicates not yet tracking
            done = False  # For the tracked_agent
            episode_rewards = 0  # For the tracked_agent
            reward = 0
            while not done:
                # Track the first agent that requests a decision; give up on
                # the episode when there is none to track.
                if tracked_agent == -1:
                    if len(decision_steps) == 0:
                        break
                    tracked_agent = decision_steps.agent_id[0]

                # Merge the two observation arrays into one input per agent.
                observation = np.hstack((decision_steps.obs[0], decision_steps.obs[1]))
                with tf.GradientTape() as tape:
                    logits = self.modelActor(observation)
                    norm = tf.compat.v1.distributions.Normal(loc=logits[0], scale=logits[1])
                    sample = norm.sample([1])
                    # The sample is a differentiable function of loc and scale.
                    # Without stop_gradient, the gradient of log_prob w.r.t. the
                    # mean cancels out for every unclipped sample, so the action
                    # taken must be treated as a constant.
                    decision = tf.stop_gradient(tf.clip_by_value(sample, -1.0, 1.0))
                    action = tf.squeeze(decision).numpy().reshape(-1, 2)

                    # Set the actions and move the simulation forward.
                    self.env.set_actions(behavior_name, action)
                    self.env.step()

                    # Get the new simulation results.
                    decision_steps, terminal_steps = self.env.get_steps(behavior_name)

                    in_decision = tracked_agent in decision_steps
                    in_terminal = tracked_agent in terminal_steps
                    if not (in_decision or in_terminal):
                        # No fresh reward or observation for the tracked agent:
                        # skip the update instead of reusing stale (or not yet
                        # assigned) values.
                        continue
                    if in_decision:  # The agent requested a decision
                        reward = decision_steps[tracked_agent].reward
                        episode_rewards += decision_steps[tracked_agent].reward
                        newObservation = np.hstack((decision_steps.obs[0], decision_steps.obs[1]))
                    if in_terminal:  # The agent terminated its episode
                        reward = terminal_steps[tracked_agent].reward
                        episode_rewards += terminal_steps[tracked_agent].reward
                        newObservation = np.hstack((terminal_steps.obs[0], terminal_steps.obs[1]))
                        done = True

                    prob = norm.log_prob(decision)
                    # NOTE(review): the entropy is scaled by 0.01 here and by
                    # 0.01 again in the loss (effective weight 1e-4); kept as
                    # in the original, confirm this is intended.
                    entropy = norm.entropy() * 0.01

                    value = self.modelV(observation)
                    value_ = self.modelV(newObservation)
                    # One-step target; (1 - done) removes the bootstrap term
                    # on the terminal transition.
                    delta = reward + self.gamma * value_ * (1 - done)
                    advantage = delta - value
                    loss = - (prob * advantage + 0.01 * entropy)
                # Gradients are taken w.r.t. the actor variables only; the
                # critic is trained separately by fitting it to the target.
                grads = tape.gradient(loss, self.modelActor.trainable_variables)
                self.modelV.fit(observation, delta, epochs=1, batch_size=1, verbose=0)
                self.optimizer.apply_gradients(zip(grads, self.modelActor.trainable_variables))

            scores.append(episode_rewards)
            episodes.append(i)
            print("Episode: {}, Cumulative reward: {:0.2f}".format(i, episode_rewards))

        plt.plot(episodes, scores, 'b')
        plt.show()

# (Re)compile the critic with MSE loss and Adam at learning rate 0.003, so the
# critic is compiled even when it was replaced by a model loaded from disk.
modelV.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(0.003))

# Build the trainer around the live environment and the actor/critic models.
# `state` is only stored on the instance; episodes and gamma keep their defaults.
obj = ActorCritic(state=[3], env=env, modelActor=modelA, modelV=modelV)


In [None]:

# Run the training loop; prints the cumulative reward of every episode and
# plots the reward curve once all episodes are finished.
obj.record() 
 


In [None]:
# Close the environment once we are done interacting with it.
env.close()

In [None]:
# Save the actor and critic under 'Car/actor' and 'Car/critic', the same paths
# the resume cell loads from.
modelA.save('Car/actor') 
modelV.save('Car/critic') 