In [4]:
import tensorflow as tf 
import tensorflow_probability as tfp 
from keras import layers
from keras.models import Model 
from keras.optimizers import Adam
import gym

In [5]:
class ActorCritic(tf.keras.Model):
  def __init__(self, num_actions=2, num_hidden_units=1024):
    super(ActorCritic, self).__init__() 
    self.num_actions = num_actions
    self.shared_1 = layers.Dense(num_hidden_units, activation = "relu")
    self.shared_2 = layers.Dense(num_hidden_units, activation = "relu")
    self.actor = layers.Dense(num_actions, "softmax")
    self.critic = layers.Dense(1, None)

  def call(self, input_obs):
    x = self.shared_1(input_obs)
    x = self.shared_2(x)
    v = self.critic(x)
    pol = self.actor(v)
    return v, pol


In [20]:
class Agent():
  def __init__(self, num_actions=2, alpha = 0.001, gamma = 0.99):
    self.gamma = gamma
    self.aplha = alpha 
    self.num_actions = num_actions 
    self.action = None
    self.network = ActorCritic(num_actions=self.num_actions)
    self.network.compile(optimizer = Adam(learning_rate = alpha))
    self.optimizer = Adam(learning_rate = alpha)
  def choose_action(self, obs): 
    tf_obs = tf.convert_to_tensor([obs])
    _, action_logits = self.network.call(tf_obs)
    discrete_dist = tfp.distributions.Categorical(logits=action_logits)
    action = discrete_dist.sample() #This returns a tensor, 
    self.action = action
    return action.numpy()[0]
  
  def save_models(self):
    print(">>>>savingModel<<<<")
    self.network.save_weights("actorcritic.h5")

  def load_models(self):
    print(">>>>Load Model<<<<")
    self.network.load_weights("actorcritic.h5")
  
  def learn(self, state, reward, next_state, done):
    tf_state = tf.convert_to_tensor([state], dtype = tf.float32)
    tf_reward = tf.convert_to_tensor(reward, dtype = tf.float32)
    tf_next_state = tf.convert_to_tensor([next_state], dtype = tf.float32)

    with tf.GradientTape() as tape:
      state_val, action_logits = self.network.call(tf_state)
      next_state_val, next_action_logits = self.network.call(tf_next_state)

      state_val = tf.squeeze(state_val)
      next_state_val = tf.squeeze(next_state_val)
      delta = tf_reward + self.gamma*next_state_val*(1-int(done)) - state_val  #int(done) == 0 when the round is not done

      loss_critic = delta**2 
      discrete_dist = tfp.distributions.Categorical(logits=action_logits)
      log_prob = discrete_dist.log_prob(self.action)

      loss_actor = -log_prob*delta
      total_loss = loss_actor + loss_critic
      grads = tape.gradient(total_loss, self.network.trainable_variables)
      self.optimizer.apply_gradients(zip(grads, self.network.trainable_variables))

In [7]:
env = gym.make('CartPole-v0')


  logger.warn(
  deprecation(
  deprecation(


In [None]:
eps = 2000 
agent = Agent()
for i in range(eps):
  state = env.reset()
  done = False
  total_reward = 0
  while not done:
    action = agent.choose_action(state)
    next_state, reward, done, _ = env.step(action)
    total_reward += reward
    agent.learn(state, reward, next_state, done)
    state = next_state

  if i%20 == 0: 
    print(total_reward)

19.0
60.0
18.0
22.0
16.0
23.0
19.0
17.0
30.0
16.0
11.0
