In [1]:
import random
import numpy as np
import tensorflow as tf
import gym
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
env = gym.make('CartPole-v0')

print(f'env : {env}')
state_shape, action_shape = env.observation_space.shape, env.action_space.shape
print(f'State shape: {state_shape}')
print(f'Action shape: {action_shape}')
print(f'action space {env.action_space} observation space : {env.observation_space}')
state_dim = env.observation_space.shape[0]
n_actions = env.action_space.n
print(state_dim)
print(n_actions)

env : <TimeLimit<CartPoleEnv<CartPole-v0>>>
State shape: (4,)
Action shape: ()
action space Discrete(2) observation space : Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
4
2


In [3]:
class ActorCriticNet(tf.keras.Model):
    def __init__(self, state_dim, n_actions):
        super().__init__()
        self.fc1 = tf.keras.layers.Dense(256,activation='relu')
        self.fc2 = tf.keras.layers.Dense(64,activation='relu')
        self.fc_act = tf.keras.layers.Dense(n_actions,activation='linear')

        self.fc_v = tf.keras.layers.Dense(1, activation = 'linear')

    def call(self,state):
        x = self.fc1(state)
        x = self.fc2(x)

        logits_act = self.fc_act(x)
        val = self.fc_v(x)

        return tf.squeeze(logits_act), tf.squeeze(val)

In [4]:
class ActorCriticAgent:
    def __init__(self, env, state_dim, n_actions, learning_rate = 0.0001, gamma = 0.99):
        self.gamma = 0.99
        self.state_dim = state_dim
        self.n_actions = n_actions
        self.optimizer = tf.keras.optimizers.Adam(learning_rate)#learning_rate=0.001)
        
        self.ac_net = ActorCriticNet(state_dim,n_actions)
        s = env.reset()
        _,_ = self.ac_net(s[np.newaxis])
       # print(f'logits : {l} val : {v}')

    def train_step(self, env, state, ent_coef = 1e-2):

        with tf.GradientTape() as tape1:
            logits, val = self.ac_net(state[np.newaxis])

            prob = tf.nn.softmax(logits, axis = -1)
            log_prob = tf.nn.log_softmax(logits, axis = -1)

            action = np.random.choice(n_actions, p=np.array(prob))

            log_prob_action = log_prob[action]
            #print(f'logits : {logits}  val = : {val}  prob = {prob}  action : {action} log_prob : {log_prob} log_prob_actions : {log_prob_action}')

            next_state, reward, done, _ = env.step(action)

            #print(state, next_state, reward, done)

            _, next_state_val = self.ac_net(next_state[np.newaxis])
            target = reward + next_state_val
            advantage = target - val

            objective = tf.reduce_mean(log_prob_action*advantage)
            #entropy = -tf.reduce_mean(tf.reduce_sum(prob*log_prob, axis = -1))

            loss_actor = -(objective) # +  (entropy*ent_coef))
            loss_critic = tf.reduce_mean((target - val)**2)

            total_loss = loss_actor + loss_critic

           # print(f' target = {target} advantage : {advantage} objective : {objective} entropy : {entropy}  loss actor = {loss_actor} loss critic : {loss_critic} total loss : {total_loss}')
        grads = tape1.gradient(total_loss, self.ac_net.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.ac_net.trainable_variables))

        return next_state, reward, done

    def train_episode(self,env, n_steps=200):
        total_reward = 0
        state  = env.reset()
        for i in range(n_steps):
            state, r, done = self.train_step(env,state)
            total_reward += r
            if done:
                break
        return total_reward

In [5]:
total_rewards = []
aca = ActorCriticAgent(env, state_dim, n_actions)
for episode in range(1000):
        #states, actions, rewards = generate_trajectory(env)
    episode_reward = aca.train_episode(env)  
    total_rewards.append(episode_reward)
    #print(f'count {episode} : {episode_reward}')
    if episode != 0 and episode % 50 == 0:
        mean_reward = np.mean(total_rewards[-50:-1])
        print("mean reward:%.3f" % (mean_reward))
        if mean_reward > 200:
            break
env.close()



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

mean reward:24.490
mean reward:42.714
mean reward:70.449
mean reward:94.429
mean reward:109.327
mean reward:119.857
mean reward:141.143
mean reward:130.122
mean reward:101.694
mean reward:177.939
mean reward:186.735
mean reward:191.245
mean reward:191.531
mean reward:195.082
mean reward:197.959
mean reward:191.184
mean reward:197.592
mean reward:197.143


KeyboardInterrupt: 