In [61]:
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras import Model
import gym
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import convolve, gaussian

import os
import io
import base64
import time
import glob
from IPython.display import HTML

%matplotlib inline

In [62]:
env = gym.make('CartPole-v0')

print(f'env : {env}')
state_shape, action_shape = env.observation_space.shape, env.action_space.shape
print(f'State shape: {state_shape}')
print(f'Action shape: {action_shape}')
print(f'action space {env.action_space} observation space : {env.observation_space}')
state_dim = env.observation_space.shape[0]
n_actions = env.action_space.n
print(state_dim)
print(n_actions)

env : <TimeLimit<CartPoleEnv<CartPole-v0>>>
State shape: (4,)
Action shape: ()
action space Discrete(2) observation space : Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
4
2


In [63]:
class ActorCriticNet(tf.keras.Model):
    def __init__(self, state_dim, n_actions):
        super().__init__()
        self.fc1 = tf.keras.layers.Dense(256,activation='relu')
        self.fc2 = tf.keras.layers.Dense(64,activation='relu')
        self.fc_act = tf.keras.layers.Dense(n_actions,activation='linear')

        self.fc_v = tf.keras.layers.Dense(1, activation = 'linear')

    def call(self,state):
        x = self.fc1(state)
        x = self.fc2(x)

        logits_act = self.fc_act(x)
        val = self.fc_v(x)

        return tf.squeeze(logits_act), tf.squeeze(val)

In [66]:
class ACAgent:
    def __init__(self, env, state_dim, n_actions, learning_rate=0.001, max_steps = 300, gamma = 0.99):
        self.env = env
        self.gamma = gamma
        self.state_dim = state_dim
        self.n_actions = n_actions
        self.max_steps = max_steps
        self.optimizer = tf.keras.optimizers.Adam(learning_rate)
       
        self.ac_net = ActorCriticNet(state_dim,n_actions)
        s = env.reset()
        _,_ = self.ac_net(s[np.newaxis])
    
    def generate_trajectory(self):
        states, actions, rewards, next_states = [], [], [], []
        state = self.env.reset()        
        #generate n_steps of trajectory:
        for t in range(self.max_steps):
            logits, _ = self.ac_net(state[np.newaxis])
            
            action_probs = tf.nn.softmax(logits, axis=-1).numpy()   
            #print(f'action_probs: {action_probs}')         
            action = np.random.choice(n_actions, p=action_probs)
            next_state, reward, done, _ = env.step(action)
            
            #update arrays
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            
            state = next_state
            if done:
                break
        
        return np.array(states,np.float32), np.array(actions,np.int32), np.array(rewards, np.float32), np.array(next_states, np.float32)

    def train_step(self, gamma = 0.99, ent_coef = 1e-2):

        states, actions, rewards, next_states = self.generate_trajectory()
        #print(f'states : {states} next states : {next_states}')
        rtg = self.reward_to_go(rewards)


        with tf.GradientTape() as tape1: #, tf.GradientTape() as tape2:
            logits, vals = self.ac_net(states)

            probs = tf.nn.softmax(logits, axis = -1)
            log_probs = tf.nn.log_softmax(logits, axis = -1)

            row_indices = tf.range(len(actions))
            indices = tf.transpose([row_indices,actions])
            log_prob_actions = tf.gather_nd(log_probs, indices)

            _, next_vals = self.ac_net(next_states)

            targets = rewards + self.gamma * next_vals
            advantage = vals - targets
           
            objective = tf.reduce_mean(log_prob_actions*advantage)
            #entropy = -tf.reduce_mean(tf.reduce_sum(probs*log_probs, axis = -1))

            loss_actor = -(objective) #  +  (entropy*ent_coef))
           
            total_loss = loss_actor #+ loss_critic

           # print(f' target = {target} advantage : {advantage} objective : {objective} entropy : {entropy}  loss actor = {loss_actor} loss critic : {loss_critic} total loss : {total_loss}')
        grads = tape1.gradient(total_loss, self.ac_net.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.ac_net.trainable_variables))

       
        return total_loss, np.sum(rewards) #next_state, reward, done


    def train_episode(self,env, n_steps=200):
        total_reward = 0
        state  = env.reset()
        for i in range(n_steps):
            state, r, done = self.train_step(env,state)
            total_reward += r
            if done:
                break
        return total_reward

In [67]:
total_rewards = []
#aca = ActorCriticAgent(env, state_dim, n_actions)
aca = ACAgent(env,state_dim,n_actions)
for episode in range(1000):
        #states, actions, rewards = generate_trajectory(env)
    loss, episode_reward = aca.train_step()
    #episode_reward = aca.train_episode(env)  
    total_rewards.append(episode_reward)
    #print(f'count {episode} : {episode_reward}')
    if episode != 0 and episode % 50 == 0:
        mean_reward = np.mean(total_rewards[-50:-1])
        print(f'episode : {episode}')
        print("mean reward:%.3f" % (mean_reward))
        if mean_reward > 200:
            break
env.close()



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

episode : 50
mean reward:23.755
episode : 100
mean reward:43.959
episode : 150
mean reward:38.306
episode : 200
mean reward:40.204
episode : 250
mean reward:59.980
episode : 300
mean reward:75.592
episode : 350
mean reward:75.082
episode : 400
mean reward:102.714
episode : 450
mean reward:128.694
episode : 500
mean reward:128.245
episode : 550
mean reward:131.735
episode : 600
mean reward:131.367
episode : 650
mean reward:146.265
episode : 700
mean reward:132.592
episode : 750
mean reward:114.000
episode : 800
mean reward:111.694
episode : 850
mean reward:129.939
episode : 900
mean reward:187.878
episode : 950
mean reward:173.122
