In [1]:
import numpy as np
import tensorflow as tf
import gym
import matplotlib.pyplot as plt

In [2]:
env_name = 'CartPole-v1'

env = gym.make(env_name)

print(f'env : {env}')
state_shape, action_shape = env.observation_space.shape, env.action_space.shape
print('State shape: {}'.format(state_shape))
print('Action shape: {}'.format(action_shape))
print(f'action space {env.action_space} observation space : {env.observation_space}')
state_dim = env.observation_space.shape[0]
n_actions = env.action_space.n
print(state_dim)
print(n_actions)

env : <TimeLimit<CartPoleEnv<CartPole-v1>>>
State shape: (4,)
Action shape: ()
action space Discrete(2) observation space : Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
4
2


In [4]:
class PolicyNet(tf.keras.Model):
    def __init__(self, n_actions):
        super(PolicyNet,self).__init__()
        self.fc1 = tf.keras.layers.Dense(128,activation='relu')
        self.fc2 = tf.keras.layers.Dense(64,activation='relu')
        self.fc3 = tf.keras.layers.Dense(n_actions)
    def call(self, state):
        x = self.fc1(state)
        x = self.fc2(x)
        out = self.fc3(x)
        return out

In [5]:
class ReinforceAgent:
    def __init__(self, env, state_dim, n_actions, lr = 0.001, gamma = 0.99):
        self.gamma = 0.99
        self.n_actions = n_actions
        self.state_dim = state_dim
        self.env = env
        self.network = PolicyNet(self.n_actions)
        state = env.reset()
        qs = self.network(state[np.newaxis])
        self.optimizer1 = tf.keras.optimizers.Adam(learning_rate=lr)

    def reward_to_go(self,rewards):
        count = len(rewards)
        rtg = [0]*count

        rtg[count-1] = rewards[count-1]

        for i in range(count-2,-1,-1):
            rtg[i] = rewards[i] + self.gamma * rtg[i+1]
        return np.array(rtg,np.float32)
    
    def generate_trajectory(self, max_steps=500):
        states, actions, rewards = [], [], []
        state = self.env.reset()        
        #generate n_steps of trajectory:
        for t in range(max_steps):
            logits = self.network(state[np.newaxis])
            action_probs = tf.nn.softmax(logits, axis=-1).numpy()            
            action = np.random.choice(n_actions, p=action_probs[0])
            next_state, reward, done, _ = env.step(action)
            
            #update arrays
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            
            state = next_state
            if done:
                break
        
        return np.array(states,np.float32), np.array(actions,np.int32), np.array(rewards, np.float32)
        
    def train_one_episode(self, ent_coeff=0.001):
        states, actions, rewards = self.generate_trajectory()
 
        reward_to_go = self.reward_to_go(rewards)

        with tf.GradientTape() as tape:
            
            logits = self.network(states)
            logits = tf.squeeze(logits)

            probs = tf.nn.softmax(logits, -1)
            log_probs = tf.nn.log_softmax(logits, -1)

            row_indices= tf.range(actions.shape[0])
            indices = tf.transpose([row_indices, actions])
            log_probs_for_actions = tf.gather_nd(log_probs, indices)

            obj = tf.reduce_mean(log_probs_for_actions*reward_to_go)
            ent_loss = -tf.reduce_mean(tf.reduce_sum(probs*log_probs, -1))

            loss = -(obj+ent_coeff*ent_loss)

        grads = tape.gradient(loss, self.network.trainable_variables)
        self.optimizer1.apply_gradients(zip(grads, self.network.trainable_variables))
        
        return np.sum(rewards) 

    

In [6]:
total_rewards = []
total_loss = []
ra = ReinforceAgent(env, state_dim, n_actions)

for episode in range(10000):
    reward = ra.train_one_episode() 
   
    total_rewards.append(reward)

    if episode != 0 and episode % 100 == 0:
        mean_reward = np.mean(total_rewards[-100:-1])
        print(f'mean reward after {episode} episodes :{mean_reward}')
        if mean_reward > 250:
            break
env.close()



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

mean reward after 100 episodes :32.858585357666016
mean reward after 200 episodes :96.56565856933594
mean reward after 300 episodes :234.14141845703125
mean reward after 400 episodes :230.080810546875
mean reward after 500 episodes :371.7878723144531
