In [1]:
import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm


2025-06-02 10:15:38.993741: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

In [3]:
# Create the environment
env = gym.make("CartPole-v1")

In [4]:
# Set seed for experiment reproducibility
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

In [5]:
# Small epsilon value for stabilizing division operations
eps = np.finfo(np.float32).eps.item()

The Actor and Critic will be modeled using one neural network that generates the action probabilities and Critic value respectively. This tutorial uses model subclassing to define the model.

During the forward pass, the model will take in the state as the input and will output both action probabilities and critic value \(V\), which models the state-dependent value function. The goal is to train a model that chooses actions based on a policy \(\pi\) that maximizes expected return.

Actor: Predicts the policy — a probability distribution over actions.
Critic: Predicts the value — expected future reward from a given state.
They share some common layers, then branch into two heads:

One for the policy output
One for the value function

In [6]:
class ActorCritic(tf.keras.Model):
  """Combined actor-critic network."""

  def __init__(
      self,
      num_actions: int,
      num_hidden_units: int):
    """Initialize."""
    super().__init__()

    self.common = layers.Dense(num_hidden_units, activation="relu") #A fully connected (dense) layer shared by both the actor and critic and activation="relu" introduces non-linearity.
    self.actor = layers.Dense(num_actions) #Outputs raw logits for each action (not passed through softmax here).This head is used for the actor, i.e., the policy.
    self.critic = layers.Dense(1) #Outputs a single scalar: the estimated value of the current state. This is the critic.

  def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: #inputs: A batch of states (observations).
    x = self.common(inputs) #Passes the input through the shared dense layer.
    return self.actor(x), self.critic(x) #The output x is then: Passed to the actor head → policy logits. Passed to the critic head → scalar value estimate

In [None]:
num_actions = env.action_space.n  # 2
num_hidden_units = 128

model = ActorCritic(num_actions, num_hidden_units)

NameError: name 'env' is not defined

In [8]:
num_actions


2

In [9]:
# Wrap Gym's `env.step` call as an operation in a TensorFlow function.
# This would allow it to be included in a callable TensorFlow graph.

@tf.numpy_function(Tout=[tf.float32, tf.int32, tf.int32])#Specifies the output types:tf.float32 → for the state (continuous),f.int32 → for the reward (discrete),tf.int32 → for the done flag (0 or 1)
def env_step(action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
  """Returns state, reward and done flag given an action."""

  state, reward, done, truncated, info = env.step(action) #Takes an action (from the policy).
  return (state.astype(np.float32),
          np.array(reward, np.int32), 
          np.array(done, np.int32)) 


This run_episode function simulates one episode of interaction between an agent (neural network model) and an environment, collecting data needed for actor-critic training in reinforcement learning.

Goal:
To return:
action_probs: log probabilities of selected actions → used for the actor (policy) loss.

values: critic’s value predictions at each timestep → used for the critic loss.

rewards: rewards received at each timestep → used to compute returns and advantages.

In [10]:
def run_episode(
    initial_state: tf.Tensor,
    model: tf.keras.Model,
    max_steps: int) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
  """Runs a single episode to collect training data."""

  action_probs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  rewards = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)

  initial_state_shape = initial_state.shape
  state = initial_state

  for t in tf.range(max_steps): #Each loop iteration is one timestep in the episode.
    # Convert state into a batched tensor (batch size = 1)
    state = tf.expand_dims(state, 0)

    # Run the model and to get action probabilities and critic value
    action_logits_t, value = model(state)

    # Sample next action from the action probability distribution
    action = tf.random.categorical(action_logits_t, 1)[0, 0]
    action_probs_t = tf.nn.softmax(action_logits_t)

    # Store critic values
    values = values.write(t, tf.squeeze(value))

    # Store log probability of the action chosen
    action_probs = action_probs.write(t, action_probs_t[0, action])

    # Apply action to the environment to get next state and reward
    state, reward, done = env_step(action)
    state.set_shape(initial_state_shape)

    # Store reward
    rewards = rewards.write(t, reward)

    if tf.cast(done, tf.bool):
      break


  action_probs = action_probs.stack()
  values = values.stack()
  rewards = rewards.stack()

  return action_probs, values, rewards


2. Compute the expected returns

In [11]:
def get_expected_return(
    rewards: tf.Tensor,
    gamma: float,
    standardize: bool = True) -> tf.Tensor:
  """Compute expected returns per timestep."""

  n = tf.shape(rewards)[0]
  returns = tf.TensorArray(dtype=tf.float32, size=n)

  # Start from the end of `rewards` and accumulate reward sums
  # into the `returns` array
  rewards = tf.cast(rewards[::-1], dtype=tf.float32)
  discounted_sum = tf.constant(0.0)
  discounted_sum_shape = discounted_sum.shape
  for i in tf.range(n):
    reward = rewards[i]
    discounted_sum = reward + gamma * discounted_sum
    discounted_sum.set_shape(discounted_sum_shape)
    returns = returns.write(i, discounted_sum)
  returns = returns.stack()[::-1]

  if standardize:
    returns = ((returns - tf.math.reduce_mean(returns)) /
               (tf.math.reduce_std(returns) + eps))

  return returns

3. The Actor-Critic loss

In [12]:
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)

def compute_loss(
    action_probs: tf.Tensor,
    values: tf.Tensor,
    returns: tf.Tensor) -> tf.Tensor:
  """Computes the combined Actor-Critic loss."""

  advantage = returns - values

  action_log_probs = tf.math.log(action_probs)
  actor_loss = -tf.math.reduce_sum(action_log_probs * advantage)

  critic_loss = huber_loss(values, returns)

  return actor_loss + critic_loss

4. Define the training step to update parameters

In [13]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)


@tf.function
def train_step(
    initial_state: tf.Tensor,
    model: tf.keras.Model,
    optimizer: tf.keras.optimizers.Optimizer,
    gamma: float,
    max_steps_per_episode: int) -> tf.Tensor:
  """Runs a model training step."""

  with tf.GradientTape() as tape:

    # Run the model for one episode to collect training data
    action_probs, values, rewards = run_episode(
        initial_state, model, max_steps_per_episode)

    # Calculate the expected returns
    returns = get_expected_return(rewards, gamma)

    # Convert training data to appropriate TF tensor shapes
    action_probs, values, returns = [
        tf.expand_dims(x, 1) for x in [action_probs, values, returns]]

    # Calculate the loss values to update our network
    loss = compute_loss(action_probs, values, returns)

  # Compute the gradients from the loss
  grads = tape.gradient(loss, model.trainable_variables)

  # Apply the gradients to the model's parameters
  optimizer.apply_gradients(zip(grads, model.trainable_variables))

  episode_reward = tf.math.reduce_sum(rewards)

  return episode_reward

5. Run the training loop

In [14]:
%%time

min_episodes_criterion = 100
max_episodes = 10000
max_steps_per_episode = 500

# `CartPole-v1` is considered solved if average reward is >= 475 over 500
# consecutive trials
reward_threshold = 475
running_reward = 0

# The discount factor for future rewards
gamma = 0.99

# Keep the last episodes reward
episodes_reward: collections.deque = collections.deque(maxlen=min_episodes_criterion)

t = tqdm.trange(max_episodes)
for i in t:
    # print('jay')
    initial_state= env.reset()
    initial_state = tf.constant(initial_state, dtype=tf.float32)
    episode_reward = int(train_step(
        initial_state, model, optimizer, gamma, max_steps_per_episode))

    episodes_reward.append(episode_reward)
    running_reward = statistics.mean(episodes_reward)


    t.set_postfix(
        episode_reward=episode_reward, running_reward=running_reward)

    # Show the average episode reward every 10 episodes
    if i % 10 == 0:
      pass # print(f'Episode {i}: average reward: {avg_reward}')

    if running_reward > reward_threshold and i >= min_episodes_criterion:
        break

print(f'\nSolved at episode {i}: average reward: {running_reward:.2f}!')

  0%|          | 0/10000 [00:00<?, ?it/s]2025-06-02 10:15:53.000924: W tensorflow/core/framework/op_kernel.cc:1827] INVALID_ARGUMENT: ValueError: not enough values to unpack (expected 5, got 4)
Traceback (most recent call last):

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 270, in __call__
    ret = func(*args)

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "/var/folders/kc/sg0w2qzd0z54126d1fdl7fnc0000gp/T/ipykernel_6305/3130126599.py", line 8, in env_step
    state, reward, done, truncated, info = env.step(action) #Takes an action (from the policy).

ValueError: not enough values to unpack (expected 5, got 4)


  0%|          | 0/10000 [00:03<?, ?it/s]


InvalidArgumentError: Graph execution error:

Detected at node while/PyFunc defined at (most recent call last):
  File "/usr/local/Cellar/python@3.10/3.10.16/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/local/Cellar/python@3.10/3.10.16/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/local/Cellar/python@3.10/3.10.16/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/local/Cellar/python@3.10/3.10.16/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/local/Cellar/python@3.10/3.10.16/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3077, in run_cell

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3132, in _run_cell

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3336, in run_cell_async

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3519, in run_ast_nodes

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3579, in run_code

  File "/var/folders/kc/sg0w2qzd0z54126d1fdl7fnc0000gp/T/ipykernel_6305/1788411184.py", line 1, in <module>

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2543, in run_cell_magic

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/IPython/core/magics/execution.py", line 1364, in time

  File "<timed exec>", line 21, in <module>

  File "/var/folders/kc/sg0w2qzd0z54126d1fdl7fnc0000gp/T/ipykernel_6305/131306644.py", line 16, in train_step

  File "/var/folders/kc/sg0w2qzd0z54126d1fdl7fnc0000gp/T/ipykernel_6305/13355346.py", line 14, in run_episode

  File "/var/folders/kc/sg0w2qzd0z54126d1fdl7fnc0000gp/T/ipykernel_6305/13355346.py", line 32, in run_episode

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 6, in py_function_wrapper

ValueError: not enough values to unpack (expected 5, got 4)
Traceback (most recent call last):

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 270, in __call__
    ret = func(*args)

  File "/Users/jayotsana/Documents/m1Project/RLSparse/rlEnv/lib/python3.10/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "/var/folders/kc/sg0w2qzd0z54126d1fdl7fnc0000gp/T/ipykernel_6305/3130126599.py", line 8, in env_step
    state, reward, done, truncated, info = env.step(action) #Takes an action (from the policy).

ValueError: not enough values to unpack (expected 5, got 4)


	 [[{{node while/PyFunc}}]] [Op:__inference_train_step_1966]

Visualization

In [15]:
# Render an episode and save as a GIF file

from IPython import display as ipythondisplay
from PIL import Image

render_env = gym.make("CartPole-v1",render_mode='rgb_array')

def render_episode(env: gym.Env, model: tf.keras.Model, max_steps: int):
  state, info = env.reset()
  state = tf.constant(state, dtype=tf.float32)
  screen = env.render()
  images = [Image.fromarray(screen)]

  for i in range(1, max_steps + 1):
    state = tf.expand_dims(state, 0)
    action_probs, _ = model(state)
    action = np.argmax(np.squeeze(action_probs))

    state, reward, done, truncated, info = env.step(action)
    state = tf.constant(state, dtype=tf.float32)

    # Render screen every 10 steps
    if i % 10 == 0:
      screen = env.render()
      images.append(Image.fromarray(screen))

    if done:
      break

  return images


# Save GIF image
images = render_episode(render_env, model, max_steps_per_episode)
image_file = 'cartpole-v1.gif'
# loop=0: loop forever, duration=1: play each frame for 1ms
images[0].save(
    image_file, save_all=True, append_images=images[1:], loop=0, duration=1)

TypeError: CartPoleEnv.__init__() got an unexpected keyword argument 'render_mode'