In [1]:
import numpy as np

In [2]:
import gymnasium as gym

The classic cart pole problem is an introduction to Reinforcement Learning. \
An agent learns to balance a pole on a cart by moving it left or right 

In [3]:
env = gym.make('CartPole-v1', render_mode=None)
env.reset(seed=42)

(array([ 0.0273956 , -0.00611216,  0.03585979,  0.0197368 ], dtype=float32),
 {})

The observation space includes the cart position and velocity. \
It also includes the pole's angle (in radians) and velocity

In [4]:
print(f"Observation space: {env.observation_space}")

Observation space: Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)


The agent has an action space which specifies what it can do \
It can move left or right \
to balance the angle and velocity of the pole.

In [5]:
print(f"Action space: {env.action_space}")

Action space: Discrete(2)


## The Environment

Here, we let the agent move 100 times \
At the end, we print its final position, velocity and the pole's angle and velocity 

In [6]:
# min_cart_vel, max_cart_vel = float('inf'), float('-inf')
# min_pole_vel, max_pole_vel = float('inf'), float('-inf')

# for _ in range(1000):  # Run 1000 random steps
#     action = env.action_space.sample()
#     observation, _, done, _, _ = env.step(action)
    
#     cart_vel, pole_vel = observation[1], observation[3]
#     min_cart_vel = min(min_cart_vel, cart_vel)
#     max_cart_vel = max(max_cart_vel, cart_vel)
#     min_pole_vel = min(min_pole_vel, pole_vel)
#     max_pole_vel = max(max_pole_vel, pole_vel)
    
#     if done:
#         observation, _ = env.reset()

# print(f"Cart velocity range: ({min_cart_vel:.2f}, {max_cart_vel:.2f})")
# print(f"Pole velocity range: ({min_pole_vel:.2f}, {max_pole_vel:.2f})")

## The State Discretization

The action state originally involves continuous values. \
We will turn those into discrete values to limit the number of actions an agent can take \
It will make learning easier.

In [6]:
n_buckets = (5, 8, 10, 5) # cart position, cart velocity, pole angle, pole velocity

state_bounds = list(zip(env.observation_space.low, env.observation_space.high))
state_bounds[1] = (-1.5, 1.5) # cart velocity bounds
state_bounds[3] = (-2.0, 2.0) # pole velocity bounds

In [10]:
state_bounds

[(np.float32(-4.8), np.float32(4.8)),
 (-1.5, 1.5),
 (np.float32(-0.41887903), np.float32(0.41887903)),
 (-2.0, 2.0)]

In [14]:
env.unwrapped.spec

EnvSpec(id='CartPole-v1', entry_point='gymnasium.envs.classic_control.cartpole:CartPoleEnv', reward_threshold=475.0, nondeterministic=False, max_episode_steps=None, order_enforce=False, disable_env_checker=True, kwargs={'render_mode': None}, namespace=None, name='CartPole', version=1, additional_wrappers=(), vector_entry_point='gymnasium.envs.classic_control.cartpole:CartPoleVectorEnv')

In [None]:
def discretize_state(observation):
    """
    Convert a continuous state to its discrete counterpart
    """
    discretized = []
    for i, (lower, upper) in enumerate(state_bounds):
        if upper == float('inf'):
            upper = env.unwrapped.spec.kwargs.get('thresholds', [2.4, 2.4, 0.418, 0.418])[i]
        if lower == float('-inf'):
            lower = -upper
        
        scaling = (n_buckets[i] - 1) / (upper - lower)
        new_obs = int(
            np.floor(scaling * (observation[i] - lower))
            )
        new_obs = min(n_buckets[i] - 1, max(0, new_obs))
        discretized.append(new_obs)
    return tuple(discretized)

## Q-Table initialization

In [9]:
def create_q_table():
    """
    Create and initialize Q-table with zeros
    """
    q_table_shape = n_buckets + (env.action_space.n,)
    q_table = np.zeros(q_table_shape)
    return q_table

## Epsilon-greedy action selection

In [10]:
def select_action(state, q_table, epsilon):
    """
    Select an action using epsilon-greedy policy
    """
    if np.random.random() < epsilon:
        # explore: select a random action
        return env.action_space.sample()
    else:
        # exploit: select action with the highest q-value
        return np.argmax(q_table[state])

## Q-learning update rule

In [11]:
def update_q_value(state, action, reward, next_state, q_table, alpha, gamma):
    """
    Update Q-value for a state-action pair
    """
    next_max_q = np.max(q_table[next_state])
    # current q = 100
    # next max q = 70
    # reward 20
    current_q = q_table[state][action]
    new_q = current_q + alpha * (reward + gamma * next_max_q - current_q)
    q_table[state][action] = new_q

## Training Loop

In [12]:
def train_agent(n_episodes, alpha, gamma, epsilon_start, epsilon_end, epsilon_decay):
    """
    Train the Q-learning agent
    """
    q_table = create_q_table()

    total_episode_states = []

    for episode in range(n_episodes):
        episode_state = {}

        # Reduce epsilon (exploration rate) over time
        epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-episode / epsilon_decay)
        episode_state['epsilon'] = epsilon

        # reset environment
        observation, info = env.reset()
        state = discretize_state(observation)
        
        done = False
        total_reward = 0
        episode_length = 0
        # One episode of training
        while not done:
            action = select_action(state, q_table, epsilon)

            # Take action and observe the result
            next_observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            next_state = discretize_state(next_observation)

            update_q_value(state, action, reward, next_state, q_table, alpha, gamma)

            # move to the next state
            state = next_state
            total_reward += reward
            episode_length += 1
        
        episode_state['total_reward'] = total_reward
        episode_state['episode_length'] = episode_length
        total_episode_states.append(episode_state)

        if episode % 50 == 0:
            last_50_episode_results = total_episode_states[episode-50:]
            mean_reward = sum([past_episode['total_reward'] for past_episode in last_50_episode_results]) / 50
            print(f"Episode {episode}, Average reward: {mean_reward}, Epsilon: {epsilon:.4f}")

        

    return q_table, total_episode_states

## Testing the trained agent

In [13]:
def test_agent(q_table, n_episodes=10, render=True):
    """
    Test the trained Q-learning agent over several episodes
    
    Args:
        q_table: The learned Q-table
        n_episodes: Number of test episodes to run
        render: Whether to render the environment (set to True to visualize)
    
    Returns:
        Average episode length across all test episodes
    """
    env_test = gym.make('CartPole-v1', render_mode='human' if render else None)
    episode_lengths = []
    
    for episode in range(n_episodes):
        # Reset the environment
        observation, info = env_test.reset()
        state = discretize_state(observation)
        
        done = False
        episode_length = 0
        
        # Run one episode
        while not done:
            # Always select the best action (no exploration)
            action = np.argmax(q_table[state])
            
            # Take action
            next_observation, reward, terminated, truncated, info = env_test.step(action)
            done = terminated or truncated
            
            # Update state and counter
            state = discretize_state(next_observation)
            episode_length += 1
            
        episode_lengths.append(episode_length)
        print(f"Test Episode {episode+1}/{n_episodes}, Length: {episode_length}")
    
    avg_length = sum(episode_lengths) / len(episode_lengths)
    print(f"Average episode length: {avg_length:.2f}")
    
    env_test.close()
    return avg_length

In [14]:
q_table, total_episode_states = train_agent(
    n_episodes=10000, 
    alpha=0.05, 
    gamma=0.99, 
    epsilon_start=1.0, 
    epsilon_end=0.01, 
    epsilon_decay=3000
)

Episode 0, Average reward: 0.34, Epsilon: 1.0000
Episode 50, Average reward: 23.48, Epsilon: 0.9836
Episode 100, Average reward: 24.34, Epsilon: 0.9675
Episode 150, Average reward: 20.62, Epsilon: 0.9517
Episode 200, Average reward: 22.16, Epsilon: 0.9362
Episode 250, Average reward: 21.28, Epsilon: 0.9208
Episode 300, Average reward: 24.1, Epsilon: 0.9058
Episode 350, Average reward: 26.38, Epsilon: 0.8910
Episode 400, Average reward: 21.94, Epsilon: 0.8764
Episode 450, Average reward: 23.24, Epsilon: 0.8621
Episode 500, Average reward: 22.2, Epsilon: 0.8480
Episode 550, Average reward: 21.42, Epsilon: 0.8342
Episode 600, Average reward: 25.04, Epsilon: 0.8205
Episode 650, Average reward: 24.74, Epsilon: 0.8071
Episode 700, Average reward: 26.52, Epsilon: 0.7940
Episode 750, Average reward: 24.16, Epsilon: 0.7810
Episode 800, Average reward: 24.46, Epsilon: 0.7683
Episode 850, Average reward: 30.36, Epsilon: 0.7557
Episode 900, Average reward: 25.2, Epsilon: 0.7434
Episode 950, Averag

In [15]:
test_agent(q_table)

Test Episode 1/10, Length: 65
Test Episode 2/10, Length: 58
Test Episode 3/10, Length: 15
Test Episode 4/10, Length: 63
Test Episode 5/10, Length: 111
Test Episode 6/10, Length: 22
Test Episode 7/10, Length: 17
Test Episode 8/10, Length: 22
Test Episode 9/10, Length: 11
Test Episode 10/10, Length: 22
Average episode length: 40.60


40.6