In [1]:
import gymnasium as gym
import numpy as np

In [2]:
# Initialize the Gym environment
env = gym.make("CartPole-v1")

# Set up the Q-table
num_features = env.observation_space.shape[0]
state_space = [30] * num_features
q_table = np.zeros(state_space + [env.action_space.n])

# Define hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_decay = 0.995 
min_epsilon = 0.01

In [3]:
# Discretize the state space
def discretize_state(state):
    bins = [np.linspace(-4.8, 4.8, state_space[0] - 1),
            np.linspace(-4, 4, state_space[1] - 1),
            np.linspace(-0.418, 0.418, state_space[2] - 1),
            np.linspace(-4, 4, state_space[3] - 1)]
    return tuple(np.digitize(state[i], bins[i]) for i in range(len(state)))

In [4]:
# Training the Q-learning agent
num_episodes = 10000
for episode in range(num_episodes):
    # Discretise state
    state = discretize_state(env.reset()[0])
    done = trunc = False
    
    while not done and not trunc:
        # Using epsilon-greedy action selection
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])
        
        next_state, reward, done, trunc, _ = env.step(action)
        next_state = discretize_state(next_state)
        
        # Penalise stopping
        if done and reward == 0:
            reward = -100
        
        # Update q table
        q_table[state][action] = q_table[state][action] + alpha * (reward + gamma * np.max(q_table[next_state]) - q_table[state][action])
        state = next_state

    # Decay epsilon
    if epsilon > min_epsilon:
        epsilon *= epsilon_decay

In [5]:
# Evaluate the agent
total_rewards = 0
for episode in range(100):
    state = discretize_state(env.reset()[0])
    done = False
    while not done:
        action = np.argmax(q_table[state])
        next_state, reward, done, _, _ = env.step(action)
        state = discretize_state(next_state)
        total_rewards += reward

print(f"Average reward over 100 episodes: {total_rewards / 100}")

Average reward over 100 episodes: 151.74
