In [2]:
import datetime
import time
import gymnasium as gym
import numpy as np

In [3]:
# Training environment: CartPole-v1, built without a render_mode argument.
env = gym.make("CartPole-v1")

In [7]:
# Discretization grid for the four observation components.
# n_bins[k] is the number of discrete cells for component k; each entry of
# state_bins holds the n_bins[k] - 1 evenly spaced edges between those cells,
# so np.digitize against them yields an index in 0 .. n_bins[k] - 1.
n_bins = [10, 10, 10, 10]
state_bins = [
    np.linspace(low, high, count - 1)
    for (low, high), count in zip(
        [
            (-4.8, 4.8),      # cart position
            (-3.5, 3.5),      # cart velocity
            (-0.418, 0.418),  # pole angle
            (-3.5, 3.5),      # pole angular velocity
        ],
        n_bins,
    )
]

In [8]:
# Q-learning hyper-parameters
alpha = 0.1    # step size applied to each temporal-difference update
gamma = 0.95   # discount applied to the bootstrapped next-state value
epsilon = 1    # initial probability of picking a random action
# Factor epsilon is multiplied by after every episode of the training loop.
# NOTE(review): 0.99995 ** 1000 is roughly 0.95, so across the 1000 training
# episodes exploration barely decays -- confirm this schedule is intended.
epsilon_reduce = 0.99995

# Action-value table, all zeros to start: one axis per discretized observation
# component (sizes from n_bins) followed by one axis over the actions.
Q = np.zeros((*n_bins, env.action_space.n))

In [9]:
# Q-learning algorithm
#
# Runs 1000 training episodes of tabular Q-learning on `env`, updating the
# global table `Q` in place and decaying the global `epsilon` once per episode.
# Wall-clock start/end times and one line per finished episode are printed.
print("start time equal with         "+time.strftime("%H:%M:%S"))
for episode in range(1000):
    # reset() returns (observation, info); only the observation is used.
    state, info = env.reset()
    done = False

    while not done:
        # Discretize the current state
        state_discrete = tuple(np.digitize(state[i], state_bins[i]) for i in range(4))

        # Choose action using epsilon-greedy strategy
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Random action
        else:
            action = int(np.argmax(Q[state_discrete]))

        # Take the chosen action and observe the next state and reward.
        # step() returns (obs, reward, terminated, truncated, info): the episode
        # is over when EITHER flag is set. Watching only `terminated` would keep
        # stepping the environment after a time-limit truncation.
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Discretize the next state
        next_state_discrete = tuple(np.digitize(next_state[i], state_bins[i]) for i in range(4))

        # Update Q-value for the current state-action pair.
        # A terminated transition has no future return, so the target is just
        # the reward; a truncated one is only a time cut-off, so it still
        # bootstraps from the next state's best value.
        if terminated:
            td_target = reward
        else:
            td_target = reward + gamma * np.max(Q[next_state_discrete])
        Q[state_discrete + (action,)] += alpha * (td_target - Q[state_discrete + (action,)])

        # Transition to the next state
        state = next_state
    print(f"step {episode} has been ended....")
    epsilon *= epsilon_reduce

# After training, the learned Q-values can be used to select actions greedily.
env.close()


print("end time equal with         "+time.strftime("%H:%M:%S"))

start time equal with         01:41:32
step 0 has been ended....
step 1 has been ended....
step 2 has been ended....
step 3 has been ended....
step 4 has been ended....
step 5 has been ended....
step 6 has been ended....
step 7 has been ended....
step 8 has been ended....
step 9 has been ended....
step 10 has been ended....
step 11 has been ended....
step 12 has been ended....
step 13 has been ended....
step 14 has been ended....
step 15 has been ended....
step 16 has been ended....
step 17 has been ended....
step 18 has been ended....
step 19 has been ended....
step 20 has been ended....
step 21 has been ended....
step 22 has been ended....
step 23 has been ended....
step 24 has been ended....
step 25 has been ended....
step 26 has been ended....
step 27 has been ended....
step 28 has been ended....
step 29 has been ended....
step 30 has been ended....
step 31 has been ended....
step 32 has been ended....
step 33 has been ended....
step 34 has been ended....
step 35 has been ended....

In [10]:
# show agent
#
# Plays one greedy episode (always argmax over Q, no exploration) in a separate
# environment created with render_mode="human", then closes that environment.
ss = gym.make('CartPole-v1', render_mode="human")
try:
    # reset() returns (observation, info); only the observation is used.
    state, info = ss.reset()
    done = False
    while not done:
        state_discrete = tuple(np.digitize(state[i], state_bins[i]) for i in range(4))
        action = int(np.argmax(Q[state_discrete]))
        # The episode ends on termination OR on time-limit truncation.
        next_state, reward, terminated, truncated, info = ss.step(action)
        done = terminated or truncated
        state = next_state

    # Keep the final frame on screen briefly before tearing the window down.
    time.sleep(2)
finally:
    # Close the rendering environment created here (not the training `env`),
    # even if the rollout raised, so the render window is always released.
    ss.close()

: 