In [1]:
import gymnasium as gym
import numpy as np

is_training = True
render = False

episodes = 5000
learning_rate_a = 0.9
discount_factor_g = 0.9

epsilon = 1
epsilon_decay_rate = 2/episodes
rng = np.random.default_rng()

env = gym.make("MountainCar-v0", render_mode='human' if render else None)
# Break POS and Velocity into 20 segments
pos_space = np.linspace(env.observation_space.low[0], env.observation_space.high[0], 20) 
vel_space = np.linspace(env.observation_space.low[1], env.observation_space.high[1], 20)

q = np.zeros((len(pos_space), len(vel_space), env.action_space.n))

for i in range(episodes):
    batch_best = -1001

    state = env.reset()[0]
    # find segment of agents POS and Velocity
    state_p = np.digitize(state[0], pos_space)
    state_v = np.digitize(state[1], vel_space)

    terminated = False
    rewards = 0

    while(not terminated and rewards > -1000):
        if is_training and rng.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q[state_p, state_v, :])

        new_state,reward,terminated,_,_ = env.step(action)
        new_state_p = np.digitize(state[0], pos_space)
        new_state_v = np.digitize(state[1], vel_space)

        if is_training:
            q[state_p, state_v, action] = q[state_p, state_v, action]+learning_rate_a*(
                reward+discount_factor_g*np.max(q[new_state_p,new_state_v,:])-q[state_p, state_v, action]
            )

        state = new_state
        state_p = new_state_p
        state_v = new_state_v

        rewards += reward

    epsilon = max(epsilon - epsilon_decay_rate, 0)
    
    batch_best = max(batch_best, rewards)
    if i % 100 == 0:
        print(f'Episode: {i+1} - Best of the batch: {batch_best}')
        batch_best = -1001
env.close()



Episode: 0 - Best of the batch: -1000.0
Episode: 100 - Best of the batch: -1000.0
Episode: 200 - Best of the batch: -1000.0
Episode: 300 - Best of the batch: -1000.0
Episode: 400 - Best of the batch: -1000.0
Episode: 500 - Best of the batch: -1000.0
Episode: 600 - Best of the batch: -1000.0
Episode: 700 - Best of the batch: -1000.0
Episode: 800 - Best of the batch: -1000.0
Episode: 900 - Best of the batch: -866.0
Episode: 1000 - Best of the batch: -1000.0
Episode: 1100 - Best of the batch: -1000.0
Episode: 1200 - Best of the batch: -1000.0
Episode: 1300 - Best of the batch: -1000.0
Episode: 1400 - Best of the batch: -1000.0
Episode: 1500 - Best of the batch: -1000.0
Episode: 1600 - Best of the batch: -1000.0
Episode: 1700 - Best of the batch: -1000.0
Episode: 1800 - Best of the batch: -1000.0
Episode: 1900 - Best of the batch: -1000.0
Episode: 2000 - Best of the batch: -1000.0
Episode: 2100 - Best of the batch: -1000.0
Episode: 2200 - Best of the batch: -765.0
Episode: 2300 - Best of t

In [2]:
env = gym.make("MountainCar-v0", render_mode='human')

# Break POS and Velocity into 20 segments
pos_space = np.linspace(env.observation_space.low[0], env.observation_space.high[0], 20) 
vel_space = np.linspace(env.observation_space.low[1], env.observation_space.high[1], 20)

for i in range(3):

    state = env.reset()[0]
    # find segment of agents POS and Velocity
    state_p = np.digitize(state[0], pos_space)
    state_v = np.digitize(state[1], vel_space)

    terminated = False
    rewards = 0

    while(not terminated and rewards > -1000):
        action = np.argmax(q[state_p, state_v, :])

        new_state,reward,terminated,_,_ = env.step(action)
        new_state_p = np.digitize(state[0], pos_space)
        new_state_v = np.digitize(state[1], vel_space)

        state = new_state
        state_p = new_state_p
        state_v = new_state_v

        rewards += reward

    epsilon = max(epsilon - epsilon_decay_rate, 0)

    print(f'Episode: {i+1} - Steps: {rewards*-1}')

env.close()

Episode: 0 - Steps: 567.0
Episode: 1 - Steps: 328.0
Episode: 2 - Steps: 425.0
Episode: 3 - Steps: 703.0
Episode: 4 - Steps: 529.0
Episode: 5 - Steps: 373.0
Episode: 6 - Steps: 384.0
Episode: 7 - Steps: 353.0
Episode: 8 - Steps: 631.0
Episode: 9 - Steps: 434.0
