In [13]:
import numpy as np
from MountainCarEnv import MountainCarEnv

In [14]:
env = MountainCarEnv(render_mode="human")

In [15]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()
        print('explore')
    # exploit
    else:
        action = np.argmax(Q[state])
        print('exploit')
    return action

In [16]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

In [17]:
pos_space = np.linspace(-5, 5, 10)
vel_space = np.linspace(-3, 3, 2)
pos_space

array([-5.        , -3.88888889, -2.77777778, -1.66666667, -0.55555556,
        0.55555556,  1.66666667,  2.77777778,  3.88888889,  5.        ])

In [18]:
def get_state(obs):
    pos, vel = obs
    pos_bin = np.digitize(pos, pos_space)
    vel_bin = np.digitize(vel, vel_space)
    return pos_bin, vel_bin

In [19]:
state = get_state(np.array([-0.4, 0.2]))
state

(5, 1)

In [20]:
actions = list(range(env.action_space.n))
actions

[0, 1, 2]

In [21]:
Q = np.zeros((11,3,3))
Q

array([[[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]])

In [22]:
obs = env.reset()
print(obs)
done = False
while not done:
    state = get_state(obs)
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, _ = env.step(action)
    print('->', state, action, reward, obs, done)

[-0.59127665  0.        ]
exploit
-> (4, 1) 0 -1.0 [-5.917725e-01 -4.958963e-04] False
explore
-> (4, 1) 2 -1.0 [-0.59076065  0.00101185] False
exploit
-> (4, 1) 0 -1.0 [-5.902485e-01  5.121626e-04] False
explore
-> (4, 1) 2 -1.0 [-0.5882398   0.00200871] False
exploit
-> (4, 1) 0 -1.0 [-0.5867493   0.00149049] False
explore


-> (4, 1) 1 -1.0 [-0.584788    0.00196129] False
explore
-> (4, 1) 2 -1.0 [-0.5813704   0.00341763] False
explore
-> (4, 1) 1 -1.0 [-0.5775216   0.00384875] False
explore
-> (4, 1) 2 -1.0 [-0.5722702   0.00525141] False
exploit
-> (4, 1) 0 -1.0 [-0.5676551   0.00461515] False
exploit
-> (4, 1) 0 -1.0 [-0.56371045  0.00394462] False
exploit
-> (4, 1) 0 -1.0 [-0.5604657   0.00324473] False
exploit
-> (4, 1) 0 -1.0 [-0.5579451   0.00252067] False
exploit
-> (4, 1) 0 -1.0 [-0.55616724  0.00177781] False
explore
-> (4, 1) 0 -1.0 [-0.55514556  0.00102168] False
explore
-> (5, 1) 1 -1.0 [-0.5538876   0.00125793] False
exploit
-> (5, 1) 0 -1.0 [-5.5340284e-01  4.8478460e-04] False
exploit
-> (5, 1) 0 -1.0 [-5.5369484e-01 -2.9198258e-04] False
explore
-> (5, 1) 2 -1.0 [-0.5527614   0.00093343] False
explore
-> (5, 1) 1 -1.0 [-0.5516095   0.00115187] False
explore
-> (5, 1) 0 -1.0 [-5.5124784e-01  3.6170558e-04] False
explore
-> (5, 1) 0 -1.0 [-5.5167902e-01 -4.3116428e-04] False
explore
-> (5, 

KeyboardInterrupt: 