In [1]:
import gym
import numpy as np
import time

In [17]:
num_states = 30
num_episodes = 150

gamma = 0.99
epsilon = 0.10
initial_lr = 1.0
min_lr = 0.001

In [18]:
env_name = 'MountainCar-v0'
env = gym.make(env_name)
env = env.unwrapped
env = gym.wrappers.Monitor(env,'MountainCar-v0-Q-learning',force=True)
env.seed(0)
np.random.seed(0)

In [19]:
def discrete_state(env, obs):
    env_low = env.observation_space.low
    env_high = env.observation_space.high

    env_den = (env_high - env_low) / num_states

    pos_den = env_den[0]
    vel_den = env_den[1]

    pos_low = env_low[0]
    vel_low = env_low[1]

    pos_scaled = int((obs[0] - pos_low) / pos_den)
    vel_scaled = int((obs[1] - vel_low) / vel_den)
    
    return pos_scaled, vel_scaled

In [20]:
q_table = np.zeros((num_states, num_states, env.action_space.n))
total_steps = 0

for episode in range(num_episodes):
    obs = env.reset()
    steps = 0
    alpha = max(min_lr, initial_lr * (0.8 ** (episode // 5)))
    print('----------')
    print("alpha = " + str(alpha))
    
    while True:
        env.render()
        pos, vel = discrete_state(env, obs)
        
        if np.random.uniform(low=0, high=1) < epsilon:
            action = np.random.choice(env.action_space.n)
        else:
            action = np.argmax(q_table[pos][vel])

        obs, reward, terminate, _ = env.step(action)
        reward = -10 + 3 * abs(obs[0] + 0.5) + 2 * max(obs[0]-0.2,0) + 0.08 * obs[1]

        pos_, vel_ = discrete_state(env, obs)
        q_table[pos][vel][action] = (1 - alpha) * q_table[pos][vel][action] + alpha * (
                reward + gamma * np.max(q_table[pos_][vel_]))
        steps += 1
        
        if terminate:
            break
    
    print("Episode {} completed in {} steps".format(episode + 1, steps))

start = time.time()
while True:
    env.render()
    if (time.time() - start) >= 5:
        break

----------
alpha = 1.0
Episode 1 completed with total reward 0 in 8218 steps
----------
alpha = 1.0
Episode 2 completed with total reward 0 in 3513 steps
----------
alpha = 1.0
Episode 3 completed with total reward 0 in 1259 steps
----------
alpha = 1.0
Episode 4 completed with total reward 0 in 4773 steps
----------
alpha = 1.0
Episode 5 completed with total reward 0 in 2647 steps
----------
alpha = 0.8
Episode 6 completed with total reward 0 in 4681 steps
----------
alpha = 0.8
Episode 7 completed with total reward 0 in 4769 steps
----------
alpha = 0.8
Episode 8 completed with total reward 0 in 1330 steps
----------
alpha = 0.8
Episode 9 completed with total reward 0 in 904 steps
----------
alpha = 0.8
Episode 10 completed with total reward 0 in 569 steps
----------
alpha = 0.6400000000000001
Episode 11 completed with total reward 0 in 705 steps
----------
alpha = 0.6400000000000001
Episode 12 completed with total reward 0 in 889 steps
----------
alpha = 0.6400000000000001
Episode 1

Episode 91 completed with total reward 0 in 235 steps
----------
alpha = 0.018014398509482003
Episode 92 completed with total reward 0 in 194 steps
----------
alpha = 0.018014398509482003
Episode 93 completed with total reward 0 in 281 steps
----------
alpha = 0.018014398509482003
Episode 94 completed with total reward 0 in 238 steps
----------
alpha = 0.018014398509482003
Episode 95 completed with total reward 0 in 331 steps
----------
alpha = 0.014411518807585602
Episode 96 completed with total reward 0 in 242 steps
----------
alpha = 0.014411518807585602
Episode 97 completed with total reward 0 in 229 steps
----------
alpha = 0.014411518807585602
Episode 98 completed with total reward 0 in 246 steps
----------
alpha = 0.014411518807585602
Episode 99 completed with total reward 0 in 232 steps
----------
alpha = 0.014411518807585602
Episode 100 completed with total reward 0 in 234 steps
----------
alpha = 0.011529215046068483
Episode 101 completed with total reward 0 in 197 steps
----