In [1]:
import gym
import numpy as np

In [68]:
# Hyper-parameters -- tune these for possibly better results.

# Environment to train on (name passed to gym.make).
env_name = 'MountainCar-v0'

# Number of bins per observation dimension used to discretize the state.
n_states = 40

# Training length: episode count and per-episode step cap.
max_iterations, t_max = 10000, 10000

# Learning-rate schedule: start at lr_init and never decay below lr_min.
lr_init, lr_min = 1.0, 0.003

# Discount factor applied to future rewards.
gamma = 1

# Probability of taking a uniformly random action during training.
epsilon = 0.02


In [65]:
def run_episode(env, policy=None, render=False):
    """Play one episode and return its discounted reward sum.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` (unpacked here
            as a 4-tuple ``obs, reward, done, info``), ``render()`` and
            ``action_space.sample()``.
        policy: Optional 2-D table indexed by the pair of bin indices returned
            by ``obs_to_state``; each entry is the action to take. When
            ``None`` a random action is sampled at every step.
        render: When true, ``env.render()`` is called before every step.

    Returns:
        The sum over steps of ``gamma ** t * reward``, where ``t`` is the
        zero-based step index. At most ``t_max`` steps are played.
    """
    obs = env.reset()
    discounted_return = 0
    for t in range(t_max):
        if render:
            env.render()

        if policy is not None:
            # Look the action up in the table via the discretized observation.
            row, col = obs_to_state(env, obs)
            action = policy[row][col]
        else:
            action = env.action_space.sample()

        obs, reward, done, _info = env.step(action)
        discounted_return += gamma ** t * reward
        if done:
            break
    return discounted_return


In [66]:
# Discretization helper: maps a continuous observation to Q-table bin indices.
def obs_to_state(env, obs, n_bins=None):
    """Convert a 2-component observation into a pair of integer bin indices.

    Each of the first two observation components is split into ``n_bins``
    equal-width bins between the environment's reported low and high bounds.

    Args:
        env: Environment whose ``observation_space.low`` / ``.high`` give the
            per-component bounds.
        obs: Observation; only ``obs[0]`` and ``obs[1]`` are read.
        n_bins: Number of bins per component. Defaults to the module-level
            ``n_states`` when omitted.

    Returns:
        Tuple ``(a, b)`` of ints, each in ``[0, n_bins - 1]``.
    """
    if n_bins is None:
        n_bins = n_states
    env_low = env.observation_space.low
    env_high = env.observation_space.high
    env_dx = (env_high - env_low) / n_bins
    a = int((obs[0] - env_low[0])/env_dx[0])
    b = int((obs[1] - env_low[1])/env_dx[1])
    # An observation sitting exactly on the upper bound divides out to n_bins,
    # one past the last valid index; clamp so the result always indexes a
    # table with n_bins entries per axis.
    last = n_bins - 1
    return min(max(a, 0), last), min(max(b, 0), last)


In [69]:
if __name__ == '__main__':
    # Train a tabular Q-function over the discretized observation grid, then
    # evaluate and render the greedy policy derived from it.
    env = gym.make(env_name)
    # Size the action axis from the environment instead of hard-coding 3.
    n_actions = env.action_space.n
    q_table = np.zeros((n_states, n_states, n_actions))
    for i in range(max_iterations):
        obs = env.reset()
        reward_total = 0
        # Learning rate decays by a factor of 0.85 every 100 episodes,
        # floored at lr_min.
        eta = max(lr_min, lr_init * (0.85 ** (i//100)))
        for _step in range(t_max):
            a, b = obs_to_state(env, obs)
            # Take a uniformly random action with probability epsilon so the
            # agent does not get stuck in a local optimum.
            if np.random.uniform(0, 1) < epsilon:
                action = np.random.choice(n_actions)
            else:
                # Softmax over the Q-values of the current state. Subtracting
                # the max logit leaves the probabilities unchanged but keeps
                # np.exp from underflowing to all zeros (which would produce
                # NaN probabilities) when every Q-value is strongly negative.
                logits = q_table[a][b]
                logits_exp = np.exp(logits - np.max(logits))
                prob = logits_exp/np.sum(logits_exp)
                action = np.random.choice(n_actions, p=prob)

            obs, reward, done, _ = env.step(action)
            reward_total += reward
            a_, b_ = obs_to_state(env, obs)
            # Q-learning update: move Q(s, a) toward
            # reward + gamma * max_a' Q(s', a') by a step of size eta.
            # NOTE(review): the target bootstraps from the next state even
            # when `done` is True -- confirm this is intended for terminal
            # transitions.
            td_target = reward + gamma * np.max(q_table[a_][b_])
            q_table[a][b][action] += eta * (td_target - q_table[a][b][action])
            if done:
                break
        if i % 100 == 0:
            print('Iteration #%d -- Total reward %d.' % (i+1, reward_total))

    # Greedy policy: the highest-valued action in every discretized state.
    solution_policy = np.argmax(q_table, axis=2)
    solution_policy_scores = [run_episode(env, solution_policy, False) for _ in range(100)]
    print("Average score of solution = ", np.mean(solution_policy_scores))

    # Render one final episode with the learned policy.
    run_episode(env, solution_policy, True)

[2018-03-02 01:36:00,841] Making new env: MountainCar-v0


Iteration #1 -- Total reward -200.


Iteration #101 -- Total reward -200.


Iteration #201 -- Total reward -200.


Iteration #301 -- Total reward -200.


Iteration #401 -- Total reward -200.


Iteration #501 -- Total reward -200.


Iteration #601 -- Total reward -200.


Iteration #701 -- Total reward -200.


Iteration #801 -- Total reward -200.


Iteration #901 -- Total reward -200.


Iteration #1001 -- Total reward -200.


Iteration #1101 -- Total reward -200.


Iteration #1201 -- Total reward -200.


Iteration #1301 -- Total reward -200.


Iteration #1401 -- Total reward -200.


Iteration #1501 -- Total reward -200.


Iteration #1601 -- Total reward -200.


Iteration #1701 -- Total reward -200.


Iteration #1801 -- Total reward -200.


Iteration #1901 -- Total reward -200.


Iteration #2001 -- Total reward -200.


Iteration #2101 -- Total reward -200.


Iteration #2201 -- Total reward -200.


Iteration #2301 -- Total reward -200.


Iteration #2401 -- Total reward -200.


Iteration #2501 -- Total reward -200.


Iteration #2601 -- Total reward -200.


Iteration #2701 -- Total reward -200.


Iteration #2801 -- Total reward -200.


Iteration #2901 -- Total reward -200.


Iteration #3001 -- Total reward -200.


Iteration #3101 -- Total reward -200.


Iteration #3201 -- Total reward -200.


Iteration #3301 -- Total reward -200.


Iteration #3401 -- Total reward -200.


Iteration #3501 -- Total reward -200.


Iteration #3601 -- Total reward -200.


Iteration #3701 -- Total reward -200.


Iteration #3801 -- Total reward -200.


Iteration #3901 -- Total reward -200.


Iteration #4001 -- Total reward -200.


Iteration #4101 -- Total reward -200.


Iteration #4201 -- Total reward -200.


Iteration #4301 -- Total reward -200.


Iteration #4401 -- Total reward -200.


Iteration #4501 -- Total reward -200.


Iteration #4601 -- Total reward -200.


Iteration #4701 -- Total reward -200.


Iteration #4801 -- Total reward -200.


Iteration #4901 -- Total reward -200.


Iteration #5001 -- Total reward -200.


Iteration #5101 -- Total reward -200.


Iteration #5201 -- Total reward -200.


Iteration #5301 -- Total reward -200.


Iteration #5401 -- Total reward -200.


Iteration #5501 -- Total reward -200.


Iteration #5601 -- Total reward -200.


Iteration #5701 -- Total reward -200.


Iteration #5801 -- Total reward -200.


Iteration #5901 -- Total reward -200.


Iteration #6001 -- Total reward -200.


Iteration #6101 -- Total reward -200.


Iteration #6201 -- Total reward -200.


Iteration #6301 -- Total reward -200.


Iteration #6401 -- Total reward -200.


Iteration #6501 -- Total reward -200.


Iteration #6601 -- Total reward -200.


Iteration #6701 -- Total reward -200.


Iteration #6801 -- Total reward -200.


Iteration #6901 -- Total reward -200.


Iteration #7001 -- Total reward -200.


Iteration #7101 -- Total reward -200.


Iteration #7201 -- Total reward -200.


Iteration #7301 -- Total reward -200.


Iteration #7401 -- Total reward -200.


Iteration #7501 -- Total reward -200.


Iteration #7601 -- Total reward -200.


Iteration #7701 -- Total reward -177.


Iteration #7801 -- Total reward -200.


Iteration #7901 -- Total reward -200.


Iteration #8001 -- Total reward -200.


Iteration #8101 -- Total reward -200.


Iteration #8201 -- Total reward -200.


Iteration #8301 -- Total reward -200.


Iteration #8401 -- Total reward -200.


Iteration #8501 -- Total reward -200.


Iteration #8601 -- Total reward -200.


Iteration #8701 -- Total reward -200.


Iteration #8801 -- Total reward -200.


Iteration #8901 -- Total reward -200.


Iteration #9001 -- Total reward -200.


Iteration #9101 -- Total reward -200.


Iteration #9201 -- Total reward -200.


Iteration #9301 -- Total reward -200.


Iteration #9401 -- Total reward -200.


Iteration #9501 -- Total reward -200.


Iteration #9601 -- Total reward -200.


Iteration #9701 -- Total reward -200.


Iteration #9801 -- Total reward -200.


Iteration #9901 -- Total reward -200.


Average score of solution =  -118.22


In [42]:
# Replay one rendered episode with the learned greedy policy; the cell's
# displayed value is that episode's discounted return.
run_episode(env, solution_policy, render=True)

-161.0