In [None]:
import gym
import numpy as np

In [68]:
# Tunable settings -- experiment with these to try for better results.

# Environment that the agent is trained and evaluated on.
env_name = 'MountainCar-v0'

# Number of discrete bins per observation dimension (the Q-table is
# n_states x n_states x number of actions).
n_states = 40

# Training schedule: number of training episodes, and the cap on the number
# of steps taken within a single episode.
max_iterations = 10000
t_max = 10000

# Learning rate starts at lr_init and is decayed during training, but is
# never allowed to drop below lr_min.
lr_init = 1.0
lr_min = 0.003

# Discount factor applied to future rewards.
gamma = 1

# Probability of taking a uniformly random action during training.
epsilon = 0.02


In [65]:
def run_episode(env, policy=None, render=False):
    """Play a single episode and return the discounted sum of its rewards.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``, ``render()``
            and ``action_space.sample()``; ``step`` must return a 4-tuple
            ``(obs, reward, done, info)``.
        policy: Optional 2-D table of actions indexed by the discrete state
            pair produced by ``obs_to_state``. When ``None`` an action is
            sampled from the action space at every step.
        render: When true, ``env.render()`` is called before each step.

    Returns:
        The sum over steps of ``gamma ** t * reward_t``, where ``t`` is the
        zero-based step number. The episode ends when the environment reports
        ``done`` or after ``t_max`` steps, whichever comes first.
    """
    observation = env.reset()
    discounted_return = 0
    # The loop counter doubles as the exponent of the discount factor.
    for step in range(t_max):
        if render:
            env.render()
        if policy is not None:
            row, col = obs_to_state(env, observation)
            chosen_action = policy[row][col]
        else:
            chosen_action = env.action_space.sample()
        observation, step_reward, finished, _info = env.step(chosen_action)
        discounted_return += gamma ** step * step_reward
        if finished:
            break
    return discounted_return


In [66]:
# Discretisation helper: maps a continuous observation onto the Q-table grid.
def obs_to_state(env, obs, n_bins=None):
    """Convert a continuous observation into a pair of discrete bin indices.

    Each of the first two observation components is mapped onto one of
    ``n_bins`` equal-width bins spanning the environment's observation bounds.

    Args:
        env: Object whose ``observation_space.low`` and
            ``observation_space.high`` give the per-component bounds.
        obs: Observation; only ``obs[0]`` and ``obs[1]`` are read.
        n_bins: Number of bins per component. Defaults to the module-level
            ``n_states`` when omitted.

    Returns:
        Tuple ``(a, b)`` of Python ints, each in ``[0, n_bins - 1]``.
    """
    if n_bins is None:
        n_bins = n_states
    env_low = env.observation_space.low
    env_high = env.observation_space.high
    env_dx = (env_high - env_low) / n_bins
    a = int((obs[0] - env_low[0]) / env_dx[0])
    b = int((obs[1] - env_low[1]) / env_dx[1])
    # An observation exactly on the upper bound would otherwise map to index
    # n_bins, which is one past the last bin and overflows the Q-table.
    # Values below the lower bound would give negative indices that silently
    # wrap around. Clamp both ends into the valid range.
    a = min(max(a, 0), n_bins - 1)
    b = min(max(b, 0), n_bins - 1)
    return a, b


In [None]:
if __name__ == '__main__':
    # Train a tabular Q-learning agent on the discretised environment, then
    # evaluate and render the greedy policy read off the learned table.
    env = gym.make(env_name)
    # Size the action dimension from the environment instead of hard-coding it.
    n_actions = env.action_space.n
    # One Q-value per (first-component bin, second-component bin, action).
    q_table = np.zeros((n_states, n_states, n_actions))
    for i in range(max_iterations):
        obs = env.reset()
        reward_total = 0
        # adjust learning rate: shrink by 15% every 100 episodes, floored at lr_min
        eta = max(lr_min, lr_init * (0.85 ** (i//100)))
        for _ in range(t_max):
            a, b = obs_to_state(env, obs)
            # take a random step sometimes so we don't get stuck on a local minimum
            if np.random.uniform(0, 1) < epsilon:
                action = np.random.choice(n_actions)
            else:
                # Turn the learned Q-values into a probability distribution
                # (softmax) and sample the action from it.
                logits = q_table[a][b]
                # Subtracting the max leaves the softmax unchanged but keeps
                # the exponentials from all underflowing to zero when the
                # Q-values are very negative, which would otherwise produce
                # NaN probabilities and make np.random.choice raise.
                logits_exp = np.exp(logits - np.max(logits))
                prob = logits_exp / np.sum(logits_exp)
                action = np.random.choice(n_actions, p=prob)

            obs, reward, done, _ = env.step(action)
            reward_total += reward
            a_, b_ = obs_to_state(env, obs)
            # Bellman update: move Q(s, a) towards the observed reward plus the
            # discounted best value of the next state.
            # NOTE(review): the target bootstraps from the next state even when
            # `done` is True; left unchanged here -- confirm this is intended.
            td_target = reward + gamma * np.max(q_table[a_][b_])
            q_table[a][b][action] += eta * (td_target - q_table[a][b][action])
            if done:
                break
        if i % 100 == 0:
            print('Iteration #%d -- Total reward %d.' % (i+1, reward_total))

    # select best learned actions to use on final try
    solution_policy = np.argmax(q_table, axis=2)
    solution_policy_scores = [run_episode(env, solution_policy, False) for _ in range(100)]
    print("Average score of solution = ", np.mean(solution_policy_scores))

    # Render one episode with the greedy policy, and always release the
    # environment (and any render window) afterwards.
    try:
        run_episode(env, solution_policy, True)
    finally:
        env.close()

