# Mountain Car Policy

In [10]:
import gym
import numpy as np

import time

In [11]:
# Create the environment and seed both it and NumPy's global RNG so runs are repeatable.
env = gym.make("MountainCar-v0")
env.seed(0)
np.random.seed(0)
# Number of discretization bins per observation dimension (see observationState).
numberOfStates = 40
# Number of training episodes.
numberOfEpisodes = 10000
# Upper bound on the number of steps taken within a single training episode.
numberOfItrs = 10000
# The learning rate starts at initLearningRate and is decayed during training,
# but is never allowed to drop below minLearningRate.
initLearningRate = 1.0
minLearningRate = 0.003
# Discount factor applied to future rewards.
gamma = 1.0
# Probability of picking a uniformly random action during training.
epsilon = 0.02
# Q-value table indexed as [bin_a, bin_b, action]; the trailing 3 is presumably
# env.action_space.n for MountainCar-v0 -- TODO confirm.
qTable = np.zeros((numberOfStates, numberOfStates, 3))

WARN: gym.spaces.Box autodetected dtype as <type 'numpy.float32'>. Please provide explicit dtype.


In [12]:
def runEpisode(environment, policy=None, render=False, max_steps=1000, discount=None):
    """Run a single episode in `environment` and return its discounted return.

    Args:
        environment: Gym-style environment exposing reset(), step(action),
            render() and action_space.sample().
        policy: Optional table indexed as policy[a][b], where (a, b) is the
            pair returned by observationState(environment, obs). When None,
            every action is sampled from the environment's action space.
        render: When true, environment.render() is called before each step.
        max_steps: Upper bound on the number of steps taken in the episode.
        discount: Per-step discount factor. When None, the module-level
            `gamma` is used.

    Returns:
        The sum over all steps taken of discount ** step_index * reward.
    """
    if discount is None:
        discount = gamma
    # Drive the environment that was passed in rather than the module-level
    # `env`, so the function behaves correctly for any environment instance.
    obs = environment.reset()
    total_reward = 0
    for step_index in range(max_steps):
        if render:
            environment.render()
        if policy is None:
            action = environment.action_space.sample()
        else:
            a, b = observationState(environment, obs)
            action = policy[a][b]
        obs, reward, done, _ = environment.step(action)
        total_reward += discount ** step_index * reward
        if done:
            break
    return total_reward

In [13]:
def observationState(environment, obs, num_states=None):
    """Map a two-component observation to a pair of discrete bin indices.

    Each observation dimension is split into `num_states` equal-width bins
    spanning environment.observation_space.low to .high.

    Args:
        environment: Object whose observation_space has `low` and `high`
            arrays with at least two entries.
        obs: Observation; only obs[0] and obs[1] are read.
        num_states: Number of bins per dimension. When None, the module-level
            `numberOfStates` is used.

    Returns:
        Tuple (a, b) of ints, each in the range [0, num_states - 1].
    """
    if num_states is None:
        num_states = numberOfStates
    low = environment.observation_space.low
    high = environment.observation_space.high
    bin_width = (high - low) / num_states
    a = int((obs[0] - low[0]) / bin_width[0])
    b = int((obs[1] - low[1]) / bin_width[1])
    # An observation sitting exactly on the upper bound would otherwise map to
    # index num_states (one past the last bin), and one below the lower bound
    # to a negative index that silently wraps around; clamp both cases.
    last = num_states - 1
    return min(max(a, 0), last), min(max(b, 0), last)

In [14]:
# Tabular Q-learning over the discretized observation space, followed by an
# evaluation of the resulting greedy policy.
n_actions = env.action_space.n
for i in range(numberOfEpisodes):
    obs = env.reset()
    total_reward = 0
    # Step decay: shrink the learning rate by a factor of 0.85 every 100
    # episodes, but never below minLearningRate.
    learningRate = max(minLearningRate, initLearningRate * (0.85 ** (i//100)))
    for j in range(numberOfItrs):
        a, b = observationState(env, obs)
        if np.random.uniform(0, 1) < epsilon:
            # Exploration: uniformly random action.
            action = np.random.choice(n_actions)
        else:
            # Otherwise sample from a softmax over this state's Q-values.
            # Subtracting the maximum before exponentiating leaves the
            # distribution unchanged but keeps np.exp from underflowing to
            # all zeros (which would yield NaN probabilities) when the
            # Q-values are very negative.
            q_values = qTable[a][b]
            weights = np.exp(q_values - np.max(q_values))
            probs = weights / np.sum(weights)
            action = np.random.choice(n_actions, p=probs)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        a_, b_ = observationState(env, obs)
        # Q-learning update towards the one-step bootstrapped target.
        # NOTE(review): the target bootstraps from the next state even when
        # `done` is true -- confirm this is intended for terminal steps.
        td_target = reward + gamma * np.max(qTable[a_][b_])
        qTable[a][b][action] += learningRate * (td_target - qTable[a][b][action])
        if done:
            break

    # Progress report every 1000 episodes.
    if i % 1000 == 0:
        print("At episode :", i+1, "Total Reward is :", total_reward)

# Greedy policy: the highest-valued action for every discretized state.
policy = np.argmax(qTable, axis=2)
# Estimate the policy's quality by averaging the return over many episodes.
policyScores = [runEpisode(env, policy, False) for _ in range(10000)]
print("Average score for episodes = ", np.mean(policyScores))
# Run one more episode with rendering enabled, then release the environment.
runEpisode(env, policy, True)
env.close()

('At episode :', 1, 'Total Reward is :', -200.0)
('At episode :', 1001, 'Total Reward is :', -200.0)
('At episode :', 2001, 'Total Reward is :', -200.0)
('At episode :', 3001, 'Total Reward is :', -200.0)
('At episode :', 4001, 'Total Reward is :', -200.0)
('At episode :', 5001, 'Total Reward is :', -200.0)
('At episode :', 6001, 'Total Reward is :', -200.0)
('At episode :', 7001, 'Total Reward is :', -200.0)
('At episode :', 8001, 'Total Reward is :', -198.0)
('At episode :', 9001, 'Total Reward is :', -200.0)
('Average score for episodes = ', -129.54390000000001)
