@author: Gediyon M. Girma

Monte Carlo (MC) control with exploring starts for the MountainCar-v0 environment.

Problem: Mountain Car

*   reward: The goal is to reach the flag placed on top of the right hill as quickly as possible, as such the agent is penalized with a reward of -1 for each timestep.
*   observation: ndarray of size (2,) where index 0 is the position in the range [-1.2, 0.6] and index 1 is the velocity in the range [-0.07, 0.07]

In [3]:
import gym
import time
import itertools
import random
import numpy as np

In [None]:
# loading the environment
# Single module-level MountainCar-v0 instance; the cells below read its
# observation/action spaces and step it for both training and testing.
env = gym.make('MountainCar-v0')

In [5]:
# discretization of the observation space

bins = (30, 30)  # number of bins per observation dimension

def discretize_observation(observation, low=None, high=None, n_bins=None):
  """
  Map a continuous observation to a tuple of bin indices.

  Dimension i is split into n_bins[i] equal-width bins between low[i] and
  high[i]. Indices are clamped to [0, n_bins[i] - 1]: np.digitize returns
  len(edges) for a value that sits exactly on the last edge, which would
  otherwise produce index n_bins[i] and miss every table keyed by bin index.

  Args:
    observation: Sequence with one value per dimension.
    low: Per-dimension lower bounds. Defaults to env.observation_space.low.
    high: Per-dimension upper bounds. Defaults to env.observation_space.high.
    n_bins: Per-dimension bin counts. Defaults to the module-level bins.

  Returns:
    Tuple of int bin indices, one per entry of n_bins.
  """
  if low is None:
    low = env.observation_space.low
  if high is None:
    high = env.observation_space.high
  if n_bins is None:
    n_bins = bins

  indices = []
  for value, lower, upper, count in zip(observation, low, high, n_bins):
    edges = np.linspace(lower, upper, count + 1)
    raw_index = np.digitize(value, edges) - 1
    # keep boundary and out-of-range values inside the valid bin range
    indices.append(int(np.clip(raw_index, 0, count - 1)))

  return tuple(indices)


In [17]:
# initializing
gamma = 1 # discount factor
alpha = 0.1 # learning rate
epsilon = 0.1 # probability mass spread uniformly over all actions by the epsilon-greedy policy
q = {} # action value function: state -> array with one value per action
policy = {} # target policy: state -> array with one probability per action
counter = {} # visit counts: state -> array with one count per action

n_actions = env.action_space.n

# formulate the state space with every combination of the discretized elements of the states
# (stored as a list so it can be iterated again after the loop below)
states = list(itertools.product(np.arange(bins[0]), np.arange(bins[1])))


for state in states:
  q[state] = np.zeros(n_actions) # initialize the action-value function as zero

  # initialize the policy as deterministic with a uniformly random action
  policy[state] = np.zeros(n_actions)
  policy[state][np.random.randint(n_actions)] += 1

  counter[state] = np.zeros(n_actions)


In [18]:
# on-policy MC control with exploring starts
# Iteration

def _clamp_to_table(state):
  """Clamp bin indices to [0, bins[i] - 1] so they always key into q/policy/counter."""
  return tuple(int(min(max(index, 0), bins[i] - 1)) for i, index in enumerate(state))


start_timer = time.time() # starting the timer

episodes = 50000 # number of training episodes
n_actions = env.action_space.n
action_ids = np.arange(n_actions)

for episode in range(1, episodes + 1):
  # reset the environment before overriding its state
  env.reset()

  # Exploring start: random state and random first action
  obs = [np.random.uniform(env.observation_space.low[i], env.observation_space.high[i]) for i in range(2)]
  state = _clamp_to_table(discretize_observation(obs)) # discretize the observation
  # gym.make normally returns the environment inside wrappers (e.g. a time
  # limit); assigning env.state would only create an attribute on the wrapper,
  # so the start state has to be written to the unwrapped environment.
  env.unwrapped.state = np.array(obs)
  action = env.action_space.sample() # sample an action from the action space

  # list of [reward, state, action] entries; the reward at index t is the one
  # received on entering state t, so the first entry carries a dummy reward of 0
  episode_trajectory = [[0, state, action]]

  # generating an episode by following the current (epsilon-greedy) policy
  while True:

    obs, reward, done, info = env.step(action)
    state = _clamp_to_table(discretize_observation(obs)) # discretize the observation

    if done:
      # the final state has no action; it only carries the last reward
      episode_trajectory.append([reward, state, None])
      break

    # selecting the action based on the target policy
    action = np.random.choice(action_ids, p = policy[state])

    # appending the step to the episode trajectory
    episode_trajectory.append([reward, state, action])


  # initializing the return
  G = 0

  # updating the action-value function, walking the episode backwards so the
  # return can be accumulated incrementally (every-visit update)
  for t in reversed(range(len(episode_trajectory) - 1)):

    G = gamma*G  + episode_trajectory[t+1][0]

    state = episode_trajectory[t][1]
    action = episode_trajectory[t][2]

    counter[state][action] += 1

    q[state][action] += (G-q[state][action])/(counter[state][action]) # incremental averaging

    # updating the policy: epsilon-greedy with respect to the new action values
    best_action = np.argmax(q[state]) # selecting the best action based on the action-value function
    policy[state] = np.full(n_actions, epsilon/n_actions) # every action gets epsilon / n_actions
    policy[state][best_action] += 1 - epsilon # the best action additionally gets 1 - epsilon

  if episode % 10000 == 0:
    end_timer = time.time()
    timer = end_timer - start_timer
    elapsed_time_struct = time.gmtime(timer)
    formatted_time = time.strftime("%H:%M:%S", elapsed_time_struct)
    print("Episode: ",episode, " time: ", formatted_time)

print("done!")



Episode:  10000  time:  00:06:00
Episode:  20000  time:  00:11:46
Episode:  30000  time:  00:17:31
Episode:  40000  time:  00:23:00
Episode:  50000  time:  00:28:33
done!


In [19]:
# test: roll out the learned policy and record how many steps each episode lasts
N_TEST_EPISODES = 100
MAX_TEST_STEPS = 50000 # exclusive upper bound of the per-episode step counter
action_ids = np.arange(env.action_space.n)

steps_to_solution = []

for trial in range(N_TEST_EPISODES):
  obs = env.reset()
  state = discretize_observation(obs)

  for step in range(1, MAX_TEST_STEPS):
    # sample the action from the policy's distribution for the current state
    action = np.random.choice(action_ids, p=policy[state])
    obs, reward, done, info = env.step(action)
    state = discretize_observation(obs)

    if not done:
      continue

    # the episode ended on this step: store its length and start the next one
    steps_to_solution.append(step)
    break

print(steps_to_solution)
avg_step = np.mean(steps_to_solution)
print("Average steps to solution: ",avg_step)

env.close()

[200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200]
Average steps to solution:  200.0
