@author Gediyon M. Girma

TD(n) n-step SARSA on-policy method for MountainCar-V0 environment

In [1]:
import gym
import time
import itertools
import random
import numpy as np
env = gym.make('MountainCar-v0')

  deprecation(
  deprecation(


In [2]:
bins = (30, 30)

def discretize_observation(observation):
  """
  Discretizes the observation space.

  Args:
    observation: The observation to discretize.

  Returns:
    The discretized observation.
  """


  low = env.observation_space.low
  high = env.observation_space.high

  return tuple(np.digitize(observation[i], np.linspace(low[i], high[i], bins[i] + 1)) - 1 for i in range(2))

In [3]:
# TD(n) n-step SARSA on-policy using epsilon-greedy policy

alpha = 0.1 #step size for incremental averaging
gamma = 1 #discount factor
n = 2 # number of steps
epsilon = 0.1

# formaulate the state space with every combination of the discritsized elements of the states
states = itertools.product(np.arange(bins[0]), np.arange(bins[1]))

policy = {}
q = {}



for state in states:
  q[state] = np.zeros(env.action_space.n) # initialize the action-value function
  policy[state] = np.full(((env.action_space.n)),epsilon/env.action_space.n) # initialize the policy
  best_action = np.argmax(q[state])
  policy[state][best_action] += 1 - epsilon # initialize a random policy





In [4]:

episodes = 5e4
start_timer = time.time()
episode = 1

while episode < episodes:
  episode_tracker = []

  # reset the environment
  obs = env.reset()
  state = discretize_observation(obs) # discretize the observation
  action = np.random.choice(np.arange(env.action_space.n), p = policy[state]) # select an action

  episode_tracker.append([0,state,action])
  T = float('inf') # terminal state

  t = 0 #initialize the time-step counter t

  while True:


    if t<T:

      obs, reward, done, info = env.step(action) # taking the action
      next_state = discretize_observation(obs) # next state

      episode_tracker.append([reward, next_state])

      if done:
        T = t+1
      else:
        action = np.random.choice(np.arange(env.action_space.n), p = policy[state]) # select an action
        episode_tracker[-1].append(action)

    tau = t - n + 1 # the time step whose state estimate is being updated (going n steps back)

    if tau >= 0:
      # compute the return
      
      G = sum([(gamma**(i-tau-1))*episode_tracker[i][0] for i in range(tau + 1, min(tau + n, T)+1)]) 


      if tau + n < T:
        G += (gamma**n)*q[episode_tracker[tau + n][1]][episode_tracker[tau + n][2]] # compute the discounted return
        
      s = episode_tracker[tau][1]
      a = episode_tracker[tau][2]
      q[s][a] += alpha * (G - q[s][a]) # update the action-value function

      policy[s] = np.full(((env.action_space.n)),epsilon/env.action_space.n) # update the policy
      best_action = np.argmax(q[s])
      policy[s][best_action] += 1-epsilon

    if tau == T-1:  # when the update time reaches terminal state, break
      break

    state = next_state  # update the state

    t += 1 # updating the time- step

  episode += 1
  if episode % 10000 == 0:
    end_timer = time.time()
    timer = end_timer - start_timer
    elapsed_time_struct = time.gmtime(timer)
    formatted_time = time.strftime("%H:%M:%S", elapsed_time_struct)
    print("Episode: ",episode, " time: ", formatted_time)

  if episode == episodes:
    print("done!")


  if not isinstance(terminated, (bool, np.bool8)):


Episode:  10000  time:  00:06:17
Episode:  20000  time:  00:11:58
Episode:  30000  time:  00:17:32
Episode:  40000  time:  00:22:56
Episode:  50000  time:  00:28:26
done!


In [5]:
#test
steps_to_solution = []

for j in range(100):
  obs = env.reset()
  state = discretize_observation(obs)
  for i in range(1,50000):
    action = np.argmax(policy[state])
    # print("step",i, "reward: ", reward," state: ", state," action=",action)
    obs, reward, done, info = env.step(action)
    state = discretize_observation(obs)

    if done:
      #print ("done")
      steps_to_solution.append(i)
      break
print(steps_to_solution)
avg_step = np.mean(steps_to_solution)
print("Average steps to solution per 100 episodes: ",avg_step)

# ipythondisplay.clear_output(wait=True)
env.close()

[148, 200, 165, 186, 182, 162, 166, 179, 200, 200, 157, 165, 189, 161, 200, 157, 169, 159, 200, 200, 175, 172, 200, 158, 166, 167, 162, 200, 180, 200, 200, 170, 158, 158, 161, 200, 165, 163, 156, 162, 200, 168, 200, 166, 200, 157, 168, 166, 161, 200, 163, 176, 177, 185, 200, 170, 164, 200, 163, 164, 184, 176, 162, 169, 162, 162, 200, 165, 187, 162, 170, 200, 163, 169, 200, 159, 162, 162, 166, 161, 168, 200, 172, 200, 200, 166, 163, 200, 200, 148, 166, 200, 161, 161, 155, 200, 172, 176, 158, 173]
Average steps to solution per 100 episodes:  175.46
