@author Gediyon M. Girma

TD(n) or n-step SARSA controllers of TD(2), TD(3), and TD(4) and compare thier performances for MountainCar environment.

In [1]:
import gym
import time
import itertools
import random
import numpy as np
env = gym.make('MountainCar-v0')

  deprecation(
  deprecation(


In [5]:
bins = (30, 30)

def discretize_observation(observation):
  """
  Discretizes the observation space.

  Args:
    observation: The observation to discretize.

  Returns:
    The discretized observation.
  """


  low = env.observation_space.low
  high = env.observation_space.high

  return tuple(np.digitize(observation[i], np.linspace(low[i], high[i], bins[i] + 1)) - 1 for i in range(2))

In [6]:
# TD(n) n-step SARSA on-policy using epsilon-greedy policy
def TD_n(n):
  alpha = 0.1 #step size for incremental averaging
  gamma = 1 #discount factor
  epsilon = 0.1

  # formaulate the state space with every combination of the discritsized elements of the states
  states = itertools.product(np.arange(bins[0]), np.arange(bins[1]))

  policy = {}
  q = {}



  for state in states:
    q[state] = np.zeros(env.action_space.n) # initialize the action-value function
    policy[state] = np.full(((env.action_space.n)),epsilon/env.action_space.n) # initialize the policy
    policy[state][np.random.randint(3)] += 1 - epsilon # initialize a random policy



  print("\n")
  print("TD(",n,") n-SARSA on-policy training for MountainCar-v0")
  episodes = 5e4
  start_timer = time.time()
  episode = 1

  while episode < episodes:
    episode_tracker = []

    # reset the environment
    obs = env.reset()
    state = discretize_observation(obs) # discretize the observation
    action = np.random.choice(np.arange(env.action_space.n), p = policy[state]) # select an action

    episode_tracker.append([0,state,action])
    T = float('inf') # terminal state

    t = 0 #initialize the time-step counter t

    while True:
      if t<T:

        obs, reward, done, info = env.step(action) # taking the action
        next_state = discretize_observation(obs) # next state

        episode_tracker.append([reward, next_state])

        if done:
          T = t+1
        else:
          action = np.random.choice(np.arange(env.action_space.n), p = policy[state]) # select an action
          episode_tracker[-1].append(action)

      tau = t - n + 1 # the time step whose state estimate is being updated (going n steps back)

      if tau >= 0:
        # compute the return
        G = sum([(gamma**(i-tau-1))*episode_tracker[i][0] for i in range(tau + 1, min(tau + n, T))]) 


        if tau + n < T:
          # compute the discounted return
          G += (gamma**n)*q[episode_tracker[tau + n][1]][episode_tracker[tau + n][2]] 
          
        s = episode_tracker[tau][1] # current state for update
        a = episode_tracker[tau][2] # current action for update

        q[s][a] += alpha * (G - q[s][a]) # update the action-value function

        policy[s] = np.full(((env.action_space.n)),epsilon/env.action_space.n) # update the policy
        best_action = np.argmax(q[s])
        policy[s][best_action] += 1-epsilon

      if tau == T-1:  # when the update time reaches terminal state, break
        break

      state = next_state  # update the state

      t += 1 # updating the time- step

    episode += 1
    if episode % 10000 == 0:
      end_timer = time.time()
      timer = end_timer - start_timer
      elapsed_time_struct = time.gmtime(timer)
      formatted_time = time.strftime("%H:%M:%S", elapsed_time_struct)
      print("Episode: ",episode, " time: ", formatted_time)

    if episode == episodes:
      print("done!")


  #test
  steps_to_solution = []

  for j in range(100):
    obs = env.reset()
    state = discretize_observation(obs)
    for i in range(1,50000):
      action = np.argmax(policy[state])
      # print("step",i, "reward: ", reward," state: ", state," action=",action)
      obs, reward, done, info = env.step(action)
      state = discretize_observation(obs)

      if done:
        #print ("done")
        steps_to_solution.append(i)
        break
  print(steps_to_solution)
  avg_step = np.mean(steps_to_solution)
  print("Average steps to solution per 100 episodes: ",avg_step)

  # ipythondisplay.clear_output(wait=True)
  env.close()
  return

In [None]:
TD_n(2)

  and should_run_async(code)
  if not isinstance(terminated, (bool, np.bool8)):




TD( 2 ) n-SARSA on-policy training for MountainCar-v0
Episode:  10000  time:  00:06:15
Episode:  20000  time:  00:11:59
Episode:  30000  time:  00:17:33
Episode:  40000  time:  00:23:00
Episode:  50000  time:  00:28:20
done!
[146, 186, 137, 184, 184, 185, 189, 137, 137, 145, 144, 146, 190, 144, 150, 137, 184, 137, 144, 183, 186, 146, 191, 137, 138, 189, 188, 186, 137, 184, 182, 146, 146, 185, 187, 190, 189, 137, 186, 182, 144, 188, 138, 146, 138, 137, 137, 189, 154, 185, 189, 188, 149, 188, 190, 182, 189, 150, 146, 189, 137, 182, 190, 190, 137, 136, 136, 188, 185, 185, 138, 148, 190, 143, 189, 137, 190, 190, 188, 191, 186, 184, 184, 190, 185, 136, 138, 136, 188, 184, 189, 137, 188, 137, 137, 189, 138, 184, 189, 139]
Average steps to solution per 100 episodes:  166.25


In [7]:
TD_n(3)



TD( 3 ) n-SARSA on-policy training for MountainCar-v0
Episode:  10000  time:  00:05:57
Episode:  20000  time:  00:11:19
Episode:  30000  time:  00:16:47
Episode:  40000  time:  00:22:12
Episode:  50000  time:  00:27:34
done!
[133, 134, 134, 133, 133, 133, 200, 133, 132, 200, 133, 134, 133, 200, 200, 132, 133, 133, 200, 200, 135, 133, 134, 133, 135, 131, 133, 132, 133, 134, 133, 134, 200, 133, 133, 133, 133, 134, 134, 133, 133, 134, 134, 131, 133, 134, 132, 133, 133, 134, 134, 132, 134, 200, 133, 132, 132, 134, 133, 134, 133, 133, 134, 200, 132, 132, 132, 132, 132, 135, 134, 133, 133, 135, 134, 132, 133, 200, 136, 133, 200, 132, 134, 133, 135, 200, 132, 132, 133, 200, 134, 135, 132, 134, 133, 133, 135, 133, 200, 133]
Average steps to solution per 100 episodes:  142.56


In [8]:
TD_n(4)



TD( 4 ) n-SARSA on-policy training for MountainCar-v0
Episode:  10000  time:  00:05:53
Episode:  20000  time:  00:11:29
Episode:  30000  time:  00:17:01
Episode:  40000  time:  00:22:32
Episode:  50000  time:  00:28:06
done!
[200, 200, 147, 200, 200, 200, 200, 200, 200, 200, 185, 200, 200, 200, 200, 200, 184, 200, 147, 200, 182, 200, 200, 200, 200, 200, 200, 200, 200, 184, 200, 200, 184, 185, 200, 200, 200, 185, 200, 200, 200, 200, 161, 200, 200, 184, 154, 200, 200, 184, 200, 184, 200, 200, 200, 200, 200, 200, 200, 184, 200, 200, 200, 200, 200, 200, 200, 183, 198, 200, 184, 200, 184, 200, 200, 148, 200, 200, 200, 200, 147, 200, 200, 184, 200, 200, 200, 147, 184, 200, 200, 183, 200, 185, 184, 200, 200, 200, 200, 200]
Average steps to solution per 100 episodes:  193.45
