Assignment 3 - Exercise 2

TD(n) n-step off-policy tree backup control

In [15]:
import gym
import time
import itertools
import random
import numpy as np
# Shared MountainCar-v0 environment; the discretisation, training and evaluation cells below all use it.
env = gym.make('MountainCar-v0')

In [19]:
bins = (30, 30)  # number of bins per observation component

def discretize_observation(observation, low=None, high=None, n_bins=None):
  """
  Discretizes a continuous observation into a tuple of bin indices.

  Each of the first two observation components is mapped onto one of
  ``n_bins[i]`` equal-width bins spanning ``[low[i], high[i]]``. The resulting
  indices are clamped to ``[0, n_bins[i] - 1]``: without the clamp a component
  sitting exactly on the upper bound would get index ``n_bins[i]`` and a
  component below the lower bound would get ``-1``, neither of which is a
  valid key of the tabular value function / policies.

  Args:
    observation: Indexable with at least two numeric components.
    low: Per-component lower bounds. Defaults to ``env.observation_space.low``.
    high: Per-component upper bounds. Defaults to ``env.observation_space.high``.
    n_bins: Per-component number of bins. Defaults to the module-level ``bins``.

  Returns:
    A tuple of two Python ints, each in ``range(n_bins[i])``.
  """
  # Resolve defaults lazily so the function still follows the global env/bins.
  if low is None:
    low = env.observation_space.low
  if high is None:
    high = env.observation_space.high
  if n_bins is None:
    n_bins = bins

  indices = []
  for i in range(2):
    # n_bins[i] + 1 edges delimit n_bins[i] equal-width intervals.
    edges = np.linspace(low[i], high[i], n_bins[i] + 1)
    raw_index = np.digitize(observation[i], edges) - 1
    # Clamp so boundary / out-of-range values still land in a real bin.
    indices.append(int(np.clip(raw_index, 0, n_bins[i] - 1)))
  return tuple(indices)

In [20]:
# TD(n) n-step tree backup control (off-policy)
#
# Learns tabular action values q over the discretized state space while acting
# with a behaviour policy b and evaluating/improving a separate epsilon-greedy
# target policy `policy`.

alpha = 0.1 # step size for incremental averaging
gamma = 1 # discount factor
n = 2 # number of steps
epsilon = 0.1

# formulate the state space as every combination of the discretized state components
states = itertools.product(np.arange(bins[0]), np.arange(bins[1]))

policy = {}  # target policy: state -> array of action probabilities
q = {}       # action-value estimates: state -> array of q-values
b = {}       # behaviour policy: state -> array of action probabilities


for state in states:
  q[state] = np.zeros(env.action_space.n) # initialize the action-value function
  policy[state] = np.full(((env.action_space.n)),epsilon/env.action_space.n) # epsilon spread uniformly
  policy[state][np.random.randint(0,env.action_space.n)] += 1-epsilon # remaining mass on a random action

  b[state] = np.full(((env.action_space.n)),epsilon/env.action_space.n) # epsilon spread uniformly
  b[state][np.random.randint(0,env.action_space.n)] += 1-epsilon # remaining mass on a random action




episodes = int(5e4)
start_timer = time.time()
# Counts completed episodes. Starting from 0 makes the loop run exactly
# `episodes` episodes (starting from 1 would run one fewer).
episode = 0

while episode < episodes:
  # episode_tracker[k] holds [R_k, S_k, A_k]; R_0 is a 0 placeholder and the
  # terminal entry has no action.
  episode_tracker = []

  # reset the environment
  obs = env.reset()
  state = discretize_observation(obs) # discretize the observation

  action = np.random.choice(np.arange(env.action_space.n), p = b[state])

  episode_tracker.append([0,state,action])
  T = float('inf') # time step of the terminal state (unknown until the episode ends)

  t = 0 #initialize the time-step counter t

  while True:

    if t<T:

      obs, reward, done, info = env.step(action) # taking the action
      next_state = discretize_observation(obs) # next state

      episode_tracker.append([reward, next_state]) # append the reward and next state to the episode tracker

      if done:
        T = t+1
      else:
        # A_{t+1} must be drawn from the behaviour policy at S_{t+1};
        # `state` still refers to S_t at this point.
        action = np.random.choice(np.arange(env.action_space.n), p = b[next_state])
        episode_tracker[-1].append(action)  # append the action to the episode tracker

    # the time step whose state-action estimate is being updated
    # (can be thought of as going n steps back from the current state at t+1)
    tau = t - n + 1

    if tau >= 0:

      if t+1 >= T:
        G = episode_tracker[T][0] # terminal state reward

      else:
        # bootstrap from the expected q-value of S_{t+1} under the target policy
        G =  episode_tracker[t+1][0] + gamma * sum(policy[episode_tracker[t+1][1]][i] *
                                                   q[episode_tracker[t+1][1]][i]
                                                   for i in range(env.action_space.n))

      # walk back through the tree: actions not taken contribute their current
      # q-estimates, the taken action contributes the return built so far
      for k in reversed(range(tau+1, min(t,T-1)+1)):
        r_k = episode_tracker[k][0]
        s_k = episode_tracker[k][1]
        a_k = episode_tracker[k][2]

        G = r_k + (gamma * sum(policy[s_k][i] * q[s_k][i] for i in range(env.action_space.n)
                               if i != a_k)) + (gamma * policy[s_k][a_k] * G)

      # updating the state-action value function
      s = episode_tracker[tau][1]
      a = episode_tracker[tau][2]

      q[s][a] += alpha * (G - q[s][a])


      # updating the target policy (epsilon-greedy w.r.t. q)
      policy[s] = np.full(((env.action_space.n)),epsilon/env.action_space.n)
      best_action = np.argmax(q[s])
      policy[s][best_action] += 1-epsilon


      # updating the behaviour policy (greedy w.r.t. q, but exploring with
      # probability 5*epsilon instead of epsilon)
      b[s] = np.full(((env.action_space.n)),(5*epsilon)/env.action_space.n)
      best_action = np.argmax(q[s])
      b[s][best_action] += 1- (5*epsilon)

    if tau == T-1:  # when the update time reaches terminal state, break
      break

    state = next_state  # update the state

    t += 1 # updating the time-step

  episode += 1
  if episode % 10000 == 0:
    end_timer = time.time()
    timer = end_timer - start_timer
    elapsed_time_struct = time.gmtime(timer)
    formatted_time = time.strftime("%H:%M:%S", elapsed_time_struct)
    print("Episode: ",episode, " time: ", formatted_time)

  if episode == episodes:
    print("done!")

Episode:  10000  time:  00:07:26
Episode:  20000  time:  00:14:50
Episode:  30000  time:  00:22:15
Episode:  40000  time:  00:29:40
Episode:  50000  time:  00:37:06
done!


In [21]:
  # Evaluation: follow the most probable action of the learned target policy
  # for 100 episodes and record how many steps each episode lasted.
  steps_to_solution = []

  for trial in range(100):
    obs = env.reset()
    state = discretize_observation(obs)
    for step_count in range(1,50000):
      action = policy[state].argmax()
      obs, reward, done, info = env.step(action)
      state = discretize_observation(obs)

      if not done:
        continue
      # episode finished: store its length and move on to the next trial
      steps_to_solution.append(step_count)
      break

  print(steps_to_solution)
  avg_step = np.mean(steps_to_solution)
  print("Average steps to solution per 100 episodes: ",avg_step)

  env.close()

[125, 162, 166, 166, 128, 164, 129, 128, 163, 127, 166, 125, 162, 163, 160, 128, 161, 164, 161, 164, 162, 163, 131, 126, 129, 164, 161, 165, 161, 129, 163, 123, 163, 161, 165, 127, 162, 186, 162, 128, 128, 160, 161, 127, 127, 186, 166, 129, 172, 132, 163, 162, 119, 128, 129, 129, 128, 129, 127, 178, 174, 130, 160, 160, 127, 162, 129, 160, 126, 132, 126, 128, 129, 128, 128, 164, 186, 129, 163, 160, 164, 126, 163, 128, 130, 126, 129, 127, 129, 162, 129, 186, 164, 130, 124, 165, 160, 128, 128, 129]
Average steps to solution per 100 episodes:  146.51
