@author: Gediyon M. Girma

on-policy expected SARSA control for MountainCar environment

In [7]:
import gym
import time
import itertools
import random
import numpy as np
env = gym.make('MountainCar-v0')

In [8]:
# discretization of the observation space

bins = (30, 30)

def discretize_observation(observation):
  """
  Discretizes the observation space.

  Args:
    observation: The observation to discretize.

  Returns:
    The discretized observation.
  """

  low = env.observation_space.low
  high = env.observation_space.high

  return tuple(np.digitize(observation[i], np.linspace(low[i], high[i], bins[i] + 1)) - 1 for i in range(2))

In [18]:
# the on-policy SARSA
alpha = 0.1
gamma = 1
epsilon = 0.1

# formaulate the state space with every combination of the discritsized elements of the states
states = itertools.product(np.arange(bins[0]), np.arange(bins[1]))

q={}
policy = {}


for state in states:
  q[state] = np.zeros((env.action_space.n)) # initialize the action-value function

  # initializing the epsilon-greedy policy
  policy[state] = np.full(((env.action_space.n)),epsilon/env.action_space.n)
  policy[state][np.random.randint(3)] += 1-epsilon



In [22]:
 # training
episodes = 5e4
start_timer = time.time()
episode = 1
while episode < episodes:

  # reset the environment
  obs = env.reset()
  state = discretize_observation(obs) # discretize the observation

  for t in range(1, 500):
    action = np.random.choice(np.arange(env.action_space.n), p = policy[state]) # select an action
    obs, reward, done, info = env.step(action)
    next_state = discretize_observation(obs) # next state
    next_action = np.random.choice(np.arange(env.action_space.n), p = policy[state]) # next action

    Expected_value = sum([policy[next_state][action]*q[next_state][next_action]
                          for action in range(env.action_space.n)])
    q[state][action] += alpha * (reward + gamma * Expected_value - q[state][action]) # update the action-value function

    # update the policy based on the action-value function
    best_action = np.argmax(q[state]) # selecting the best action based on the action-value function
    policy[state] = np.full(((env.action_space.n)),epsilon/env.action_space.n) # resetting the policy for the given state
    policy[state][best_action] += 1 - epsilon # setting the probability of the best action to 1

    state = next_state
    action = next_action

    if done:
      break


  episode += 1
  if episode % 1000 == 0:
    end_timer = time.time()
    timer = end_timer - start_timer
    elapsed_time_struct = time.gmtime(timer)
    formatted_time = time.strftime("%H:%M:%S", elapsed_time_struct)
    print("Episode: ",episode, " time: ", formatted_time)

  if episode == episodes:
    print("done!")

Episode:  1000  time:  00:00:21
Episode:  2000  time:  00:00:45
Episode:  3000  time:  00:01:06
Episode:  4000  time:  00:01:28
Episode:  5000  time:  00:01:50
Episode:  6000  time:  00:02:11
Episode:  7000  time:  00:02:36
Episode:  8000  time:  00:02:58
Episode:  9000  time:  00:03:19
Episode:  10000  time:  00:03:40
Episode:  11000  time:  00:04:04
Episode:  12000  time:  00:04:29
Episode:  13000  time:  00:04:51
Episode:  14000  time:  00:05:11
Episode:  15000  time:  00:05:32
Episode:  16000  time:  00:05:55
Episode:  17000  time:  00:06:16
Episode:  18000  time:  00:06:38
Episode:  19000  time:  00:07:00
Episode:  20000  time:  00:07:22
Episode:  21000  time:  00:07:42
Episode:  22000  time:  00:08:03
Episode:  23000  time:  00:08:24
Episode:  24000  time:  00:08:45
Episode:  25000  time:  00:09:06
Episode:  26000  time:  00:09:27
Episode:  27000  time:  00:09:47
Episode:  28000  time:  00:10:08
Episode:  29000  time:  00:10:29
Episode:  30000  time:  00:10:50
Episode:  31000  ti

In [23]:
#test
steps_to_solution = []

for j in range(100):
  obs = env.reset()
  state = discretize_observation(obs)
  for i in range(1,50000):
    action = np.argmax(policy[state])
    # print("step",i, "reward: ", reward," state: ", state," action=",action)
    obs, reward, done, info = env.step(action)
    state = discretize_observation(obs)

    if done:
      #print ("done")
      steps_to_solution.append(i)
      break
print(steps_to_solution)
avg_step = np.mean(steps_to_solution)
print("Average steps to solution: ",avg_step)

# ipythondisplay.clear_output(wait=True)
env.close()

[107, 125, 108, 155, 131, 107, 148, 137, 145, 146, 107, 131, 108, 149, 148, 149, 107, 131, 130, 108, 149, 153, 141, 127, 129, 131, 147, 107, 149, 131, 148, 127, 128, 147, 155, 148, 148, 130, 152, 148, 149, 129, 131, 127, 129, 141, 126, 132, 107, 129, 131, 150, 107, 128, 107, 153, 152, 125, 107, 147, 131, 130, 107, 129, 146, 129, 108, 129, 107, 127, 146, 149, 107, 107, 107, 107, 107, 142, 148, 153, 132, 151, 147, 154, 128, 155, 107, 107, 149, 149, 127, 108, 128, 148, 132, 150, 150, 107, 130, 144]
Average steps to solution:  131.98


In [27]:
# test
obs = env.reset()
state = discretize_observation(obs)
reward = 0
for i in range(50000):
  action = np.argmax(policy[state])
  print("step",i, "reward: ", reward," state: ", state," action=",action)
  obs, reward, done, info = env.step(action)
  state = discretize_observation(obs)

  if done:
    print ("done")
    break

# ipythondisplay.clear_output(wait=True)
env.close()
print("Iterations that were run:",i)

step 0 reward:  0  state:  (12, 15)  action= 0
step 1 reward:  -1.0  state:  (12, 14)  action= 0
step 2 reward:  -1.0  state:  (12, 14)  action= 0
step 3 reward:  -1.0  state:  (12, 14)  action= 0
step 4 reward:  -1.0  state:  (12, 13)  action= 0
step 5 reward:  -1.0  state:  (11, 13)  action= 0
step 6 reward:  -1.0  state:  (11, 13)  action= 0
step 7 reward:  -1.0  state:  (11, 12)  action= 0
step 8 reward:  -1.0  state:  (11, 12)  action= 0
step 9 reward:  -1.0  state:  (11, 12)  action= 0
step 10 reward:  -1.0  state:  (11, 12)  action= 0
step 11 reward:  -1.0  state:  (10, 12)  action= 0
step 12 reward:  -1.0  state:  (10, 11)  action= 0
step 13 reward:  -1.0  state:  (10, 11)  action= 0
step 14 reward:  -1.0  state:  (10, 11)  action= 0
step 15 reward:  -1.0  state:  (9, 11)  action= 0
step 16 reward:  -1.0  state:  (9, 11)  action= 0
step 17 reward:  -1.0  state:  (9, 11)  action= 0
step 18 reward:  -1.0  state:  (8, 11)  action= 0
step 19 reward:  -1.0  state:  (8, 11)  action= 