In [1]:
# pip install gym-maze-trustycoder83

In [None]:
import os
import warnings
import time
import random
import numpy as np
# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings from gym/numpy — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Ask SDL for its "dummy" video driver; presumably so the maze environment can run
# without opening a window (headless kernel) — TODO confirm against gym_maze/pygame.
os.environ["SDL_VIDEODRIVER"] = "dummy"

In [None]:
import gym
# Imported for its side effect only: presumably registers the 'maze-*' environment ids
# with gym — TODO confirm against the gym_maze package.
import gym_maze

# Global environment instance; later cells step and reset this same object.
env = gym.make('maze-sample-5x5-v0')
# reset() returns the initial observation (printed below as a 2-component array).
obs = env.reset()

print(obs)

[0 0]


In [None]:
# action = 3

# Smoke test: take 10 random steps and print the observation after each one.
# random.randint(0,3) is inclusive on both ends, i.e. one of the 4 action indices.
# NOTE(review): `done` is ignored here, so the loop keeps stepping even if the episode ends.
for i in range(10):
  next_obs, reward, done, _ = env.step(random.randint(0,3))

  # env.render()
  # The pause only paces the (currently commented-out) rendering above.
  time.sleep(.5)
  print(next_obs)

[0 0]
[1 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[1 0]
[1 0]
[1 0]


# Interface

In [None]:
## initial_state = env.reset()
# initial_state - an initial state S_0 ~ P_0
# env.state = initial_state

## next_state, reward, done, info = env.step(action)
# action - the current action A_t
# next_state - the next state S_t+1 ~ P(S_t+1 | S_t, A_t)
# reward - the current reward R_t = R(S_t, A_t)
# done - whether S_t+1 belongs to the set of terminal states S_f (True/False)
# info - additional information
# env.state = next_state

In [None]:
# Number of discrete states used by the tabular agents; get_state() takes sqrt(state_n)
# as the grid side, so 25 corresponds to the 5x5 maze created above.
state_n = 25
# Number of discrete actions; action indices are 0 .. action_n - 1.
action_n = 4

class RandomAgent():
  """Baseline agent that ignores the state and draws a random action index."""

  def __init__(self, action_n):
    # Number of discrete actions; valid indices are 0 .. action_n - 1.
    self.action_n = action_n

  def get_action(self, state):
    """Return a random integer in [0, action_n - 1]; `state` is not used."""
    highest_action = self.action_n - 1
    return random.randint(0, highest_action)



class CEM():
  """Tabular cross-entropy-method agent.

  The policy is a (state_n, action_n) array; row `s` holds the probabilities
  of choosing each action in state `s`.
  """

  def __init__(self, state_n, action_n):
    """Store the table sizes and start from the uniform policy."""
    self.state_n = state_n
    self.action_n = action_n
    self.policy = np.ones((self.state_n, self.action_n)) / self.action_n

  def get_action(self, state):
    """Sample an action index from the policy row of `state` and return it as an int."""
    return int(np.random.choice(np.arange(self.action_n), p=self.policy[state]))

  def update_policy(self, elite_trajectories):
    """Refit the policy to the (state, action) frequencies of the elite trajectories.

    Each trajectory is a dict with parallel 'states' and 'actions' sequences.
    States that occur in at least one elite trajectory get the empirical
    action frequencies; states that never occur are reset to uniform.
    An empty `elite_trajectories` leaves the policy unchanged.
    """
    if len(elite_trajectories) == 0:
      return None

    # Count how often each action was taken in each state.
    pre_policy = np.zeros((self.state_n, self.action_n))
    for trajectory in elite_trajectories:
      for state, action in zip(trajectory['states'], trajectory['actions']):
        pre_policy[state][action] += 1

    # Normalise once, after all counts are in (not once per trajectory).
    for state in range(self.state_n):
      visits = pre_policy[state].sum()
      if visits == 0:
        self.policy[state] = np.ones(self.action_n) / self.action_n
      else:
        self.policy[state] = pre_policy[state] / visits
    return None


def get_state(obs):
  """Flatten a 2-component observation into a single integer state index.

  The index is obs[1] * sqrt(state_n) + obs[0], truncated to int. Reads the
  module-level `state_n`; assumes it is a perfect square (square grid).
  """
  side = np.sqrt(state_n)
  fast_index, slow_index = obs[0], obs[1]
  return int(slow_index * side + fast_index)

def get_trajectory(agent, trajectory_len):
  """Roll out one episode in the module-level `env` using `agent`.

  The environment is reset first; then up to `trajectory_len` steps are taken,
  stopping early when the environment reports `done`.

  Returns a dict with:
    'states'       - state index observed before each action,
    'actions'      - action chosen at each step,
    'total_reward' - sum of the rewards received.
  The state reached by the last step is not appended to 'states'.
  """
  visited_states = []
  chosen_actions = []
  reward_sum = 0

  state = get_state(env.reset())

  for _step in range(trajectory_len):
    visited_states.append(state)
    action = agent.get_action(state)
    chosen_actions.append(action)

    next_obs, reward, done, _info = env.step(action)
    state = get_state(next_obs)
    reward_sum += reward

    if done:
      break

  return {'states': visited_states, 'actions': chosen_actions, 'total_reward': reward_sum}



def get_elite_trajectories(trajectories, q_param):
  """Keep the trajectories whose total reward is strictly above the q_param-quantile.

  `trajectories` is a list of dicts with a 'total_reward' entry; `q_param` is the
  quantile level passed to np.quantile. The input order is preserved. Because the
  comparison is strict, the result is empty when all total rewards are equal.
  """
  threshold = np.quantile([item['total_reward'] for item in trajectories], q_param)

  elite = []
  for candidate in trajectories:
    if candidate['total_reward'] > threshold:
      elite.append(candidate)
  return elite



# Train the tabular CEM agent; use RandomAgent(action_n) instead for a random baseline.
agent = CEM(state_n, action_n)

# Hyperparameters.
episode_n = 50        # number of policy-update iterations
trajectory_n = 100    # trajectories sampled per iteration
trajectory_len = 100  # maximum number of steps per trajectory
q_param = 0.9         # quantile level used to select elite trajectories

for _ in range(episode_n):
  # Sample a batch with the current policy and report its mean total reward.
  trajectories = [get_trajectory(agent, trajectory_len) for _ in range(trajectory_n)]
  mean_total_reward = np.mean([sampled['total_reward'] for sampled in trajectories])
  print(mean_total_reward)

  # Refit the policy only when at least one trajectory beat the quantile threshold.
  elite_trajectories = get_elite_trajectories(trajectories, q_param)
  if elite_trajectories:
    agent.update_policy(elite_trajectories)





-0.3675600000000003
0.5219999999999998
0.8588
0.91644
0.9369199999999998
0.9361599999999999
0.93684
0.9366799999999997
0.9366
0.9373199999999997
0.9363199999999999
0.9364399999999997
0.9362799999999999
0.9364399999999997
0.9369999999999998
0.9364399999999999
0.9374399999999997
0.9370799999999998
0.9368000000000001
0.9369199999999998
0.9358000000000001
0.9365599999999997
0.9369999999999998
0.9363999999999999
0.9373999999999998
0.9372399999999999
0.9366399999999999
0.9369999999999998
0.9363600000000002
0.9371599999999998
0.9368799999999999
0.9362799999999999
0.9364
0.9365199999999999
0.9364799999999999
0.9371599999999998
0.9369200000000001
0.9369599999999998
0.93696
0.9367999999999997
0.9371599999999998
0.9372399999999999
0.9366399999999999
0.9367999999999997
0.9360400000000001
0.9365999999999999
0.93628
0.9376000000000002
0.93624
0.9371199999999998


In [None]:
# Sample one more episode with the trained agent and display it as the cell output.
# The policy is stochastic (actions are drawn with np.random.choice), so re-running
# this cell may show a different path.
trajectory = get_trajectory(agent, trajectory_len)
trajectory

{'states': [0, 1, 2, 3, 8, 7, 12, 11, 10, 15, 20, 21, 22, 17, 18, 23],
 'actions': [3, 3, 3, 1, 2, 1, 2, 2, 1, 1, 3, 3, 0, 3, 1, 3],
 'total_reward': 0.94}