# Use Closed-Form Policy to Play FrozenLake-v1

In [1]:
import sys
import logging
import itertools

import numpy as np
np.random.seed(0)
import gym

# Send INFO-and-above records to stdout, prefixed with an HH:MM:SS timestamp
# and the level name.
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')

In [2]:
env = gym.make('FrozenLake-v1')
# Log every attribute of the registration spec first, then every attribute of
# the unwrapped environment object, one "name: value" record per attribute.
for source in (env.spec, env.unwrapped):
    for key, value in vars(source).items():
        logging.info('%s: %s', key, value)

00:00:00 [INFO] id: FrozenLake-v1
00:00:00 [INFO] entry_point: gym.envs.toy_text:FrozenLakeEnv
00:00:00 [INFO] reward_threshold: 0.7
00:00:00 [INFO] nondeterministic: False
00:00:00 [INFO] max_episode_steps: 100
00:00:00 [INFO] order_enforce: True
00:00:00 [INFO] _kwargs: {'map_name': '4x4'}
00:00:00 [INFO] _env_name: FrozenLake
00:00:00 [INFO] desc: [[b'S' b'F' b'F' b'F']
 [b'F' b'H' b'F' b'H']
 [b'F' b'F' b'F' b'H']
 [b'H' b'F' b'F' b'G']]
00:00:00 [INFO] nrow: 4
00:00:00 [INFO] ncol: 4
00:00:00 [INFO] reward_range: (0, 1)
00:00:00 [INFO] P: {0: {0: [(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 4, 0.0, False)], 1: [(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 4, 0.0, False), (0.3333333333333333, 1, 0.0, False)], 2: [(0.3333333333333333, 4, 0.0, False), (0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False)], 3: [(0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333

In [3]:
class ClosedFormAgent:
    """Agent that acts on a time-dependent policy solved by backward induction.

    The constructor reads the environment's tabular transition model and
    computes, for every time step ``t`` up to the episode step limit and every
    state ``s``, the action maximizing the expected undiscounted return of the
    remaining steps. The resulting table is stored in ``self.pi`` with shape
    ``(max_episode_steps + 1, state_n)``.

    Args:
        env: environment exposing ``observation_space.n``, ``action_space.n``,
            ``spec.max_episode_steps`` and a transition model ``P`` where
            ``P[s][a]`` is an iterable of
            ``(probability, next_state, reward, done)`` tuples.
    """

    def __init__(self, env):
        state_n, action_n = env.observation_space.n, env.action_space.n
        horizon = env.spec.max_episode_steps
        # The model lives on the underlying environment; go through
        # ``unwrapped`` when it exists instead of relying on wrappers to
        # forward the attribute.
        model = getattr(env, 'unwrapped', env).P

        # Row ``horizon`` stays all-zero: no reward can be collected after the
        # step limit, which is the boundary condition of the recursion.
        v = np.zeros((horizon + 1, state_n))
        q = np.zeros((horizon + 1, state_n, action_n))
        # Integer dtype so the stored actions are valid discrete actions
        # rather than floats such as 2.0.
        pi = np.zeros((horizon + 1, state_n), dtype=int)

        for t in range(horizon - 1, -1, -1):
            for s in range(state_n):
                for a in range(action_n):
                    for p, next_s, r, d in model[s][a]:
                        # A transition flagged as done contributes only its
                        # immediate reward, no future value.
                        q[t, s, a] += p * (r + (1. - float(d)) * v[t+1, next_s])
                v[t, s] = q[t, s].max()
                pi[t, s] = q[t, s].argmax()
        self.pi = pi
        # Defined here so that step() also works before the first reset().
        self.t = 0

    def reset(self, mode=None):
        """Rewind the internal step counter; ``mode`` is accepted but unused."""
        self.t = 0

    def step(self, observation, reward, terminated):
        """Return the action for ``observation`` at the current time step.

        ``reward`` and ``terminated`` are accepted but not used. The internal
        step counter advances by one on every call.

        Returns:
            The action as a plain Python ``int``.
        """
        action = int(self.pi[self.t, observation])
        self.t += 1
        return action

    def close(self):
        """Do nothing; the agent holds no resources."""
        pass


# Build the agent once; its constructor computes the whole policy table.
agent = ClosedFormAgent(env)

In [4]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of ``agent`` in ``env`` and report its outcome.

    The agent is queried once more after the final transition, so it also
    observes the last observation, reward and termination flag before
    ``agent.close()`` is called.

    Args:
        env: environment whose ``reset(seed=...)`` returns
            ``(observation, info)`` and whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        agent: object providing ``reset(mode=...)``,
            ``step(observation, reward, terminated)`` and ``close()``.
        seed: forwarded to ``env.reset``.
        mode: forwarded to ``agent.reset``.
        render: when true, ``env.render()`` is called after every agent step.

    Returns:
        Tuple ``(episode_reward, elapsed_steps)``: the summed reward and the
        number of environment steps taken.
    """
    obs, _ = env.reset(seed=seed)
    last_reward = 0.
    terminated = truncated = False
    agent.reset(mode=mode)
    total_reward = 0.
    # The counter equals the number of env.step calls made so far, so its
    # value when the loop breaks is the episode length.
    for step_count in itertools.count():
        action = agent.step(obs, last_reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        obs, last_reward, terminated, truncated, _ = env.step(action)
        total_reward += last_reward
    agent.close()
    return total_reward, step_count


# Evaluate the agent: play 100 episodes with the default arguments (no seed,
# no mode, no rendering), log each episode's reward and length, then log the
# mean and standard deviation of the episode rewards.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))

00:00:00 [INFO] ==== test ====
00:00:00 [INFO] test episode 0: reward = 1.00, steps = 14
00:00:00 [INFO] test episode 1: reward = 1.00, steps = 11
00:00:00 [INFO] test episode 2: reward = 1.00, steps = 31
00:00:00 [INFO] test episode 3: reward = 0.00, steps = 14
00:00:00 [INFO] test episode 4: reward = 0.00, steps = 9
00:00:00 [INFO] test episode 5: reward = 1.00, steps = 93
00:00:00 [INFO] test episode 6: reward = 1.00, steps = 100
00:00:00 [INFO] test episode 7: reward = 1.00, steps = 34
00:00:00 [INFO] test episode 8: reward = 1.00, steps = 86
00:00:00 [INFO] test episode 9: reward = 1.00, steps = 26
00:00:00 [INFO] test episode 10: reward = 0.00, steps = 16
00:00:00 [INFO] test episode 11: reward = 1.00, steps = 57
00:00:00 [INFO] test episode 12: reward = 0.00, steps = 98
00:00:00 [INFO] test episode 13: reward = 0.00, steps = 10
00:00:00 [INFO] test episode 14: reward = 0.00, steps = 100
00:00:00 [INFO] test episode 15: reward = 1.00, steps = 7
00:00:00 [INFO] test episode 16: re

In [5]:
# Close the environment now that evaluation is finished.
env.close()