# Use a Closed-Form Policy to Play Taxi-v3

In [1]:
import sys
import logging
import itertools

import numpy as np
np.random.seed(0)
import gym

# Route INFO-and-above records to stdout with an HH:MM:SS timestamp prefix.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%H:%M:%S',
)

In [2]:
# Create the environment, then log every attribute of its spec followed by
# every attribute of the unwrapped environment object.
env = gym.make('Taxi-v3', new_step_api=True)
for source in (env.spec, env.unwrapped):
    for key, value in vars(source).items():
        logging.info('%s: %s', key, value)

00:00:00 [INFO] id: Taxi-v3
00:00:00 [INFO] entry_point: gym.envs.toy_text:TaxiEnv
00:00:00 [INFO] reward_threshold: 8
00:00:00 [INFO] nondeterministic: False
00:00:00 [INFO] max_episode_steps: 200
00:00:00 [INFO] order_enforce: True
00:00:00 [INFO] _kwargs: {}
00:00:00 [INFO] _env_name: Taxi
00:00:00 [INFO] desc: [[b'+' b'-' b'-' b'-' b'-' b'-' b'-' b'-' b'-' b'-' b'+']
 [b'|' b'R' b':' b' ' b'|' b' ' b':' b' ' b':' b'G' b'|']
 [b'|' b' ' b':' b' ' b'|' b' ' b':' b' ' b':' b' ' b'|']
 [b'|' b' ' b':' b' ' b':' b' ' b':' b' ' b':' b' ' b'|']
 [b'|' b' ' b'|' b' ' b':' b' ' b'|' b' ' b':' b' ' b'|']
 [b'|' b'Y' b'|' b' ' b':' b' ' b'|' b'B' b':' b' ' b'|']
 [b'+' b'-' b'-' b'-' b'-' b'-' b'-' b'-' b'-' b'-' b'+']]
00:00:00 [INFO] locs: [(0, 0), (0, 4), (4, 0), (4, 3)]
00:00:00 [INFO] P: {0: {0: [(1.0, 100, -1, False)], 1: [(1.0, 0, -1, False)], 2: [(1.0, 20, -1, False)], 3: [(1.0, 0, -1, False)], 4: [(1.0, 16, -1, False)], 5: [(1.0, 0, -10, False)]}, 1: {0: [(1.0, 101, -1, False)], 1: [

00:00:00 [INFO] isd: [0.         0.00333333 0.00333333 0.00333333 0.00333333 0.
 0.00333333 0.00333333 0.00333333 0.00333333 0.         0.00333333
 0.00333333 0.00333333 0.00333333 0.         0.         0.
 0.         0.         0.         0.00333333 0.00333333 0.00333333
 0.00333333 0.         0.00333333 0.00333333 0.00333333 0.00333333
 0.         0.00333333 0.00333333 0.00333333 0.00333333 0.
 0.         0.         0.         0.         0.         0.00333333
 0.00333333 0.00333333 0.00333333 0.         0.00333333 0.00333333
 0.00333333 0.00333333 0.         0.00333333 0.00333333 0.00333333
 0.00333333 0.         0.         0.         0.         0.
 0.         0.00333333 0.00333333 0.00333333 0.00333333 0.
 0.00333333 0.00333333 0.00333333 0.00333333 0.         0.00333333
 0.00333333 0.00333333 0.00333333 0.         0.         0.
 0.         0.         0.         0.00333333 0.00333333 0.00333333
 0.00333333 0.         0.00333333 0.00333333 0.00333333 0.00333333
 0.         0.00333333

In [3]:
class ClosedFormAgent:
    """Agent that follows a policy planned directly from the env's model.

    The constructor runs backward induction over the transition table
    ``env.P`` for ``env.spec.max_episode_steps`` time steps and stores the
    greedy action for every (time step, state) pair. Acting is then a plain
    table lookup.
    """

    def __init__(self, env):
        """Compute the time-dependent greedy policy table.

        Args:
            env: Environment exposing ``observation_space.n``,
                ``action_space.n``, ``spec.max_episode_steps`` and a
                transition table ``P`` where ``P[s][a]`` iterates over
                ``(probability, next_state, reward, done)`` tuples.
        """
        state_n, action_n = env.observation_space.n, env.action_space.n
        horizon = env.spec.max_episode_steps
        # One extra time slice: v[horizon] stays zero (nothing is collected
        # past the step limit) and pi[horizon] exists so that step() can
        # still be called once after the final transition.
        v = np.zeros((horizon + 1, state_n))
        q = np.zeros((horizon + 1, state_n, action_n))
        # Integer dtype so that looked-up actions are valid discrete indices
        # rather than floats.
        pi = np.zeros((horizon + 1, state_n), dtype=int)
        # Backward induction: values at time t depend only on time t+1.
        for t in range(horizon - 1, -1, -1):
            for s in range(state_n):
                for a in range(action_n):
                    for p, next_s, r, d in env.P[s][a]:
                        # No future value is added once the transition
                        # is marked done.
                        q[t, s, a] += p * (r + (1. - float(d)) * v[t+1, next_s])
                v[t, s] = q[t, s].max()
                pi[t, s] = q[t, s].argmax()
        self.pi = pi
        # Start at time 0 so step() is usable even before reset().
        self.t = 0

    def reset(self, mode=None):
        """Restart the internal time-step counter; ``mode`` is unused."""
        self.t = 0

    def step(self, observation, reward, termination):
        """Return the planned action for the current time step and state.

        Args:
            observation: Index of the current state.
            reward: Unused; accepted for interface compatibility.
            termination: Unused; accepted for interface compatibility.

        Returns:
            The action index as a Python ``int``.
        """
        action = int(self.pi[self.t, observation])
        self.t += 1
        return action

    def close(self):
        """No-op; the agent holds nothing that needs releasing."""
        pass


# Build the agent; its constructor computes the policy table from env.P.
agent = ClosedFormAgent(env)

In [4]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run a single episode of ``agent`` acting in ``env``.

    The agent is queried once more after the last transition, so it also
    receives the final observation, reward and termination flag, before
    the loop exits. ``agent.close()`` is called when the episode is over.

    Args:
        env: Environment; ``reset(seed=...)`` must return the observation
            and ``step(action)`` a 5-tuple ``(observation, reward,
            termination, truncation, info)``.
        agent: Object providing ``reset(mode=...)``, ``step(observation,
            reward, termination)`` and ``close()``.
        seed: Forwarded to ``env.reset``.
        mode: Forwarded to ``agent.reset``.
        render: When true, ``env.render()`` is called after every agent
            query.

    Returns:
        Tuple ``(episode_reward, elapsed_steps)``: the summed reward and
        the number of environment steps taken.
    """
    observation = env.reset(seed=seed)
    reward = 0.
    termination = truncation = False
    agent.reset(mode=mode)
    episode_reward = 0.
    # The counter equals the number of env.step calls completed so far.
    for elapsed_steps in itertools.count():
        action = agent.step(observation, reward, termination)
        if render:
            env.render()
        if termination or truncation:
            break
        observation, reward, termination, truncation, _ = env.step(action)
        episode_reward += reward
    agent.close()
    return episode_reward, elapsed_steps


# Evaluate the agent: play 100 episodes, log each episode's reward and
# length, then log the mean and standard deviation of the episode rewards.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    # No seed is passed, so play_episode resets the env with seed=None.
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))

00:00:01 [INFO] ==== test ====
00:00:01 [INFO] test episode 0: reward = 4.00, steps = 17
00:00:01 [INFO] test episode 1: reward = 10.00, steps = 11
00:00:01 [INFO] test episode 2: reward = 10.00, steps = 11
00:00:01 [INFO] test episode 3: reward = 11.00, steps = 10
00:00:01 [INFO] test episode 4: reward = 8.00, steps = 13
00:00:01 [INFO] test episode 5: reward = 9.00, steps = 12
00:00:01 [INFO] test episode 6: reward = 11.00, steps = 10
00:00:01 [INFO] test episode 7: reward = 6.00, steps = 15
00:00:01 [INFO] test episode 8: reward = 9.00, steps = 12
00:00:01 [INFO] test episode 9: reward = 13.00, steps = 8
00:00:01 [INFO] test episode 10: reward = 11.00, steps = 10
00:00:01 [INFO] test episode 11: reward = 9.00, steps = 12
00:00:01 [INFO] test episode 12: reward = 4.00, steps = 17
00:00:01 [INFO] test episode 13: reward = 8.00, steps = 13
00:00:01 [INFO] test episode 14: reward = 13.00, steps = 8
00:00:01 [INFO] test episode 15: reward = 4.00, steps = 17
00:00:01 [INFO] test episode 1

In [5]:
# Close the environment now that the test episodes are finished.
env.close()