# Use Closed-Form Policy to Play BipedalWalker-v3

In [1]:
import sys
import logging
import itertools

import numpy as np
np.random.seed(0)
import gym

# Send records at INFO level and above to stdout; each line carries an
# hours:minutes:seconds timestamp followed by the level name in brackets.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format='%(asctime)s [%(levelname)s] %(message)s',
)

In [2]:
# Create the environment (new_step_api=True; play_episode unpacks five values
# from env.step), then log every attribute of the env object and of its spec.
env = gym.make('BipedalWalker-v3', new_step_api=True)
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)

00:00:17 [INFO] env: <BipedalWalker<BipedalWalker-v3>>
00:00:17 [INFO] action_space: Box(-1.0, 1.0, (4,), float32)
00:00:17 [INFO] observation_space: Box(-inf, inf, (24,), float32)
00:00:17 [INFO] reward_range: (-inf, inf)
00:00:17 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 50}
00:00:17 [INFO] _max_episode_steps: 1600
00:00:17 [INFO] _elapsed_steps: None
00:00:17 [INFO] id: BipedalWalker-v3
00:00:17 [INFO] entry_point: gym.envs.box2d:BipedalWalker
00:00:17 [INFO] reward_threshold: 300
00:00:17 [INFO] nondeterministic: False
00:00:17 [INFO] max_episode_steps: 1600
00:00:17 [INFO] _kwargs: {}
00:00:17 [INFO] _env_name: BipedalWalker


In [3]:
class ClosedFormAgent:
    """Fixed linear policy: ``action = observation . weights + bias``.

    The weight matrix has 24 rows and 4 columns and the bias has 4 entries,
    so a length-24 observation produces a length-4 action. All parameters
    are hard-coded; nothing is learned or modified while playing.
    """

    def __init__(self, env):
        """Store the constant bias and weights.

        ``env`` is accepted for interface compatibility but is not read.
        """
        self.bias = np.array([3.2, 6.1, -4.0, 7.6])
        # One row per observation component, one column per action component.
        self.weights = np.array([
            [ 0.9, -0.7,  0.0, -1.4],
            [ 4.3, -1.6, -4.4, -2.0],
            [ 2.4, -4.2, -1.3, -0.1],
            [-3.1, -5.0, -2.0, -3.3],
            [-0.8,  1.4,  1.7,  0.2],
            [-0.7,  0.2, -0.2,  0.1],
            [-0.6, -1.5, -0.6,  0.3],
            [-0.5, -0.3,  0.2,  0.1],
            [ 0.0, -0.1, -0.1,  0.1],
            [ 0.4,  0.8, -1.6, -0.5],
            [-0.4,  0.5, -0.3, -0.4],
            [ 0.3,  2.0,  0.9, -1.6],
            [ 0.0, -0.2,  0.1, -0.3],
            [ 0.1,  0.2, -0.5, -0.3],
            [ 0.7,  0.3,  5.1, -2.4],
            [-0.4, -2.3,  0.3, -4.0],
            [ 0.1, -0.8,  0.3,  2.5],
            [ 0.4, -0.9, -1.8,  0.3],
            [-3.9, -3.5,  2.8,  0.8],
            [ 0.4, -2.8,  0.4,  1.4],
            [-2.2, -2.1, -2.2, -3.2],
            [-2.7, -2.6,  0.3,  0.6],
            [ 2.0,  2.8,  0.0, -0.9],
            [-2.2,  0.6,  4.7, -4.6],
        ])

    def reset(self, mode=None):
        """Do nothing; the policy keeps no per-episode state."""
        return None

    def step(self, observation, reward, termination):
        """Return the action for ``observation``.

        ``reward`` and ``termination`` are part of the call signature but do
        not influence the result.
        """
        linear_term = np.matmul(observation, self.weights)
        return linear_term + self.bias

    def close(self):
        """Do nothing; there are no resources to release."""
        return None


# Build the policy object; ClosedFormAgent.__init__ takes env but does not
# read it, since all weights are hard-coded.
agent = ClosedFormAgent(env)

In [4]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of ``agent`` in ``env`` and report the outcome.

    The env is reset with ``seed`` and the agent with ``mode``. On every
    iteration the agent is asked for an action first (so it also sees the
    final observation, reward and termination flag), the env is rendered if
    ``render`` is true, and the env is stepped only while the episode is
    neither terminated nor truncated. ``agent.close()`` is called at the end.

    Returns:
        A ``(episode_reward, elapsed_steps)`` tuple: the sum of the rewards
        returned by ``env.step`` and the number of ``env.step`` calls.
    """
    observation = env.reset(seed=seed)
    reward = 0.
    termination = truncation = False
    agent.reset(mode=mode)

    total_reward = 0.
    # The loop index equals the number of env.step calls made so far, so its
    # value when the loop breaks is the episode length.
    for step_count in itertools.count():
        action = agent.step(observation, reward, termination)
        if render:
            env.render()
        if termination or truncation:
            break
        observation, reward, termination, truncation, _ = env.step(action)
        total_reward += reward

    agent.close()
    return total_reward, step_count


# Evaluate the agent: play 100 episodes, log each episode's total reward and
# step count, then log the mean and standard deviation of the total rewards.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    # No seed/mode/render arguments are passed, so play_episode uses its
    # defaults (seed=None, mode=None, render=False).
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
# Summary over all episodes, reported as "mean ± std".
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))

00:00:17 [INFO] ==== test ====
00:00:18 [INFO] test episode 0: reward = 311.54, steps = 1238
00:00:18 [INFO] test episode 1: reward = 311.80, steps = 1246
00:00:19 [INFO] test episode 2: reward = 311.68, steps = 1297
00:00:20 [INFO] test episode 3: reward = 311.93, steps = 1257
00:00:20 [INFO] test episode 4: reward = 312.64, steps = 1279
00:00:21 [INFO] test episode 5: reward = 314.01, steps = 1244
00:00:22 [INFO] test episode 6: reward = 311.10, steps = 1242
00:00:22 [INFO] test episode 7: reward = 312.14, steps = 1286
00:00:23 [INFO] test episode 8: reward = 313.93, steps = 1227
00:00:24 [INFO] test episode 9: reward = 312.31, steps = 1248
00:00:24 [INFO] test episode 10: reward = 313.03, steps = 1243
00:00:25 [INFO] test episode 11: reward = 310.21, steps = 1269
00:00:26 [INFO] test episode 12: reward = 313.31, steps = 1246
00:00:26 [INFO] test episode 13: reward = 311.41, steps = 1246
00:00:27 [INFO] test episode 14: reward = 309.60, steps = 1274
00:00:27 [INFO] test episode 15: r