In this assignment you will learn how to apply the REINFORCE algorithm within the OpenAI Gym environment. Make sure that OpenAI Gym and Chainer (which is used to build and train the policy network) are installed on your machine. Now let's import the relevant packages.

In [0]:
import gym
from gym import wrappers, logger
import matplotlib.pyplot as plt
import tqdm
import numpy as np
from chainer import Chain
import chainer.links as L
import chainer.functions as F
from chainer.optimizers import Adam
from chainer import Variable

%matplotlib inline

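We will make use of the classic CartPole environment provided by OpenAI Gym. Before you continue, look up the details of this environment: what the observations and actions are, how the reward is defined, and when an episode terminates.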

In [0]:
# Gym ID of the environment; both the random-agent run and the REINFORCE run
# below create their environment from this ID.
env_id = 'CartPole-v0'

# Verbosity of gym's logger.
# You can set the level to logger.DEBUG or logger.WARN if you want to change the amount of output.
logger.set_level(logger.INFO)

Let's define a baseline agent which just emits random actions.

In [0]:
class RandomAgent(object):
    """Baseline agent whose actions are drawn from the action space itself.

    The agent never looks at what the environment tells it; every action is
    simply whatever ``action_space.sample()`` returns.
    """

    def __init__(self, action_space):
        """Remember the action space that actions are sampled from."""
        self.action_space = action_space

    def act(self, observation, reward, done):
        """Return a freshly sampled action.

        Args:
            observation: ignored.
            reward: ignored.
            done: ignored.

        Returns:
            The result of ``self.action_space.sample()``.
        """
        space = self.action_space
        sampled_action = space.sample()
        return sampled_action


Let's run the agent on the environment.

In [0]:
env = gym.make(env_id)
env.seed(0)
agent = RandomAgent(env.action_space)

episode_count = 1000

# R0[i] is the total (undiscounted) reward collected in episode i
R0 = np.zeros(episode_count)
for i in tqdm.trange(episode_count):

    ob = env.reset()

    # Reset the feedback signals at the start of every episode; otherwise the
    # first act() call of an episode would be handed the terminal reward and
    # done=True left over from the previous episode.
    reward = 0
    done = False

    while not done:

        action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)

        R0[i] += reward

# Release the environment's resources (no Monitor wrapper is used in this
# cell, so nothing is written to disk here).
env.close()

  0%|          | 0/1000 [00:00<?, ?it/s]

 46%|████▌     | 460/1000 [00:00<00:00, 4577.79it/s]

 91%|█████████▏| 914/1000 [00:00<00:00, 4551.34it/s]

INFO: Making new env: CartPole-v0


100%|██████████| 1000/1000 [00:00<00:00, 4470.73it/s]




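Let's create the REINFORCE agent. We assume that the policy is computed using an MLP with a softmax output. Note that the MLP defined below returns the unnormalized outputs of its last linear layer; the softmax that turns them into action probabilities is applied inside the agent's `act` method.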

In [0]:
class MLP(Chain):
    """Multilayer perceptron with a single hidden layer: linear -> ReLU -> linear.

    Args:
        n_output: number of units of the output layer ``l2``.
        n_hidden: number of units of the hidden layer ``l1``.
    """

    def __init__(self, n_output=1, n_hidden=5):
        # The input size of the first layer is given as None, i.e. it is not
        # fixed here (Chainer determines it from the first input it sees).
        hidden_layer = L.Linear(None, n_hidden)
        output_layer = L.Linear(n_hidden, n_output)
        super(MLP, self).__init__(l1=hidden_layer, l2=output_layer)

    def __call__(self, x):
        """Return the output of the last linear layer for input ``x`` (no softmax is applied here)."""
        hidden_activation = F.relu(self.l1(x))
        return self.l2(hidden_activation)

1: A skeleton for the `REINFORCEAgent` is given. Implement its `compute_loss` and `compute_score` methods.

In [0]:
class REINFORCEAgent(object):
    """Agent trained using REINFORCE

    Actions are sampled from a softmax over the outputs of `model`. The
    per-step rewards and scores are collected in `self.rewards` and
    `self.scores`, from which `compute_loss` is meant to build the loss.
    """

    def __init__(self, action_space, model, optimizer=None):
        """
        Args:
            action_space: action space of the environment (stored on the agent).
            model: callable mapping a batch of observations to one output per action;
                it is registered with the optimizer via `optimizer.setup`.
            optimizer: optimizer used to update `model`. If None (the default),
                a new `Adam()` instance is created for this agent.
        """

        # Do not use `optimizer=Adam()` in the signature: default values are evaluated
        # once, when the method is defined, so all agents relying on the default would
        # share (and repeatedly re-setup) one and the same optimizer object.
        if optimizer is None:
            optimizer = Adam()

        self.action_space = action_space

        self.model = model

        self.optimizer = optimizer
        self.optimizer.setup(self.model)

        # monitor score and reward (one entry per environment step)
        self.rewards = []
        self.scores = []


    def act(self, observation, reward, done):
        """
        Sample an action for the given observation from the current policy.

        Args:
            observation: current observation; it is converted to a float32 array
                with at least two dimensions before being fed to the model.
            reward: not used by this method.
            done: not used by this method.

        Returns:
            tuple (action, policy): `action` is a length-1 array holding the sampled
            action index, `policy` is the model output (before the softmax) that the
            action probabilities were computed from.
        """

        # model outputs (one per action); the softmax below turns them into action probabilities
        policy = self.model(Variable(np.atleast_2d(np.asarray(observation, 'float32'))))

        # generate action according to policy
        p = F.softmax(policy).data

        # normalize p in case tiny floating precision problems occur
        row_sums = p.sum(axis=1)
        p /= row_sums[:, np.newaxis]

        # draw one action index according to the probabilities of the first (only) row
        action = np.asarray([np.random.choice(p.shape[1], size=None, replace=True, p=p[0])])

        return action, policy


    def compute_loss(self):
        """
        Return loss for this episode based on computed scores and accumulated rewards
        """

        # Exercise 1: replace this line with your implementation. Raising here makes a
        # missing implementation fail loudly instead of silently returning None.
        raise NotImplementedError("compute_loss has not been implemented yet")

    def compute_score(self, action, policy):
        """
        Computes score

        Args:
            action: the sampled action; the training loop in this notebook passes the
                length-1 array returned by `act`.
            policy: the model output returned by `act` for the same step.

        Returns:
            score
        """

        # Exercise 1: replace this line with your implementation. Raising here makes a
        # missing implementation fail loudly instead of silently returning None.
        raise NotImplementedError("compute_score has not been implemented yet")

Now we run the REINFORCE agent on the CartPole environment. Note that we update the agent after each episode for simplicity.

In [0]:
env = gym.make(env_id)
env.seed(0)
# env.seed() only seeds the environment. The agent samples its actions with
# np.random.choice, so numpy's global generator is seeded as well to make the
# run repeatable.
np.random.seed(0)

network = MLP(n_output=env.action_space.n, n_hidden=3)
agent = REINFORCEAgent(env.action_space, network, optimizer=Adam())

episode_count = 1000

# R[i] is the total (undiscounted) reward collected in episode i
R = np.zeros(episode_count)
for i in tqdm.trange(episode_count):

    ob = env.reset()

    # Reset the feedback signals at the start of every episode; otherwise the
    # first act() call of an episode would be handed the terminal reward and
    # done=True left over from the previous episode.
    reward = 0
    done = False

    # The loss is computed per episode, so the agent's buffers must only hold
    # the steps of the current episode.
    agent.rewards = []
    agent.scores = []

    while not done:

        action, policy = agent.act(ob, reward, done)

        # act() returns the action wrapped in a length-1 array; the environment gets the bare index
        ob, reward, done, _ = env.step(action[0])

        # get reward associated with taking the previous action in the previous state
        agent.rewards.append(reward)
        R[i] += reward

        # recompute score function: grad_theta log pi_theta (s_t, a_t) * v_t
        agent.scores.append(agent.compute_score(action, policy))

    # we learn at the end of each episode
    loss = agent.compute_loss()

    agent.model.cleargrads()
    loss.backward()
    loss.unchain_backward()
    agent.optimizer.update()

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 3/1000 [00:00<00:41, 24.06it/s]

INFO: Making new env: CartPole-v0


  0%|          | 5/1000 [00:00<00:44, 22.17it/s]

  1%|          | 8/1000 [00:00<00:44, 22.11it/s]

  1%|          | 11/1000 [00:00<00:48, 20.58it/s]

  1%|▏         | 14/1000 [00:00<00:44, 21.94it/s]

  2%|▏         | 18/1000 [00:00<00:42, 23.20it/s]

  2%|▏         | 21/1000 [00:00<00:42, 22.87it/s]

  2%|▎         | 25/1000 [00:01<00:40, 23.80it/s]

  3%|▎         | 28/1000 [00:01<00:41, 23.16it/s]

  3%|▎         | 32/1000 [00:01<00:40, 24.06it/s]

  4%|▎         | 35/1000 [00:01<00:39, 24.18it/s]

  4%|▍         | 38/1000 [00:01<00:39, 24.50it/s]

  4%|▍         | 41/1000 [00:01<00:40, 23.73it/s]

  4%|▍         | 44/1000 [00:01<00:41, 22.85it/s]

  5%|▍         | 47/1000 [00:02<00:41, 23.13it/s]

  5%|▌         | 52/1000 [00:02<00:39, 24.23it/s]

  6%|▌         | 56/1000 [00:02<00:38, 24.78it/s]

  6%|▌         | 60/1000 [00:02<00:38, 24.68it/s]

  6%|▋         | 63/1000 [00:02<00:38, 24.46it/s]

  7%|▋         | 66/1000 [00:02<00:38, 24.45it/s]

  7%|▋         | 69/1000 [00:02<00:38, 24.46it/s]

  7%|▋         | 72/1000 [00:02<00:37, 24.45it/s]

  8%|▊         | 76/1000 [00:03<00:37, 24.85it/s]

  8%|▊         | 79/1000 [00:03<00:37, 24.70it/s]

  8%|▊         | 82/1000 [00:03<00:37, 24.36it/s]

  8%|▊         | 85/1000 [00:03<00:37, 24.18it/s]

  9%|▉         | 88/1000 [00:03<00:37, 24.19it/s]

  9%|▉         | 92/1000 [00:03<00:37, 24.50it/s]

 10%|▉         | 96/1000 [00:04<00:37, 23.80it/s]

 10%|█         | 100/1000 [00:04<00:37, 24.04it/s]

 10%|█         | 104/1000 [00:04<00:37, 24.19it/s]

 11%|█         | 107/1000 [00:04<00:37, 23.92it/s]

 11%|█         | 112/1000 [00:04<00:36, 24.40it/s]

 12%|█▏        | 115/1000 [00:04<00:36, 24.30it/s]

 12%|█▏        | 120/1000 [00:04<00:35, 24.81it/s]

 12%|█▏        | 124/1000 [00:04<00:35, 24.92it/s]

 13%|█▎        | 128/1000 [00:05<00:35, 24.66it/s]

 13%|█▎        | 132/1000 [00:05<00:34, 24.85it/s]

 14%|█▎        | 136/1000 [00:05<00:35, 24.57it/s]

 14%|█▍        | 139/1000 [00:05<00:34, 24.64it/s]

 14%|█▍        | 142/1000 [00:05<00:34, 24.53it/s]

 14%|█▍        | 145/1000 [00:05<00:34, 24.61it/s]

 15%|█▍        | 149/1000 [00:06<00:34, 24.69it/s]

 15%|█▌        | 152/1000 [00:06<00:34, 24.71it/s]

 16%|█▌        | 155/1000 [00:06<00:34, 24.43it/s]

 16%|█▌        | 158/1000 [00:06<00:34, 24.38it/s]

 16%|█▌        | 161/1000 [00:06<00:34, 24.19it/s]

 16%|█▋        | 164/1000 [00:06<00:34, 24.00it/s]

 17%|█▋        | 169/1000 [00:06<00:34, 24.23it/s]

 17%|█▋        | 173/1000 [00:07<00:33, 24.38it/s]

 18%|█▊        | 176/1000 [00:07<00:33, 24.38it/s]

 18%|█▊        | 179/1000 [00:07<00:33, 24.34it/s]

 18%|█▊        | 182/1000 [00:07<00:33, 24.34it/s]

 19%|█▊        | 186/1000 [00:07<00:33, 24.51it/s]

 19%|█▉        | 190/1000 [00:07<00:32, 24.70it/s]

 19%|█▉        | 194/1000 [00:07<00:32, 24.46it/s]

 20%|█▉        | 197/1000 [00:08<00:32, 24.52it/s]

 20%|██        | 200/1000 [00:08<00:32, 24.33it/s]

 20%|██        | 203/1000 [00:08<00:32, 24.30it/s]

 21%|██        | 206/1000 [00:08<00:32, 24.22it/s]

 21%|██        | 210/1000 [00:08<00:32, 24.29it/s]

 21%|██▏       | 214/1000 [00:08<00:32, 24.37it/s]

 22%|██▏       | 218/1000 [00:08<00:31, 24.54it/s]

 22%|██▏       | 221/1000 [00:08<00:31, 24.59it/s]

 22%|██▏       | 224/1000 [00:09<00:31, 24.64it/s]

 23%|██▎       | 227/1000 [00:09<00:31, 24.55it/s]

 23%|██▎       | 230/1000 [00:09<00:31, 24.48it/s]

 23%|██▎       | 234/1000 [00:09<00:31, 24.57it/s]

 24%|██▎       | 237/1000 [00:09<00:31, 24.46it/s]

 24%|██▍       | 241/1000 [00:09<00:30, 24.62it/s]

 24%|██▍       | 245/1000 [00:09<00:30, 24.60it/s]

 25%|██▍       | 249/1000 [00:10<00:30, 24.75it/s]

 25%|██▌       | 253/1000 [00:10<00:30, 24.62it/s]

 26%|██▌       | 256/1000 [00:10<00:30, 24.59it/s]

 26%|██▌       | 259/1000 [00:10<00:30, 24.56it/s]

 26%|██▋       | 263/1000 [00:10<00:29, 24.70it/s]

 27%|██▋       | 266/1000 [00:10<00:29, 24.65it/s]

 27%|██▋       | 269/1000 [00:10<00:29, 24.68it/s]

 27%|██▋       | 272/1000 [00:11<00:29, 24.55it/s]

 28%|██▊       | 275/1000 [00:11<00:29, 24.54it/s]

 28%|██▊       | 279/1000 [00:11<00:29, 24.66it/s]

 28%|██▊       | 282/1000 [00:11<00:29, 24.57it/s]

 28%|██▊       | 285/1000 [00:11<00:29, 24.34it/s]

 29%|██▉       | 288/1000 [00:11<00:29, 24.13it/s]

 29%|██▉       | 292/1000 [00:12<00:29, 24.21it/s]

 30%|██▉       | 295/1000 [00:12<00:29, 24.06it/s]

 30%|██▉       | 299/1000 [00:12<00:29, 24.12it/s]

 30%|███       | 302/1000 [00:12<00:29, 24.06it/s]

 31%|███       | 306/1000 [00:12<00:28, 24.12it/s]

 31%|███       | 310/1000 [00:12<00:28, 24.22it/s]

 31%|███▏      | 313/1000 [00:13<00:28, 24.01it/s]

 32%|███▏      | 317/1000 [00:13<00:28, 24.12it/s]

 32%|███▏      | 320/1000 [00:13<00:28, 24.14it/s]

 32%|███▏      | 323/1000 [00:13<00:28, 24.06it/s]

 33%|███▎      | 326/1000 [00:13<00:28, 24.06it/s]

 33%|███▎      | 329/1000 [00:13<00:27, 24.09it/s]

 33%|███▎      | 332/1000 [00:13<00:28, 23.79it/s]

 34%|███▎      | 335/1000 [00:14<00:27, 23.77it/s]

 34%|███▍      | 338/1000 [00:14<00:27, 23.74it/s]

 34%|███▍      | 342/1000 [00:14<00:27, 23.63it/s]

 34%|███▍      | 344/1000 [00:14<00:27, 23.54it/s]

 35%|███▍      | 346/1000 [00:14<00:28, 23.30it/s]

 35%|███▍      | 349/1000 [00:15<00:28, 23.21it/s]

 35%|███▌      | 351/1000 [00:15<00:28, 23.12it/s]

 35%|███▌      | 353/1000 [00:15<00:28, 23.05it/s]

 36%|███▌      | 355/1000 [00:15<00:28, 22.88it/s]

 36%|███▌      | 359/1000 [00:15<00:27, 22.90it/s]

 36%|███▌      | 361/1000 [00:15<00:28, 22.80it/s]

 36%|███▋      | 363/1000 [00:15<00:27, 22.78it/s]

 36%|███▋      | 365/1000 [00:16<00:28, 22.67it/s]

 37%|███▋      | 367/1000 [00:16<00:28, 22.53it/s]

 37%|███▋      | 369/1000 [00:16<00:28, 22.51it/s]

 37%|███▋      | 371/1000 [00:16<00:28, 22.46it/s]

 37%|███▋      | 374/1000 [00:16<00:27, 22.46it/s]

 38%|███▊      | 377/1000 [00:16<00:27, 22.44it/s]

 38%|███▊      | 380/1000 [00:16<00:27, 22.38it/s]

 38%|███▊      | 382/1000 [00:17<00:27, 22.36it/s]

 38%|███▊      | 385/1000 [00:17<00:27, 22.27it/s]

 39%|███▉      | 389/1000 [00:17<00:27, 22.22it/s]

 39%|███▉      | 391/1000 [00:17<00:27, 22.21it/s]

 39%|███▉      | 393/1000 [00:17<00:27, 21.96it/s]

 40%|███▉      | 397/1000 [00:18<00:27, 22.06it/s]

 40%|████      | 400/1000 [00:18<00:27, 22.05it/s]

 40%|████      | 403/1000 [00:18<00:27, 21.99it/s]

 40%|████      | 405/1000 [00:18<00:27, 21.90it/s]

 41%|████      | 408/1000 [00:18<00:26, 21.94it/s]

 41%|████      | 411/1000 [00:18<00:26, 21.91it/s]

 41%|████▏     | 414/1000 [00:18<00:26, 21.91it/s]

 42%|████▏     | 417/1000 [00:19<00:26, 21.83it/s]

 42%|████▏     | 419/1000 [00:19<00:26, 21.81it/s]

 42%|████▏     | 421/1000 [00:19<00:26, 21.66it/s]

 42%|████▏     | 423/1000 [00:19<00:26, 21.65it/s]

 42%|████▎     | 425/1000 [00:19<00:26, 21.52it/s]

 43%|████▎     | 428/1000 [00:19<00:26, 21.55it/s]

 43%|████▎     | 431/1000 [00:19<00:26, 21.59it/s]

 43%|████▎     | 434/1000 [00:20<00:26, 21.41it/s]

 44%|████▎     | 436/1000 [00:20<00:26, 21.32it/s]

 44%|████▍     | 439/1000 [00:20<00:26, 21.34it/s]

 44%|████▍     | 442/1000 [00:20<00:26, 21.28it/s]

 44%|████▍     | 444/1000 [00:20<00:26, 21.27it/s]

 45%|████▍     | 447/1000 [00:20<00:25, 21.30it/s]

 45%|████▌     | 450/1000 [00:21<00:26, 21.13it/s]

 45%|████▌     | 452/1000 [00:21<00:26, 20.98it/s]

 45%|████▌     | 454/1000 [00:21<00:26, 20.95it/s]

 46%|████▌     | 457/1000 [00:21<00:25, 20.94it/s]

 46%|████▌     | 459/1000 [00:21<00:25, 20.92it/s]

 46%|████▌     | 462/1000 [00:22<00:25, 20.93it/s]

 46%|████▋     | 464/1000 [00:22<00:25, 20.84it/s]

 47%|████▋     | 467/1000 [00:22<00:25, 20.85it/s]

 47%|████▋     | 470/1000 [00:22<00:25, 20.88it/s]

 47%|████▋     | 473/1000 [00:22<00:25, 20.85it/s]

 48%|████▊     | 475/1000 [00:22<00:25, 20.79it/s]

 48%|████▊     | 477/1000 [00:22<00:25, 20.77it/s]

 48%|████▊     | 479/1000 [00:23<00:25, 20.73it/s]

 48%|████▊     | 482/1000 [00:23<00:24, 20.75it/s]

 48%|████▊     | 484/1000 [00:23<00:24, 20.70it/s]

 49%|████▊     | 486/1000 [00:23<00:24, 20.61it/s]

 49%|████▉     | 488/1000 [00:23<00:24, 20.58it/s]

 49%|████▉     | 490/1000 [00:23<00:24, 20.52it/s]

 49%|████▉     | 492/1000 [00:24<00:25, 20.29it/s]

 50%|████▉     | 495/1000 [00:24<00:24, 20.31it/s]

 50%|████▉     | 497/1000 [00:24<00:24, 20.31it/s]

 50%|█████     | 500/1000 [00:24<00:24, 20.24it/s]

 50%|█████     | 503/1000 [00:24<00:24, 20.25it/s]

 51%|█████     | 507/1000 [00:24<00:24, 20.32it/s]

 51%|█████     | 510/1000 [00:25<00:24, 20.34it/s]

 51%|█████▏    | 513/1000 [00:25<00:24, 20.29it/s]

 52%|█████▏    | 516/1000 [00:25<00:23, 20.24it/s]

 52%|█████▏    | 518/1000 [00:25<00:23, 20.24it/s]

 52%|█████▏    | 521/1000 [00:25<00:23, 20.26it/s]

 52%|█████▏    | 524/1000 [00:25<00:23, 20.28it/s]

 53%|█████▎    | 527/1000 [00:26<00:23, 20.21it/s]

 53%|█████▎    | 530/1000 [00:26<00:23, 20.24it/s]

 53%|█████▎    | 533/1000 [00:26<00:23, 20.24it/s]

 54%|█████▎    | 536/1000 [00:26<00:23, 20.13it/s]

 54%|█████▍    | 538/1000 [00:26<00:22, 20.10it/s]

 54%|█████▍    | 540/1000 [00:26<00:22, 20.08it/s]

 54%|█████▍    | 542/1000 [00:27<00:22, 20.04it/s]

 55%|█████▍    | 546/1000 [00:27<00:22, 20.06it/s]

 55%|█████▍    | 548/1000 [00:27<00:22, 20.06it/s]

 55%|█████▌    | 550/1000 [00:27<00:22, 20.04it/s]

 55%|█████▌    | 552/1000 [00:27<00:22, 20.01it/s]

 55%|█████▌    | 554/1000 [00:27<00:22, 19.97it/s]

 56%|█████▌    | 556/1000 [00:27<00:22, 19.96it/s]

 56%|█████▌    | 559/1000 [00:27<00:22, 19.99it/s]

 56%|█████▌    | 561/1000 [00:28<00:22, 19.95it/s]

 56%|█████▋    | 563/1000 [00:28<00:21, 19.94it/s]

 56%|█████▋    | 565/1000 [00:28<00:21, 19.93it/s]

 57%|█████▋    | 568/1000 [00:28<00:21, 19.94it/s]

 57%|█████▋    | 570/1000 [00:28<00:21, 19.72it/s]

 57%|█████▋    | 572/1000 [00:29<00:21, 19.66it/s]

 57%|█████▊    | 575/1000 [00:29<00:21, 19.68it/s]

 58%|█████▊    | 577/1000 [00:29<00:21, 19.61it/s]

 58%|█████▊    | 580/1000 [00:29<00:21, 19.59it/s]

 58%|█████▊    | 582/1000 [00:29<00:21, 19.56it/s]

 58%|█████▊    | 584/1000 [00:29<00:21, 19.53it/s]

 59%|█████▊    | 586/1000 [00:30<00:21, 19.50it/s]

 59%|█████▉    | 590/1000 [00:30<00:20, 19.56it/s]

 59%|█████▉    | 593/1000 [00:30<00:20, 19.47it/s]

 60%|█████▉    | 595/1000 [00:30<00:20, 19.41it/s]

 60%|█████▉    | 597/1000 [00:30<00:20, 19.39it/s]

 60%|█████▉    | 599/1000 [00:30<00:20, 19.37it/s]

 60%|██████    | 601/1000 [00:31<00:20, 19.37it/s]

 60%|██████    | 603/1000 [00:31<00:20, 19.31it/s]

 60%|██████    | 605/1000 [00:31<00:20, 19.28it/s]

 61%|██████    | 607/1000 [00:31<00:20, 19.20it/s]

 61%|██████    | 609/1000 [00:31<00:20, 19.14it/s]

 61%|██████    | 612/1000 [00:31<00:20, 19.15it/s]

 61%|██████▏   | 614/1000 [00:32<00:20, 19.14it/s]

 62%|██████▏   | 616/1000 [00:32<00:20, 19.14it/s]

 62%|██████▏   | 618/1000 [00:32<00:20, 19.09it/s]

 62%|██████▏   | 620/1000 [00:32<00:20, 18.99it/s]

 62%|██████▏   | 622/1000 [00:32<00:19, 18.96it/s]

 62%|██████▏   | 624/1000 [00:33<00:19, 18.86it/s]

 63%|██████▎   | 626/1000 [00:33<00:19, 18.85it/s]

 63%|██████▎   | 628/1000 [00:33<00:19, 18.79it/s]

 63%|██████▎   | 630/1000 [00:33<00:19, 18.74it/s]

 63%|██████▎   | 632/1000 [00:33<00:19, 18.64it/s]

 64%|██████▎   | 635/1000 [00:34<00:19, 18.63it/s]

 64%|██████▍   | 638/1000 [00:34<00:19, 18.62it/s]

 64%|██████▍   | 640/1000 [00:34<00:19, 18.62it/s]

 64%|██████▍   | 643/1000 [00:34<00:19, 18.63it/s]

 64%|██████▍   | 645/1000 [00:34<00:19, 18.63it/s]

 65%|██████▍   | 647/1000 [00:34<00:18, 18.60it/s]

 65%|██████▍   | 649/1000 [00:34<00:18, 18.55it/s]

 65%|██████▌   | 651/1000 [00:35<00:18, 18.51it/s]

 65%|██████▌   | 653/1000 [00:35<00:18, 18.47it/s]

 66%|██████▌   | 655/1000 [00:35<00:18, 18.35it/s]

 66%|██████▌   | 657/1000 [00:35<00:18, 18.30it/s]

 66%|██████▌   | 659/1000 [00:36<00:18, 18.24it/s]

 66%|██████▌   | 662/1000 [00:36<00:18, 18.24it/s]

 66%|██████▋   | 664/1000 [00:36<00:18, 18.16it/s]

 67%|██████▋   | 666/1000 [00:36<00:18, 18.13it/s]

 67%|██████▋   | 669/1000 [00:36<00:18, 18.15it/s]

 67%|██████▋   | 671/1000 [00:37<00:18, 18.11it/s]

 67%|██████▋   | 673/1000 [00:37<00:18, 18.11it/s]

 68%|██████▊   | 675/1000 [00:37<00:17, 18.08it/s]

 68%|██████▊   | 678/1000 [00:37<00:17, 18.10it/s]

 68%|██████▊   | 680/1000 [00:37<00:17, 18.07it/s]

 68%|██████▊   | 682/1000 [00:37<00:17, 17.98it/s]

 68%|██████▊   | 684/1000 [00:38<00:17, 17.93it/s]

 69%|██████▊   | 686/1000 [00:38<00:17, 17.91it/s]

 69%|██████▉   | 689/1000 [00:38<00:17, 17.94it/s]

 69%|██████▉   | 692/1000 [00:38<00:17, 17.93it/s]

 69%|██████▉   | 694/1000 [00:38<00:17, 17.91it/s]

 70%|██████▉   | 696/1000 [00:38<00:17, 17.86it/s]

 70%|██████▉   | 698/1000 [00:39<00:16, 17.81it/s]

 70%|███████   | 700/1000 [00:39<00:16, 17.79it/s]

 70%|███████   | 702/1000 [00:39<00:16, 17.72it/s]

 71%|███████   | 706/1000 [00:39<00:16, 17.73it/s]

 71%|███████   | 708/1000 [00:39<00:16, 17.71it/s]

 71%|███████   | 710/1000 [00:40<00:16, 17.67it/s]

 71%|███████   | 712/1000 [00:40<00:16, 17.65it/s]

 71%|███████▏  | 714/1000 [00:40<00:16, 17.64it/s]

 72%|███████▏  | 716/1000 [00:40<00:16, 17.63it/s]

 72%|███████▏  | 718/1000 [00:40<00:16, 17.57it/s]

 72%|███████▏  | 720/1000 [00:41<00:15, 17.52it/s]

 72%|███████▏  | 722/1000 [00:41<00:15, 17.49it/s]

 72%|███████▏  | 724/1000 [00:41<00:15, 17.46it/s]

 73%|███████▎  | 726/1000 [00:41<00:15, 17.39it/s]

 73%|███████▎  | 728/1000 [00:41<00:15, 17.38it/s]

 73%|███████▎  | 730/1000 [00:42<00:15, 17.30it/s]

 73%|███████▎  | 733/1000 [00:42<00:15, 17.32it/s]

 74%|███████▎  | 735/1000 [00:42<00:15, 17.32it/s]

 74%|███████▎  | 737/1000 [00:42<00:15, 17.26it/s]

 74%|███████▍  | 739/1000 [00:42<00:15, 17.22it/s]

 74%|███████▍  | 741/1000 [00:43<00:15, 17.20it/s]

 74%|███████▍  | 743/1000 [00:43<00:14, 17.21it/s]

 74%|███████▍  | 745/1000 [00:43<00:14, 17.21it/s]

 75%|███████▍  | 747/1000 [00:43<00:14, 17.14it/s]

 75%|███████▍  | 749/1000 [00:43<00:14, 17.08it/s]

 75%|███████▌  | 751/1000 [00:44<00:14, 17.05it/s]

 75%|███████▌  | 754/1000 [00:44<00:14, 17.03it/s]

 76%|███████▌  | 756/1000 [00:44<00:14, 17.04it/s]

 76%|███████▌  | 758/1000 [00:44<00:14, 17.04it/s]

 76%|███████▌  | 760/1000 [00:44<00:14, 16.99it/s]

 76%|███████▌  | 762/1000 [00:44<00:14, 16.97it/s]

 76%|███████▋  | 764/1000 [00:45<00:13, 16.94it/s]

 77%|███████▋  | 766/1000 [00:45<00:13, 16.91it/s]

 77%|███████▋  | 768/1000 [00:45<00:13, 16.88it/s]

 77%|███████▋  | 770/1000 [00:45<00:13, 16.88it/s]

 77%|███████▋  | 772/1000 [00:45<00:13, 16.86it/s]

 77%|███████▋  | 774/1000 [00:45<00:13, 16.83it/s]

 78%|███████▊  | 776/1000 [00:46<00:13, 16.82it/s]

 78%|███████▊  | 779/1000 [00:46<00:13, 16.79it/s]

 78%|███████▊  | 781/1000 [00:46<00:13, 16.78it/s]

 78%|███████▊  | 783/1000 [00:46<00:12, 16.77it/s]

 78%|███████▊  | 785/1000 [00:46<00:12, 16.73it/s]

 79%|███████▊  | 787/1000 [00:47<00:12, 16.73it/s]

 79%|███████▉  | 790/1000 [00:47<00:12, 16.72it/s]

 79%|███████▉  | 792/1000 [00:47<00:12, 16.71it/s]

 80%|███████▉  | 795/1000 [00:47<00:12, 16.74it/s]

 80%|███████▉  | 798/1000 [00:47<00:12, 16.69it/s]

 80%|████████  | 800/1000 [00:47<00:11, 16.67it/s]

 80%|████████  | 802/1000 [00:48<00:11, 16.66it/s]

 80%|████████  | 804/1000 [00:48<00:11, 16.61it/s]

 81%|████████  | 806/1000 [00:48<00:11, 16.62it/s]

 81%|████████  | 809/1000 [00:48<00:11, 16.64it/s]

 81%|████████  | 811/1000 [00:48<00:11, 16.64it/s]

 81%|████████▏ | 813/1000 [00:48<00:11, 16.64it/s]

 82%|████████▏ | 815/1000 [00:49<00:11, 16.62it/s]

 82%|████████▏ | 817/1000 [00:49<00:11, 16.55it/s]

 82%|████████▏ | 820/1000 [00:49<00:10, 16.53it/s]

 82%|████████▏ | 822/1000 [00:49<00:10, 16.45it/s]

 82%|████████▏ | 824/1000 [00:50<00:10, 16.45it/s]

 83%|████████▎ | 826/1000 [00:50<00:10, 16.42it/s]

 83%|████████▎ | 828/1000 [00:50<00:10, 16.39it/s]

 83%|████████▎ | 830/1000 [00:50<00:10, 16.34it/s]

 83%|████████▎ | 832/1000 [00:51<00:10, 16.30it/s]

 83%|████████▎ | 833/1000 [00:51<00:10, 16.23it/s]

 84%|████████▎ | 835/1000 [00:51<00:10, 16.22it/s]

 84%|████████▎ | 837/1000 [00:51<00:10, 16.21it/s]

 84%|████████▍ | 840/1000 [00:51<00:09, 16.20it/s]

 84%|████████▍ | 842/1000 [00:51<00:09, 16.20it/s]

 84%|████████▍ | 844/1000 [00:52<00:09, 16.16it/s]

 85%|████████▍ | 846/1000 [00:52<00:09, 16.13it/s]

 85%|████████▍ | 848/1000 [00:52<00:09, 16.06it/s]

 85%|████████▍ | 849/1000 [00:52<00:09, 16.03it/s]

 85%|████████▌ | 852/1000 [00:53<00:09, 16.04it/s]

 85%|████████▌ | 854/1000 [00:53<00:09, 15.97it/s]

 86%|████████▌ | 855/1000 [00:53<00:09, 15.94it/s]

 86%|████████▌ | 856/1000 [00:53<00:09, 15.89it/s]

 86%|████████▌ | 857/1000 [00:54<00:09, 15.85it/s]

 86%|████████▌ | 859/1000 [00:54<00:08, 15.85it/s]

 86%|████████▌ | 860/1000 [00:54<00:08, 15.82it/s]

 86%|████████▋ | 863/1000 [00:54<00:08, 15.82it/s]

 86%|████████▋ | 865/1000 [00:54<00:08, 15.82it/s]

 87%|████████▋ | 867/1000 [00:54<00:08, 15.78it/s]

 87%|████████▋ | 869/1000 [00:55<00:08, 15.76it/s]

 87%|████████▋ | 871/1000 [00:55<00:08, 15.72it/s]

 87%|████████▋ | 873/1000 [00:55<00:08, 15.72it/s]

 88%|████████▊ | 875/1000 [00:55<00:07, 15.72it/s]

 88%|████████▊ | 877/1000 [00:55<00:07, 15.68it/s]

 88%|████████▊ | 879/1000 [00:56<00:07, 15.68it/s]

 88%|████████▊ | 881/1000 [00:56<00:07, 15.66it/s]

 88%|████████▊ | 883/1000 [00:56<00:07, 15.65it/s]

 88%|████████▊ | 885/1000 [00:56<00:07, 15.60it/s]

 89%|████████▊ | 887/1000 [00:57<00:07, 15.55it/s]

 89%|████████▉ | 888/1000 [00:57<00:07, 15.53it/s]

 89%|████████▉ | 890/1000 [00:57<00:07, 15.52it/s]

 89%|████████▉ | 892/1000 [00:57<00:06, 15.52it/s]

 89%|████████▉ | 894/1000 [00:57<00:06, 15.52it/s]

 90%|████████▉ | 896/1000 [00:57<00:06, 15.52it/s]

 90%|████████▉ | 898/1000 [00:57<00:06, 15.51it/s]

 90%|█████████ | 900/1000 [00:58<00:06, 15.50it/s]

 90%|█████████ | 902/1000 [00:58<00:06, 15.50it/s]

 90%|█████████ | 904/1000 [00:58<00:06, 15.47it/s]

 91%|█████████ | 906/1000 [00:58<00:06, 15.39it/s]

 91%|█████████ | 907/1000 [00:59<00:06, 15.37it/s]

 91%|█████████ | 909/1000 [00:59<00:05, 15.35it/s]

 91%|█████████ | 911/1000 [00:59<00:05, 15.33it/s]

 91%|█████████▏| 913/1000 [00:59<00:05, 15.30it/s]

 91%|█████████▏| 914/1000 [00:59<00:05, 15.27it/s]

 92%|█████████▏| 915/1000 [01:00<00:05, 15.25it/s]

 92%|█████████▏| 917/1000 [01:00<00:05, 15.22it/s]

 92%|█████████▏| 918/1000 [01:00<00:05, 15.17it/s]

 92%|█████████▏| 919/1000 [01:00<00:05, 15.14it/s]

 92%|█████████▏| 921/1000 [01:00<00:05, 15.11it/s]

 92%|█████████▏| 922/1000 [01:01<00:05, 15.09it/s]

 92%|█████████▏| 923/1000 [01:01<00:05, 15.08it/s]

 92%|█████████▏| 924/1000 [01:01<00:05, 15.05it/s]

 92%|█████████▎| 925/1000 [01:01<00:04, 15.02it/s]

 93%|█████████▎| 927/1000 [01:01<00:04, 14.99it/s]

 93%|█████████▎| 929/1000 [01:02<00:04, 14.96it/s]

 93%|█████████▎| 930/1000 [01:02<00:04, 14.94it/s]

 93%|█████████▎| 933/1000 [01:02<00:04, 14.94it/s]

 93%|█████████▎| 934/1000 [01:02<00:04, 14.92it/s]

 94%|█████████▎| 935/1000 [01:02<00:04, 14.88it/s]

 94%|█████████▎| 936/1000 [01:02<00:04, 14.87it/s]

 94%|█████████▍| 938/1000 [01:03<00:04, 14.86it/s]

 94%|█████████▍| 939/1000 [01:03<00:04, 14.84it/s]

 94%|█████████▍| 940/1000 [01:03<00:04, 14.79it/s]

 94%|█████████▍| 941/1000 [01:03<00:04, 14.75it/s]

 94%|█████████▍| 942/1000 [01:03<00:03, 14.73it/s]

 94%|█████████▍| 943/1000 [01:04<00:03, 14.72it/s]

 94%|█████████▍| 944/1000 [01:04<00:03, 14.71it/s]

 94%|█████████▍| 945/1000 [01:04<00:03, 14.66it/s]

 95%|█████████▍| 946/1000 [01:04<00:03, 14.65it/s]

 95%|█████████▍| 947/1000 [01:04<00:03, 14.63it/s]

 95%|█████████▍| 948/1000 [01:04<00:03, 14.60it/s]

 95%|█████████▍| 949/1000 [01:05<00:03, 14.55it/s]

 95%|█████████▌| 951/1000 [01:05<00:03, 14.54it/s]

 95%|█████████▌| 952/1000 [01:05<00:03, 14.54it/s]

 95%|█████████▌| 953/1000 [01:05<00:03, 14.51it/s]

 96%|█████████▌| 955/1000 [01:05<00:03, 14.50it/s]

 96%|█████████▌| 956/1000 [01:05<00:03, 14.49it/s]

 96%|█████████▌| 958/1000 [01:06<00:02, 14.46it/s]

 96%|█████████▌| 959/1000 [01:06<00:02, 14.44it/s]

 96%|█████████▌| 961/1000 [01:06<00:02, 14.39it/s]

 96%|█████████▋| 963/1000 [01:07<00:02, 14.36it/s]

 96%|█████████▋| 964/1000 [01:07<00:02, 14.32it/s]

 96%|█████████▋| 965/1000 [01:07<00:02, 14.30it/s]

 97%|█████████▋| 967/1000 [01:07<00:02, 14.26it/s]

 97%|█████████▋| 968/1000 [01:07<00:02, 14.26it/s]

 97%|█████████▋| 970/1000 [01:08<00:02, 14.25it/s]

 97%|█████████▋| 971/1000 [01:08<00:02, 14.21it/s]

 97%|█████████▋| 972/1000 [01:08<00:01, 14.20it/s]

 97%|█████████▋| 973/1000 [01:08<00:01, 14.17it/s]

 98%|█████████▊| 975/1000 [01:08<00:01, 14.16it/s]

 98%|█████████▊| 976/1000 [01:09<00:01, 14.12it/s]

 98%|█████████▊| 977/1000 [01:09<00:01, 14.10it/s]

 98%|█████████▊| 978/1000 [01:09<00:01, 14.07it/s]

 98%|█████████▊| 980/1000 [01:09<00:01, 14.04it/s]

 98%|█████████▊| 981/1000 [01:09<00:01, 14.02it/s]

 98%|█████████▊| 982/1000 [01:10<00:01, 13.99it/s]

 98%|█████████▊| 983/1000 [01:10<00:01, 13.97it/s]

 98%|█████████▊| 984/1000 [01:10<00:01, 13.95it/s]

 99%|█████████▊| 986/1000 [01:10<00:01, 13.94it/s]

 99%|█████████▊| 987/1000 [01:10<00:00, 13.93it/s]

 99%|█████████▉| 988/1000 [01:11<00:00, 13.91it/s]

 99%|█████████▉| 990/1000 [01:11<00:00, 13.91it/s]

 99%|█████████▉| 991/1000 [01:11<00:00, 13.88it/s]

 99%|█████████▉| 992/1000 [01:11<00:00, 13.87it/s]

 99%|█████████▉| 993/1000 [01:11<00:00, 13.84it/s]

 99%|█████████▉| 994/1000 [01:12<00:00, 13.80it/s]

100%|█████████▉| 995/1000 [01:12<00:00, 13.79it/s]

100%|█████████▉| 996/1000 [01:12<00:00, 13.77it/s]

100%|█████████▉| 998/1000 [01:12<00:00, 13.74it/s]

100%|█████████▉| 999/1000 [01:12<00:00, 13.73it/s]

100%|██████████| 1000/1000 [01:12<00:00, 13.72it/s]




In [0]:
# Optional: to watch the trained agent act in the environment, uncomment the loop below; it displays the episodes using the env.render() function.
#
# for i in range(3):
#
#     ob = env.reset()
#
#     while True:
#
#         action, policy = agent.act(ob, reward, done)
#
#         ob, reward, done, _ = env.step(action[0])
#
#         if done:
#             break
#       
#         env.render()

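2: Plot the total reward per episode for both the RandomAgent and the REINFORCEAgent (these totals were stored in the arrays `R0` and `R` above), and compare the two curves.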