In [1]:
import ale_py
import gym
import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import tqdm
import imageio
import random
import tensorflow as tf
import os

In [2]:
# Create the Gym environment registered under the id "PongDeterministic-v4";
# the same `env` object is reused by every training episode below.
env = gym.make("PongDeterministic-v4")

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [3]:
def preprocess_frame(image):
    """Turn a raw frame into a flat, binary float tensor.

    Rows 35..194 and every column are subsampled by a factor of two and only
    channel 0 is kept. Pixels whose value is 144 or 109 are zeroed
    (presumably the two background colours of the Pong screen -- confirm),
    and every remaining non-zero pixel is set to 1.

    Args:
        image: NumPy array indexed as ``[row, column, channel]``. For a
            210x160x3 frame the result has 80 * 80 = 6400 entries.

    Returns:
        1-D float tensor containing only 0.0 and 1.0.
    """
    cropped = image[35:195:2, ::2, 0]
    frame = torch.from_numpy(cropped).float()
    # Both "erase" values are wiped in a single pass before binarising.
    erase_mask = (frame == 144) | (frame == 109)
    frame.masked_fill_(erase_mask, 0)
    frame.masked_fill_(frame != 0, 1)
    return frame.reshape(-1)


In [4]:
class PolicyNetwork(nn.Module):
    """Two-layer perceptron mapping a flattened 80x80 input to one probability.

    Architecture: Linear(6400 -> 200) -> ReLU -> Linear(200 -> 1) -> Sigmoid.
    The output therefore lies in (0, 1); ``play_episode`` treats it as the
    probability of choosing its ``UP`` action.
    """

    def __init__(self):
        super().__init__()
        # Layers are listed in order so the Sequential indices (and hence the
        # state_dict keys "net.0.*" / "net.2.*") stay stable for checkpoints.
        layers = [
            nn.Linear(80 * 80, 200),
            nn.ReLU(),
            nn.Linear(200, 1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` (last dimension of size 6400) through the network."""
        probability = self.net(x)
        return probability

In [5]:
def play_episode(policy, env):
    """Play one full episode, sampling every action from ``policy``.

    The policy is fed the difference between the current and the previous
    preprocessed frame (the raw preprocessed frame on the very first step).
    Its scalar output is used as the probability of sending action ``UP``;
    otherwise ``DOWN`` is sent.

    Args:
        policy: callable returning a one-element tensor for a flat frame.
        env: environment whose ``reset()`` returns ``(observation, info)`` and
            whose ``step()`` returns the 5-tuple
            ``(observation, reward, terminated, truncated, info)``.

    Returns:
        A pair ``(probs, rewards)``: ``probs`` stacks, per step, the
        probability the policy assigned to the action actually taken (still
        attached to the autograd graph); ``rewards`` is a tensor of the
        per-step rewards.
    """
    # Action ids sent to the environment (presumably the ALE codes that move
    # the paddle up / down -- confirm against the env's action meanings).
    UP, DOWN = 2, 3

    first_observation, _ = env.reset()
    previous_frame = None
    current_frame = preprocess_frame(first_observation)

    taken_action_probs = []
    step_rewards = []
    episode_over = False
    while not episode_over:
        if previous_frame is None:
            policy_input = current_frame
        else:
            policy_input = current_frame - previous_frame

        up_prob = policy(policy_input).squeeze(0)
        if random.random() < up_prob.item():
            action, taken_prob = UP, up_prob
        else:
            action, taken_prob = DOWN, 1 - up_prob

        observation, reward, terminated, truncated, _ = env.step(action)
        previous_frame = current_frame
        current_frame = preprocess_frame(observation)

        taken_action_probs.append(taken_prob)
        step_rewards.append(reward)
        episode_over = terminated or truncated

    return torch.stack(taken_action_probs), torch.tensor(step_rewards)

In [6]:
def calculate_returns(rewards,gamma):
    """Compute discounted returns, restarting the sum at every non-zero reward.

    Walking backwards through ``rewards``, each step's return is
    ``reward + gamma * next_return``; whenever a reward is non-zero the
    carried-over return is dropped first, so credit never flows across such a
    step (presumably because a non-zero reward ends a Pong rally -- confirm).

    Args:
        rewards: reversible sequence (list or 1-D tensor) of per-step rewards.
        gamma: discount factor.

    Returns:
        Tensor of the same length as ``rewards`` holding the return per step.
    """
    carried = 0
    backwards = []
    for step_reward in reversed(rewards):
        # A scoring step starts a fresh accumulation.
        base = 0 if step_reward != 0 else carried
        carried = base * gamma + step_reward
        backwards.append(carried)
    backwards.reverse()
    return torch.tensor(backwards)

In [7]:
def train_policy(
    policy,
    env,
    gamma,
    epochs=1000,
    batch_size=4,
    n_save_epochs=50,
    checkpoint_path="checkpoint.pth",
    load_checkpoint=True,
    lr=7e-4,
    log_dir="logs",
):
    """Train ``policy`` with the REINFORCE policy-gradient update.

    Each epoch plays ``batch_size`` episodes, weights the log-probability of
    every taken action by the normalised discounted return, and applies one
    Adam step on the batch-averaged loss.

    Args:
        policy: ``nn.Module`` compatible with ``play_episode``.
        env: environment passed through to ``play_episode``.
        gamma: discount factor handed to ``calculate_returns``.
        epochs: number of optimisation steps.
        batch_size: episodes played per optimisation step (must be >= 1).
        n_save_epochs: save a checkpoint every this many epochs; ``0``
            disables periodic saving. The final epoch is always saved.
        checkpoint_path: file the policy's ``state_dict`` is written to and,
            if ``load_checkpoint`` is true and the file exists, restored from.
            Only model weights are stored, so the optimiser state starts
            fresh on every call.
        load_checkpoint: whether to try restoring weights before training.
        lr: Adam learning rate.
        log_dir: directory for the TensorFlow summary writer.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.

    Side effects:
        Installs a TensorFlow summary writer as the process-wide default,
        prints progress, and writes ``checkpoint_path``.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Guards log(0) and division by a zero standard deviation.
    eps = 1e-8

    writer = tf.summary.create_file_writer(log_dir)
    writer.set_as_default()

    if load_checkpoint:
        if os.path.exists(checkpoint_path):
            # NOTE(review): torch.load unpickles the file -- only load
            # checkpoints that come from a trusted source.
            policy.load_state_dict(torch.load(checkpoint_path))
        else:
            print(f"Checkpoint {checkpoint_path} not found")

    optimizer = torch.optim.Adam(policy.parameters(), lr=lr)
    policy.train()
    try:
        for epoch in tqdm.tqdm(range(epochs)):
            mean_batch_loss = 0.0
            mean_batch_reward = 0.0
            optimizer.zero_grad()
            for _ in range(batch_size):
                probs, rewards = play_episode(policy, env)
                returns = calculate_returns(rewards, gamma)

                # Normalise returns; a one-step episode has an undefined
                # (NaN) sample std, so it is only centred.
                centered = returns - returns.mean()
                if returns.numel() > 1:
                    returns = centered / (returns.std() + eps)
                else:
                    returns = centered

                # Clamp before the log: a saturated sigmoid yields an exact
                # 0 probability, whose log is -inf and whose gradient is NaN.
                log_probs = torch.log(torch.clamp(probs, min=eps))
                loss = -(returns * log_probs).sum() / batch_size

                # Back-propagate per episode: gradients accumulate to the
                # same batch-mean gradient, but each episode's autograd graph
                # is released immediately instead of being kept for the batch.
                loss.backward()

                mean_batch_loss += loss.item()
                mean_batch_reward += rewards.sum().item() / batch_size
            optimizer.step()

            print(
                f"Epoch: {epoch}, mean loss: {mean_batch_loss:.2f}, "
                f"mean reward: {mean_batch_reward:.2f}"
            )
            tf.summary.scalar('mean loss', mean_batch_loss, step=epoch)
            tf.summary.scalar('mean reward', mean_batch_reward, step=epoch)

            periodic_save = bool(n_save_epochs) and epoch % n_save_epochs == 0
            final_epoch = epoch == epochs - 1
            if periodic_save or final_epoch:
                torch.save(policy.state_dict(), checkpoint_path)
                print(f"Checkpoint for {epoch} epoch saved to {checkpoint_path}")
    finally:
        # Make sure buffered summaries reach disk even if training is
        # interrupted part-way through.
        writer.flush()

In [8]:
# Build a fresh policy and train it on the Pong environment; weights are
# restored from checkpoint.pth first when that file exists.
policy = PolicyNetwork()
train_policy(
    policy=policy,
    env=env,
    gamma=0.99,
    epochs=2000,
    batch_size=4,
    n_save_epochs=50,
    load_checkpoint=True,
    checkpoint_path="checkpoint.pth",
)

Checkpoint checkpoint.pth not found


  if not isinstance(terminated, (bool, np.bool8)):
  0%|          | 1/2000 [00:05<3:19:32,  5.99s/it]

Epoch: 0, mean loss: 0.12, mean reward: -20.50


  0%|          | 2/2000 [00:12<3:24:33,  6.14s/it]

Epoch: 1, mean loss: -0.02, mean reward: -21.00


  0%|          | 3/2000 [00:18<3:21:44,  6.06s/it]

Epoch: 2, mean loss: -0.51, mean reward: -20.75


  0%|          | 4/2000 [00:24<3:26:34,  6.21s/it]

Epoch: 3, mean loss: -0.51, mean reward: -20.50


  0%|          | 5/2000 [00:30<3:21:30,  6.06s/it]

Epoch: 4, mean loss: -0.31, mean reward: -21.00


  0%|          | 6/2000 [00:37<3:33:44,  6.43s/it]

Epoch: 5, mean loss: -1.46, mean reward: -20.25


  0%|          | 7/2000 [00:43<3:32:39,  6.40s/it]

Epoch: 6, mean loss: -1.76, mean reward: -20.50


  0%|          | 8/2000 [00:50<3:29:56,  6.32s/it]

Epoch: 7, mean loss: 0.18, mean reward: -21.00


  0%|          | 9/2000 [00:55<3:20:55,  6.06s/it]

Epoch: 8, mean loss: 0.13, mean reward: -20.50


  0%|          | 10/2000 [01:01<3:22:52,  6.12s/it]

Epoch: 9, mean loss: -1.22, mean reward: -20.00


  1%|          | 11/2000 [01:08<3:30:50,  6.36s/it]

Epoch: 10, mean loss: -1.21, mean reward: -20.50


  1%|          | 12/2000 [01:15<3:30:18,  6.35s/it]

Epoch: 11, mean loss: -0.82, mean reward: -20.25


  1%|          | 13/2000 [01:21<3:29:54,  6.34s/it]

Epoch: 12, mean loss: -0.95, mean reward: -20.25


  1%|          | 14/2000 [01:28<3:34:59,  6.50s/it]

Epoch: 13, mean loss: -0.16, mean reward: -19.00


  1%|          | 15/2000 [01:34<3:35:19,  6.51s/it]

Epoch: 14, mean loss: -1.86, mean reward: -20.25


  1%|          | 16/2000 [01:41<3:34:36,  6.49s/it]

Epoch: 15, mean loss: -3.44, mean reward: -20.50


  1%|          | 17/2000 [01:47<3:32:54,  6.44s/it]

Epoch: 16, mean loss: -1.77, mean reward: -20.25


  1%|          | 18/2000 [01:54<3:40:57,  6.69s/it]

Epoch: 17, mean loss: -2.58, mean reward: -19.25


  1%|          | 19/2000 [02:01<3:37:55,  6.60s/it]

Epoch: 18, mean loss: 0.94, mean reward: -20.50


  1%|          | 20/2000 [02:07<3:33:19,  6.46s/it]

Epoch: 19, mean loss: 1.47, mean reward: -20.50


  1%|          | 21/2000 [02:13<3:29:03,  6.34s/it]

Epoch: 20, mean loss: 0.28, mean reward: -20.25


  1%|          | 22/2000 [02:19<3:23:13,  6.16s/it]

Epoch: 21, mean loss: -2.60, mean reward: -20.75


  1%|          | 23/2000 [02:25<3:25:07,  6.23s/it]

Epoch: 22, mean loss: -1.41, mean reward: -20.25


  1%|          | 24/2000 [02:31<3:23:02,  6.17s/it]

Epoch: 23, mean loss: 1.55, mean reward: -20.75


  1%|▏         | 25/2000 [02:37<3:24:24,  6.21s/it]

Epoch: 24, mean loss: -2.44, mean reward: -20.00


  1%|▏         | 26/2000 [02:43<3:22:23,  6.15s/it]

Epoch: 25, mean loss: -0.71, mean reward: -20.75


  1%|▏         | 27/2000 [02:50<3:29:38,  6.38s/it]

Epoch: 26, mean loss: -1.32, mean reward: -20.00


  1%|▏         | 28/2000 [02:57<3:33:56,  6.51s/it]

Epoch: 27, mean loss: -1.33, mean reward: -20.25


  1%|▏         | 29/2000 [03:04<3:40:25,  6.71s/it]

Epoch: 28, mean loss: 2.88, mean reward: -19.00


  2%|▏         | 30/2000 [03:11<3:42:46,  6.78s/it]

Epoch: 29, mean loss: -2.10, mean reward: -20.50


  2%|▏         | 31/2000 [03:18<3:42:38,  6.78s/it]

Epoch: 30, mean loss: 0.52, mean reward: -20.00


  2%|▏         | 32/2000 [03:24<3:37:41,  6.64s/it]

Epoch: 31, mean loss: -0.77, mean reward: -20.25


  2%|▏         | 33/2000 [03:31<3:33:48,  6.52s/it]

Epoch: 32, mean loss: -4.07, mean reward: -20.75


  2%|▏         | 34/2000 [03:37<3:29:42,  6.40s/it]

Epoch: 33, mean loss: -0.80, mean reward: -21.00


  2%|▏         | 35/2000 [03:43<3:31:58,  6.47s/it]

Epoch: 34, mean loss: -1.45, mean reward: -20.50


  2%|▏         | 36/2000 [03:50<3:35:26,  6.58s/it]

Epoch: 35, mean loss: -5.51, mean reward: -20.50


  2%|▏         | 37/2000 [03:56<3:32:08,  6.48s/it]

Epoch: 36, mean loss: -2.91, mean reward: -20.50


  2%|▏         | 38/2000 [04:03<3:29:34,  6.41s/it]

Epoch: 37, mean loss: -6.21, mean reward: -20.00


  2%|▏         | 39/2000 [04:10<3:40:04,  6.73s/it]

Epoch: 38, mean loss: -5.09, mean reward: -20.00


  2%|▏         | 40/2000 [04:17<3:39:58,  6.73s/it]

Epoch: 39, mean loss: -6.39, mean reward: -20.25


  2%|▏         | 41/2000 [04:23<3:35:07,  6.59s/it]

Epoch: 40, mean loss: 1.00, mean reward: -20.50


  2%|▏         | 42/2000 [04:30<3:34:01,  6.56s/it]

Epoch: 41, mean loss: -7.89, mean reward: -20.25


  2%|▏         | 43/2000 [04:37<3:40:07,  6.75s/it]

Epoch: 42, mean loss: -0.36, mean reward: -20.00


  2%|▏         | 44/2000 [04:44<3:41:53,  6.81s/it]

Epoch: 43, mean loss: -3.52, mean reward: -19.50


  2%|▏         | 45/2000 [04:51<3:41:51,  6.81s/it]

Epoch: 44, mean loss: -2.12, mean reward: -20.25


  2%|▏         | 46/2000 [04:58<3:50:11,  7.07s/it]

Epoch: 45, mean loss: -4.33, mean reward: -20.25


  2%|▏         | 47/2000 [05:05<3:44:38,  6.90s/it]

Epoch: 46, mean loss: 1.32, mean reward: -20.75


  2%|▏         | 48/2000 [05:12<3:52:24,  7.14s/it]

Epoch: 47, mean loss: -12.99, mean reward: -20.00


  2%|▏         | 49/2000 [05:20<3:56:19,  7.27s/it]

Epoch: 48, mean loss: -0.55, mean reward: -19.50


  2%|▎         | 50/2000 [05:27<3:56:00,  7.26s/it]

Epoch: 49, mean loss: -3.01, mean reward: -19.25


  3%|▎         | 51/2000 [05:34<3:52:45,  7.17s/it]

Epoch: 50, mean loss: -3.11, mean reward: -19.75


  3%|▎         | 52/2000 [05:41<3:49:31,  7.07s/it]

Epoch: 51, mean loss: 0.20, mean reward: -20.75


  3%|▎         | 53/2000 [05:49<3:56:43,  7.29s/it]

Epoch: 52, mean loss: 2.13, mean reward: -20.50


  3%|▎         | 54/2000 [05:56<3:57:25,  7.32s/it]

Epoch: 53, mean loss: -14.49, mean reward: -18.75


  3%|▎         | 55/2000 [06:04<3:56:49,  7.31s/it]

Epoch: 54, mean loss: -8.11, mean reward: -20.25


  3%|▎         | 56/2000 [06:12<4:07:26,  7.64s/it]

Epoch: 55, mean loss: -12.14, mean reward: -20.25


  3%|▎         | 57/2000 [06:19<3:57:11,  7.32s/it]

Epoch: 56, mean loss: 0.05, mean reward: -20.50


  3%|▎         | 58/2000 [06:25<3:52:57,  7.20s/it]

Epoch: 57, mean loss: -6.69, mean reward: -19.75


  3%|▎         | 59/2000 [06:33<3:54:13,  7.24s/it]

Epoch: 58, mean loss: -5.68, mean reward: -20.25


  3%|▎         | 60/2000 [06:41<4:04:21,  7.56s/it]

Epoch: 59, mean loss: -0.81, mean reward: -19.00


  3%|▎         | 61/2000 [06:47<3:51:39,  7.17s/it]

Epoch: 60, mean loss: -4.67, mean reward: -20.50


  3%|▎         | 62/2000 [06:54<3:50:56,  7.15s/it]

Epoch: 61, mean loss: -11.22, mean reward: -20.00


  3%|▎         | 63/2000 [07:02<3:56:14,  7.32s/it]

Epoch: 62, mean loss: -19.55, mean reward: -20.50


  3%|▎         | 64/2000 [07:09<3:56:11,  7.32s/it]

Epoch: 63, mean loss: -8.44, mean reward: -20.25


  3%|▎         | 65/2000 [07:17<3:53:54,  7.25s/it]

Epoch: 64, mean loss: -5.68, mean reward: -19.75


  3%|▎         | 66/2000 [07:24<3:53:50,  7.25s/it]

Epoch: 65, mean loss: -0.55, mean reward: -20.25


  3%|▎         | 67/2000 [07:31<3:56:16,  7.33s/it]

Epoch: 66, mean loss: -10.27, mean reward: -20.50


  3%|▎         | 68/2000 [07:38<3:45:00,  6.99s/it]

Epoch: 67, mean loss: -3.97, mean reward: -19.75


  3%|▎         | 69/2000 [07:45<3:48:35,  7.10s/it]

Epoch: 68, mean loss: -13.65, mean reward: -19.00


  4%|▎         | 70/2000 [07:52<3:51:39,  7.20s/it]

Epoch: 69, mean loss: -9.22, mean reward: -19.50


  4%|▎         | 71/2000 [08:00<3:56:35,  7.36s/it]

Epoch: 70, mean loss: -13.44, mean reward: -18.25


  4%|▎         | 72/2000 [08:07<3:48:55,  7.12s/it]

Epoch: 71, mean loss: -3.63, mean reward: -18.75


  4%|▎         | 73/2000 [08:14<3:51:15,  7.20s/it]

Epoch: 72, mean loss: -2.26, mean reward: -18.25


  4%|▎         | 74/2000 [08:21<3:46:29,  7.06s/it]

Epoch: 73, mean loss: -25.78, mean reward: -19.50


  4%|▍         | 75/2000 [08:27<3:42:23,  6.93s/it]

Epoch: 74, mean loss: -5.17, mean reward: -19.75


  4%|▍         | 76/2000 [08:35<3:48:26,  7.12s/it]

Epoch: 75, mean loss: -0.79, mean reward: -18.50


  4%|▍         | 77/2000 [08:43<3:53:46,  7.29s/it]

Epoch: 76, mean loss: -16.68, mean reward: -19.25


  4%|▍         | 78/2000 [08:51<4:00:19,  7.50s/it]

Epoch: 77, mean loss: -22.75, mean reward: -19.25


  4%|▍         | 79/2000 [08:57<3:53:39,  7.30s/it]

Epoch: 78, mean loss: -0.63, mean reward: -19.50


  4%|▍         | 80/2000 [09:04<3:47:09,  7.10s/it]

Epoch: 79, mean loss: -6.35, mean reward: -20.00


  4%|▍         | 81/2000 [09:11<3:45:37,  7.05s/it]

Epoch: 80, mean loss: -9.19, mean reward: -20.50


  4%|▍         | 82/2000 [09:20<4:02:51,  7.60s/it]

Epoch: 81, mean loss: 0.29, mean reward: -18.75


  4%|▍         | 83/2000 [09:28<4:05:01,  7.67s/it]

Epoch: 82, mean loss: -16.40, mean reward: -19.50


  4%|▍         | 84/2000 [09:36<4:11:33,  7.88s/it]

Epoch: 83, mean loss: -4.96, mean reward: -19.50


  4%|▍         | 85/2000 [09:45<4:19:43,  8.14s/it]

Epoch: 84, mean loss: 3.29, mean reward: -19.25


  4%|▍         | 86/2000 [09:53<4:24:28,  8.29s/it]

Epoch: 85, mean loss: -21.20, mean reward: -20.00


  4%|▍         | 87/2000 [10:02<4:31:27,  8.51s/it]

Epoch: 86, mean loss: -13.94, mean reward: -18.50


  4%|▍         | 88/2000 [10:11<4:33:01,  8.57s/it]

Epoch: 87, mean loss: -8.24, mean reward: -19.25


  4%|▍         | 89/2000 [10:19<4:23:12,  8.26s/it]

Epoch: 88, mean loss: -11.19, mean reward: -19.50


  4%|▍         | 90/2000 [10:28<4:28:15,  8.43s/it]

Epoch: 89, mean loss: -1.91, mean reward: -17.25


  5%|▍         | 91/2000 [10:37<4:37:31,  8.72s/it]

Epoch: 90, mean loss: -13.40, mean reward: -20.00


  5%|▍         | 92/2000 [10:45<4:33:32,  8.60s/it]

Epoch: 91, mean loss: -7.27, mean reward: -20.50


  5%|▍         | 93/2000 [10:54<4:30:32,  8.51s/it]

Epoch: 92, mean loss: -19.25, mean reward: -20.25


  5%|▍         | 94/2000 [11:02<4:25:57,  8.37s/it]

Epoch: 93, mean loss: -4.04, mean reward: -20.00


  5%|▍         | 95/2000 [11:09<4:19:30,  8.17s/it]

Epoch: 94, mean loss: 3.05, mean reward: -19.50


  5%|▍         | 96/2000 [11:19<4:34:14,  8.64s/it]

Epoch: 95, mean loss: 0.16, mean reward: -18.00


  5%|▍         | 97/2000 [11:30<4:52:25,  9.22s/it]

Epoch: 96, mean loss: -18.23, mean reward: -18.00


  5%|▍         | 98/2000 [11:38<4:47:44,  9.08s/it]

Epoch: 97, mean loss: -7.31, mean reward: -20.00


  5%|▍         | 99/2000 [11:48<4:51:35,  9.20s/it]

Epoch: 98, mean loss: -7.25, mean reward: -20.25


  5%|▌         | 100/2000 [11:57<4:53:05,  9.26s/it]

Epoch: 99, mean loss: -13.27, mean reward: -18.00


  5%|▌         | 101/2000 [12:05<4:42:16,  8.92s/it]

Epoch: 100, mean loss: -17.91, mean reward: -18.50


  5%|▌         | 102/2000 [12:14<4:35:38,  8.71s/it]

Epoch: 101, mean loss: -5.45, mean reward: -20.50


  5%|▌         | 103/2000 [12:21<4:22:12,  8.29s/it]

Epoch: 102, mean loss: 0.79, mean reward: -18.75


  5%|▌         | 104/2000 [12:30<4:25:31,  8.40s/it]

Epoch: 103, mean loss: -18.74, mean reward: -18.75


  5%|▌         | 105/2000 [12:39<4:35:21,  8.72s/it]

Epoch: 104, mean loss: 5.16, mean reward: -17.75


  5%|▌         | 106/2000 [12:48<4:33:03,  8.65s/it]

Epoch: 105, mean loss: -11.58, mean reward: -18.00


  5%|▌         | 107/2000 [12:57<4:39:17,  8.85s/it]

Epoch: 106, mean loss: -15.50, mean reward: -17.75


  5%|▌         | 108/2000 [13:07<4:47:17,  9.11s/it]

Epoch: 107, mean loss: -4.45, mean reward: -18.50


  5%|▌         | 109/2000 [13:16<4:47:47,  9.13s/it]

Epoch: 108, mean loss: 0.76, mean reward: -19.25


  6%|▌         | 110/2000 [13:25<4:48:03,  9.14s/it]

Epoch: 109, mean loss: -1.81, mean reward: -18.25


  6%|▌         | 111/2000 [13:34<4:43:03,  8.99s/it]

Epoch: 110, mean loss: -17.88, mean reward: -19.25


  6%|▌         | 112/2000 [13:43<4:46:28,  9.10s/it]

Epoch: 111, mean loss: -8.82, mean reward: -19.00


  6%|▌         | 113/2000 [13:52<4:49:38,  9.21s/it]

Epoch: 112, mean loss: -7.64, mean reward: -18.50


  6%|▌         | 114/2000 [14:01<4:48:17,  9.17s/it]

Epoch: 113, mean loss: -4.95, mean reward: -18.50


  6%|▌         | 115/2000 [14:11<4:48:34,  9.19s/it]

Epoch: 114, mean loss: -14.96, mean reward: -18.50


  6%|▌         | 116/2000 [14:20<4:52:46,  9.32s/it]

Epoch: 115, mean loss: -10.80, mean reward: -18.75


  6%|▌         | 117/2000 [14:29<4:49:36,  9.23s/it]

Epoch: 116, mean loss: -10.95, mean reward: -19.25


  6%|▌         | 118/2000 [14:39<4:55:27,  9.42s/it]

Epoch: 117, mean loss: -6.17, mean reward: -18.75


  6%|▌         | 119/2000 [14:49<4:57:55,  9.50s/it]

Epoch: 118, mean loss: -8.92, mean reward: -18.75


  6%|▌         | 120/2000 [14:59<5:06:59,  9.80s/it]

Epoch: 119, mean loss: -10.60, mean reward: -18.25


  6%|▌         | 121/2000 [15:12<5:35:06, 10.70s/it]

Epoch: 120, mean loss: -26.26, mean reward: -16.50


  6%|▌         | 122/2000 [15:23<5:36:34, 10.75s/it]

Epoch: 121, mean loss: -5.33, mean reward: -18.25


  6%|▌         | 123/2000 [15:33<5:25:50, 10.42s/it]

Epoch: 122, mean loss: -9.27, mean reward: -19.25


  6%|▌         | 124/2000 [15:44<5:34:13, 10.69s/it]

Epoch: 123, mean loss: -16.75, mean reward: -18.00


  6%|▋         | 125/2000 [15:54<5:31:07, 10.60s/it]

Epoch: 124, mean loss: -11.30, mean reward: -18.75


  6%|▋         | 126/2000 [16:04<5:22:33, 10.33s/it]

Epoch: 125, mean loss: -2.45, mean reward: -19.25


  6%|▋         | 127/2000 [16:14<5:16:57, 10.15s/it]

Epoch: 126, mean loss: -17.58, mean reward: -18.00


  6%|▋         | 128/2000 [16:24<5:17:03, 10.16s/it]

Epoch: 127, mean loss: 9.39, mean reward: -17.75


  6%|▋         | 129/2000 [16:34<5:12:55, 10.04s/it]

Epoch: 128, mean loss: -0.83, mean reward: -18.00


  6%|▋         | 130/2000 [16:43<5:09:19,  9.92s/it]

Epoch: 129, mean loss: 6.32, mean reward: -20.25


  7%|▋         | 131/2000 [16:53<5:03:34,  9.75s/it]

Epoch: 130, mean loss: -13.33, mean reward: -19.00


  7%|▋         | 132/2000 [17:01<4:52:46,  9.40s/it]

Epoch: 131, mean loss: -3.00, mean reward: -19.25


  7%|▋         | 133/2000 [17:12<4:59:37,  9.63s/it]

Epoch: 132, mean loss: -20.51, mean reward: -18.25


  7%|▋         | 134/2000 [17:21<5:00:15,  9.65s/it]

Epoch: 133, mean loss: 3.26, mean reward: -17.00


  7%|▋         | 135/2000 [17:30<4:55:54,  9.52s/it]

Epoch: 134, mean loss: -15.26, mean reward: -17.75


  7%|▋         | 136/2000 [17:41<5:01:17,  9.70s/it]

Epoch: 135, mean loss: -12.50, mean reward: -18.50


  7%|▋         | 137/2000 [17:49<4:46:33,  9.23s/it]

Epoch: 136, mean loss: -20.63, mean reward: -19.25


  7%|▋         | 138/2000 [17:59<4:51:42,  9.40s/it]

Epoch: 137, mean loss: -23.92, mean reward: -19.25


  7%|▋         | 139/2000 [18:08<4:54:15,  9.49s/it]

Epoch: 138, mean loss: -11.70, mean reward: -18.00


  7%|▋         | 140/2000 [18:17<4:51:57,  9.42s/it]

Epoch: 139, mean loss: -9.56, mean reward: -17.50


  7%|▋         | 141/2000 [18:27<4:54:28,  9.50s/it]

Epoch: 140, mean loss: 0.92, mean reward: -16.75


  7%|▋         | 142/2000 [18:36<4:45:57,  9.23s/it]

Epoch: 141, mean loss: -17.43, mean reward: -19.00


  7%|▋         | 143/2000 [18:45<4:48:35,  9.32s/it]

Epoch: 142, mean loss: -5.66, mean reward: -17.25


  7%|▋         | 144/2000 [18:55<4:48:57,  9.34s/it]

Epoch: 143, mean loss: -12.12, mean reward: -19.00


  7%|▋         | 145/2000 [19:05<4:53:50,  9.50s/it]

Epoch: 144, mean loss: -10.56, mean reward: -18.75


  7%|▋         | 146/2000 [19:14<4:48:48,  9.35s/it]

Epoch: 145, mean loss: -18.42, mean reward: -18.50


  7%|▋         | 147/2000 [19:22<4:44:27,  9.21s/it]

Epoch: 146, mean loss: -13.67, mean reward: -18.00


  7%|▋         | 148/2000 [19:32<4:45:29,  9.25s/it]

Epoch: 147, mean loss: -12.12, mean reward: -17.50


  7%|▋         | 149/2000 [19:41<4:41:47,  9.13s/it]

Epoch: 148, mean loss: -0.34, mean reward: -17.25


  8%|▊         | 150/2000 [19:51<4:52:15,  9.48s/it]

Epoch: 149, mean loss: -19.69, mean reward: -18.25


  8%|▊         | 151/2000 [19:59<4:42:07,  9.16s/it]

Epoch: 150, mean loss: -12.79, mean reward: -17.50


  8%|▊         | 152/2000 [20:09<4:47:03,  9.32s/it]

Epoch: 151, mean loss: -11.55, mean reward: -18.25


  8%|▊         | 153/2000 [20:19<4:54:14,  9.56s/it]

Epoch: 152, mean loss: -6.77, mean reward: -15.75


  8%|▊         | 154/2000 [20:28<4:50:31,  9.44s/it]

Epoch: 153, mean loss: -34.20, mean reward: -17.50


  8%|▊         | 155/2000 [20:38<4:48:59,  9.40s/it]

Epoch: 154, mean loss: -18.07, mean reward: -17.50


  8%|▊         | 156/2000 [20:49<5:04:08,  9.90s/it]

Epoch: 155, mean loss: -18.81, mean reward: -18.00


  8%|▊         | 157/2000 [20:59<5:09:56, 10.09s/it]

Epoch: 156, mean loss: -4.31, mean reward: -18.25


  8%|▊         | 158/2000 [21:11<5:25:18, 10.60s/it]

Epoch: 157, mean loss: -8.51, mean reward: -16.00


  8%|▊         | 159/2000 [21:23<5:33:30, 10.87s/it]

Epoch: 158, mean loss: -15.60, mean reward: -16.25


  8%|▊         | 160/2000 [21:34<5:39:46, 11.08s/it]

Epoch: 159, mean loss: -17.64, mean reward: -16.50


  8%|▊         | 161/2000 [21:45<5:37:27, 11.01s/it]

Epoch: 160, mean loss: -44.25, mean reward: -15.50


  8%|▊         | 162/2000 [21:56<5:33:58, 10.90s/it]

Epoch: 161, mean loss: -13.06, mean reward: -17.50


  8%|▊         | 163/2000 [22:06<5:26:37, 10.67s/it]

Epoch: 162, mean loss: -26.07, mean reward: -17.75


  8%|▊         | 164/2000 [22:16<5:22:23, 10.54s/it]

Epoch: 163, mean loss: -9.49, mean reward: -17.25


  8%|▊         | 165/2000 [22:29<5:45:23, 11.29s/it]

Epoch: 164, mean loss: -8.26, mean reward: -15.50


  8%|▊         | 166/2000 [22:38<5:24:02, 10.60s/it]

Epoch: 165, mean loss: -12.17, mean reward: -18.50


  8%|▊         | 167/2000 [22:51<5:45:46, 11.32s/it]

Epoch: 166, mean loss: -35.53, mean reward: -15.50


  8%|▊         | 168/2000 [23:02<5:41:34, 11.19s/it]

Epoch: 167, mean loss: -34.08, mean reward: -16.00


  8%|▊         | 169/2000 [23:23<7:10:33, 14.11s/it]

Epoch: 168, mean loss: -44.29, mean reward: -13.50


  8%|▊         | 170/2000 [23:35<6:56:26, 13.65s/it]

Epoch: 169, mean loss: -33.23, mean reward: -16.00


  9%|▊         | 171/2000 [23:49<6:52:08, 13.52s/it]

Epoch: 170, mean loss: -26.81, mean reward: -15.25


  9%|▊         | 172/2000 [24:00<6:32:59, 12.90s/it]

Epoch: 171, mean loss: -17.76, mean reward: -17.25


  9%|▊         | 173/2000 [24:12<6:23:30, 12.59s/it]

Epoch: 172, mean loss: -26.89, mean reward: -17.00


  9%|▊         | 174/2000 [24:24<6:15:16, 12.33s/it]

Epoch: 173, mean loss: -36.80, mean reward: -15.00


  9%|▉         | 175/2000 [24:34<6:01:05, 11.87s/it]

Epoch: 174, mean loss: -15.43, mean reward: -17.50


  9%|▉         | 176/2000 [24:46<6:00:51, 11.87s/it]

Epoch: 175, mean loss: -15.24, mean reward: -15.75


  9%|▉         | 177/2000 [25:00<6:21:59, 12.57s/it]

Epoch: 176, mean loss: -24.46, mean reward: -16.75


  9%|▉         | 178/2000 [25:12<6:11:53, 12.25s/it]

Epoch: 177, mean loss: -7.97, mean reward: -17.75


  9%|▉         | 179/2000 [25:24<6:06:04, 12.06s/it]

Epoch: 178, mean loss: -28.91, mean reward: -14.25


  9%|▉         | 180/2000 [25:36<6:04:35, 12.02s/it]

Epoch: 179, mean loss: -23.92, mean reward: -17.00


  9%|▉         | 181/2000 [25:50<6:23:33, 12.65s/it]

Epoch: 180, mean loss: -13.51, mean reward: -13.75


  9%|▉         | 182/2000 [26:04<6:38:05, 13.14s/it]

Epoch: 181, mean loss: -26.14, mean reward: -12.75


  9%|▉         | 183/2000 [26:16<6:30:28, 12.89s/it]

Epoch: 182, mean loss: -8.28, mean reward: -16.50


  9%|▉         | 184/2000 [26:28<6:19:23, 12.54s/it]

Epoch: 183, mean loss: -37.80, mean reward: -17.00


  9%|▉         | 185/2000 [26:39<6:07:37, 12.15s/it]

Epoch: 184, mean loss: -12.63, mean reward: -17.75


  9%|▉         | 186/2000 [26:53<6:25:51, 12.76s/it]

Epoch: 185, mean loss: -30.08, mean reward: -16.50


  9%|▉         | 187/2000 [27:05<6:12:56, 12.34s/it]

Epoch: 186, mean loss: -30.92, mean reward: -16.75


  9%|▉         | 188/2000 [27:17<6:09:34, 12.24s/it]

Epoch: 187, mean loss: -30.17, mean reward: -15.50


  9%|▉         | 189/2000 [27:28<5:58:59, 11.89s/it]

Epoch: 188, mean loss: -6.88, mean reward: -15.50


 10%|▉         | 190/2000 [27:40<6:00:09, 11.94s/it]

Epoch: 189, mean loss: -9.90, mean reward: -15.50


 10%|▉         | 191/2000 [27:51<5:53:37, 11.73s/it]

Epoch: 190, mean loss: -35.87, mean reward: -15.50


 10%|▉         | 192/2000 [28:03<5:54:16, 11.76s/it]

Epoch: 191, mean loss: -43.82, mean reward: -16.00


 10%|▉         | 193/2000 [28:15<5:54:31, 11.77s/it]

Epoch: 192, mean loss: -12.93, mean reward: -14.75


 10%|▉         | 194/2000 [28:30<6:24:12, 12.76s/it]

Epoch: 193, mean loss: -35.26, mean reward: -14.75


 10%|▉         | 195/2000 [28:41<6:10:11, 12.31s/it]

Epoch: 194, mean loss: -27.47, mean reward: -17.75


 10%|▉         | 196/2000 [28:58<6:47:51, 13.57s/it]

Epoch: 195, mean loss: -15.29, mean reward: -15.00


 10%|▉         | 197/2000 [29:10<6:37:17, 13.22s/it]

Epoch: 196, mean loss: -36.92, mean reward: -14.00


 10%|▉         | 198/2000 [29:23<6:32:53, 13.08s/it]

Epoch: 197, mean loss: -23.28, mean reward: -13.75


 10%|▉         | 199/2000 [29:36<6:31:40, 13.05s/it]

Epoch: 198, mean loss: -23.46, mean reward: -13.50


 10%|█         | 200/2000 [29:48<6:28:32, 12.95s/it]

Epoch: 199, mean loss: -52.56, mean reward: -15.25


 10%|█         | 201/2000 [30:02<6:32:56, 13.11s/it]

Epoch: 200, mean loss: -32.43, mean reward: -15.75


 10%|█         | 202/2000 [30:18<6:57:23, 13.93s/it]

Epoch: 201, mean loss: -34.68, mean reward: -11.50


 10%|█         | 203/2000 [30:32<6:58:08, 13.96s/it]

Epoch: 202, mean loss: -16.75, mean reward: -13.25


 10%|█         | 204/2000 [30:44<6:45:28, 13.55s/it]

Epoch: 203, mean loss: -29.69, mean reward: -13.25


 10%|█         | 205/2000 [30:58<6:45:31, 13.55s/it]

Epoch: 204, mean loss: -24.47, mean reward: -13.00


 10%|█         | 206/2000 [31:10<6:35:36, 13.23s/it]

Epoch: 205, mean loss: -56.17, mean reward: -14.25


 10%|█         | 207/2000 [31:24<6:34:15, 13.19s/it]

Epoch: 206, mean loss: -24.90, mean reward: -13.75


 10%|█         | 208/2000 [31:37<6:39:30, 13.38s/it]

Epoch: 207, mean loss: -40.76, mean reward: -12.50


 10%|█         | 209/2000 [31:50<6:33:48, 13.19s/it]

Epoch: 208, mean loss: -63.14, mean reward: -15.00


 10%|█         | 210/2000 [32:04<6:44:19, 13.55s/it]

Epoch: 209, mean loss: -36.36, mean reward: -13.00


 11%|█         | 211/2000 [32:17<6:34:52, 13.24s/it]

Epoch: 210, mean loss: -23.43, mean reward: -13.75


 11%|█         | 212/2000 [32:29<6:25:13, 12.93s/it]

Epoch: 211, mean loss: -25.28, mean reward: -15.25


 11%|█         | 213/2000 [32:43<6:34:13, 13.24s/it]

Epoch: 212, mean loss: -27.62, mean reward: -13.50


 11%|█         | 214/2000 [32:57<6:40:52, 13.47s/it]

Epoch: 213, mean loss: -44.35, mean reward: -13.25


 11%|█         | 215/2000 [33:10<6:35:42, 13.30s/it]

Epoch: 214, mean loss: -25.96, mean reward: -16.00


 11%|█         | 216/2000 [33:23<6:34:26, 13.27s/it]

Epoch: 215, mean loss: -70.46, mean reward: -14.25


 11%|█         | 217/2000 [33:36<6:32:35, 13.21s/it]

Epoch: 216, mean loss: -19.49, mean reward: -12.50


 11%|█         | 218/2000 [33:49<6:29:35, 13.12s/it]

Epoch: 217, mean loss: -44.80, mean reward: -12.00


 11%|█         | 219/2000 [34:01<6:18:11, 12.74s/it]

Epoch: 218, mean loss: -29.63, mean reward: -13.00


 11%|█         | 220/2000 [34:14<6:15:18, 12.65s/it]

Epoch: 219, mean loss: -43.37, mean reward: -14.25


 11%|█         | 221/2000 [34:25<6:02:36, 12.23s/it]

Epoch: 220, mean loss: -50.58, mean reward: -13.25


 11%|█         | 222/2000 [34:38<6:11:52, 12.55s/it]

Epoch: 221, mean loss: -51.58, mean reward: -12.50


 11%|█         | 223/2000 [34:49<5:57:45, 12.08s/it]

Epoch: 222, mean loss: -37.61, mean reward: -14.50


 11%|█         | 224/2000 [35:03<6:11:47, 12.56s/it]

Epoch: 223, mean loss: -34.24, mean reward: -10.50


 11%|█▏        | 225/2000 [35:16<6:18:46, 12.80s/it]

Epoch: 224, mean loss: -31.31, mean reward: -11.75


 11%|█▏        | 226/2000 [35:35<7:16:26, 14.76s/it]

Epoch: 225, mean loss: -38.77, mean reward: -11.25


 11%|█▏        | 227/2000 [35:54<7:53:08, 16.01s/it]

Epoch: 226, mean loss: -24.62, mean reward: -13.25


 11%|█▏        | 228/2000 [36:24<9:53:14, 20.09s/it]

Epoch: 227, mean loss: -39.80, mean reward: -13.25


 11%|█▏        | 229/2000 [36:41<9:26:07, 19.18s/it]

Epoch: 228, mean loss: -46.33, mean reward: -11.25


 12%|█▏        | 230/2000 [36:56<8:49:49, 17.96s/it]

Epoch: 229, mean loss: -55.37, mean reward: -13.50


 12%|█▏        | 231/2000 [37:12<8:30:59, 17.33s/it]

Epoch: 230, mean loss: -32.17, mean reward: -10.50


 12%|█▏        | 232/2000 [37:30<8:32:05, 17.38s/it]

Epoch: 231, mean loss: -38.31, mean reward: -12.00


 12%|█▏        | 233/2000 [37:45<8:14:09, 16.78s/it]

Epoch: 232, mean loss: -45.40, mean reward: -14.00


 12%|█▏        | 234/2000 [38:00<8:00:39, 16.33s/it]

Epoch: 233, mean loss: -57.64, mean reward: -11.25


 12%|█▏        | 235/2000 [38:19<8:25:22, 17.18s/it]

Epoch: 234, mean loss: -63.71, mean reward: -12.50


 12%|█▏        | 236/2000 [38:36<8:24:41, 17.17s/it]

Epoch: 235, mean loss: -55.00, mean reward: -11.50


 12%|█▏        | 237/2000 [38:53<8:17:09, 16.92s/it]

Epoch: 236, mean loss: -42.08, mean reward: -13.25


 12%|█▏        | 238/2000 [39:10<8:15:08, 16.86s/it]

Epoch: 237, mean loss: -41.84, mean reward: -10.25


 12%|█▏        | 239/2000 [39:26<8:15:14, 16.87s/it]

Epoch: 238, mean loss: -40.17, mean reward: -12.00


 12%|█▏        | 240/2000 [39:43<8:10:53, 16.73s/it]

Epoch: 239, mean loss: -28.75, mean reward: -9.00


 12%|█▏        | 241/2000 [39:59<8:09:25, 16.69s/it]

Epoch: 240, mean loss: -39.82, mean reward: -12.75


 12%|█▏        | 242/2000 [40:15<8:00:59, 16.42s/it]

Epoch: 241, mean loss: -47.92, mean reward: -12.75


 12%|█▏        | 243/2000 [40:32<8:07:37, 16.65s/it]

Epoch: 242, mean loss: -46.23, mean reward: -8.75


 12%|█▏        | 244/2000 [40:50<8:18:29, 17.03s/it]

Epoch: 243, mean loss: -51.09, mean reward: -8.50


 12%|█▏        | 245/2000 [41:07<8:14:29, 16.91s/it]

Epoch: 244, mean loss: -42.79, mean reward: -11.00


 12%|█▏        | 246/2000 [41:26<8:29:01, 17.41s/it]

Epoch: 245, mean loss: -50.42, mean reward: -8.75


 12%|█▏        | 247/2000 [41:42<8:19:06, 17.08s/it]

Epoch: 246, mean loss: -48.43, mean reward: -11.75


 12%|█▏        | 248/2000 [42:00<8:26:59, 17.36s/it]

Epoch: 247, mean loss: -54.35, mean reward: -7.75


 12%|█▏        | 249/2000 [42:17<8:26:57, 17.37s/it]

Epoch: 248, mean loss: -60.43, mean reward: -12.75


 12%|█▎        | 250/2000 [42:35<8:30:32, 17.50s/it]

Epoch: 249, mean loss: -44.88, mean reward: -10.25


 13%|█▎        | 251/2000 [42:54<8:46:28, 18.06s/it]

Epoch: 250, mean loss: -78.74, mean reward: -7.75


 13%|█▎        | 252/2000 [43:09<8:16:43, 17.05s/it]

Epoch: 251, mean loss: -49.03, mean reward: -12.00


 13%|█▎        | 253/2000 [43:32<9:06:46, 18.78s/it]

Epoch: 252, mean loss: -64.42, mean reward: -8.25


 13%|█▎        | 254/2000 [43:51<9:06:08, 18.77s/it]

Epoch: 253, mean loss: -73.71, mean reward: -10.00


 13%|█▎        | 255/2000 [44:08<8:55:06, 18.40s/it]

Epoch: 254, mean loss: -58.49, mean reward: -11.25


 13%|█▎        | 256/2000 [44:26<8:50:52, 18.26s/it]

Epoch: 255, mean loss: -65.10, mean reward: -9.00


 13%|█▎        | 257/2000 [44:46<9:07:40, 18.85s/it]

Epoch: 256, mean loss: -75.87, mean reward: -7.50


 13%|█▎        | 258/2000 [45:07<9:24:15, 19.43s/it]

Epoch: 257, mean loss: -58.58, mean reward: -8.25


 13%|█▎        | 259/2000 [45:26<9:21:31, 19.35s/it]

Epoch: 258, mean loss: -40.56, mean reward: -9.00


 13%|█▎        | 260/2000 [45:44<9:08:16, 18.91s/it]

Epoch: 259, mean loss: -75.98, mean reward: -8.50


 13%|█▎        | 261/2000 [46:03<9:06:10, 18.84s/it]

Epoch: 260, mean loss: -66.02, mean reward: -6.25


 13%|█▎        | 262/2000 [46:20<8:49:24, 18.28s/it]

Epoch: 261, mean loss: -54.82, mean reward: -11.00


 13%|█▎        | 263/2000 [46:38<8:47:05, 18.21s/it]

Epoch: 262, mean loss: -85.22, mean reward: -9.25


 13%|█▎        | 264/2000 [46:56<8:49:26, 18.30s/it]

Epoch: 263, mean loss: -66.60, mean reward: -8.75


 13%|█▎        | 265/2000 [47:14<8:46:35, 18.21s/it]

Epoch: 264, mean loss: -80.20, mean reward: -8.25


 13%|█▎        | 266/2000 [47:37<9:23:18, 19.49s/it]

Epoch: 265, mean loss: -71.34, mean reward: -3.50


 13%|█▎        | 267/2000 [47:57<9:24:04, 19.53s/it]

Epoch: 266, mean loss: -94.42, mean reward: -8.50


 13%|█▎        | 268/2000 [48:16<9:22:26, 19.48s/it]

Epoch: 267, mean loss: -76.06, mean reward: -6.25


 13%|█▎        | 269/2000 [48:37<9:34:02, 19.90s/it]

Epoch: 268, mean loss: -83.14, mean reward: -8.75


 14%|█▎        | 270/2000 [49:00<10:02:19, 20.89s/it]

Epoch: 269, mean loss: -92.77, mean reward: -5.00


 14%|█▎        | 271/2000 [49:19<9:48:20, 20.42s/it] 

Epoch: 270, mean loss: -42.22, mean reward: -10.25


 14%|█▎        | 272/2000 [49:43<10:13:28, 21.30s/it]

Epoch: 271, mean loss: -82.32, mean reward: -5.50


 14%|█▎        | 273/2000 [50:06<10:33:10, 22.00s/it]

Epoch: 272, mean loss: -82.80, mean reward: -5.00


 14%|█▎        | 274/2000 [50:28<10:33:15, 22.01s/it]

Epoch: 273, mean loss: -86.25, mean reward: -7.75


 14%|█▍        | 275/2000 [50:50<10:29:03, 21.88s/it]

Epoch: 274, mean loss: -84.10, mean reward: -1.50


 14%|█▍        | 276/2000 [51:09<10:05:11, 21.06s/it]

Epoch: 275, mean loss: -80.73, mean reward: -8.50


 14%|█▍        | 277/2000 [51:28<9:43:29, 20.32s/it] 

Epoch: 276, mean loss: -78.01, mean reward: -7.25


 14%|█▍        | 278/2000 [51:46<9:26:36, 19.74s/it]

Epoch: 277, mean loss: -38.36, mean reward: -7.75


 14%|█▍        | 279/2000 [52:04<9:08:46, 19.13s/it]

Epoch: 278, mean loss: -81.08, mean reward: -10.00


 14%|█▍        | 280/2000 [52:24<9:19:45, 19.53s/it]

Epoch: 279, mean loss: -90.65, mean reward: -2.00


 14%|█▍        | 281/2000 [52:43<9:14:49, 19.37s/it]

Epoch: 280, mean loss: -60.27, mean reward: -6.50


 14%|█▍        | 282/2000 [53:05<9:33:15, 20.02s/it]

Epoch: 281, mean loss: -82.57, mean reward: -1.00


 14%|█▍        | 283/2000 [53:23<9:16:05, 19.43s/it]

Epoch: 282, mean loss: -60.52, mean reward: -8.50


 14%|█▍        | 284/2000 [53:42<9:14:05, 19.37s/it]

Epoch: 283, mean loss: -88.18, mean reward: -8.75


 14%|█▍        | 285/2000 [54:01<9:12:07, 19.32s/it]

Epoch: 284, mean loss: -90.80, mean reward: -6.00


 14%|█▍        | 286/2000 [54:23<9:34:24, 20.11s/it]

Epoch: 285, mean loss: -102.21, mean reward: -0.75


 14%|█▍        | 287/2000 [54:46<9:59:38, 21.00s/it]

Epoch: 286, mean loss: -62.71, mean reward: -7.00


 14%|█▍        | 288/2000 [55:07<9:59:17, 21.00s/it]

Epoch: 287, mean loss: -68.06, mean reward: -0.50


 14%|█▍        | 289/2000 [55:29<10:03:48, 21.17s/it]

Epoch: 288, mean loss: -77.92, mean reward: -6.25


 14%|█▍        | 290/2000 [55:51<10:11:16, 21.45s/it]

Epoch: 289, mean loss: -70.75, mean reward: -5.00


 15%|█▍        | 291/2000 [56:11<9:55:38, 20.91s/it] 

Epoch: 290, mean loss: -64.45, mean reward: -4.00


 15%|█▍        | 292/2000 [56:34<10:13:34, 21.55s/it]

Epoch: 291, mean loss: -78.41, mean reward: -2.00


 15%|█▍        | 293/2000 [56:53<9:57:29, 21.00s/it] 

Epoch: 292, mean loss: -69.37, mean reward: -9.50


 15%|█▍        | 294/2000 [57:15<10:05:47, 21.31s/it]

Epoch: 293, mean loss: -85.36, mean reward: -0.50


 15%|█▍        | 295/2000 [57:36<9:56:38, 21.00s/it] 

Epoch: 294, mean loss: -117.11, mean reward: -0.25


 15%|█▍        | 296/2000 [57:57<9:59:06, 21.10s/it]

Epoch: 295, mean loss: -40.50, mean reward: -5.25


 15%|█▍        | 297/2000 [58:18<9:57:37, 21.06s/it]

Epoch: 296, mean loss: -69.72, mean reward: -2.50


 15%|█▍        | 298/2000 [58:39<10:01:24, 21.20s/it]

Epoch: 297, mean loss: -52.73, mean reward: -2.00


 15%|█▍        | 299/2000 [59:01<10:07:14, 21.42s/it]

Epoch: 298, mean loss: -56.88, mean reward: 0.50


 15%|█▌        | 300/2000 [59:25<10:26:52, 22.12s/it]

Epoch: 299, mean loss: -59.85, mean reward: -5.50


 15%|█▌        | 301/2000 [59:57<11:52:48, 25.17s/it]

Epoch: 300, mean loss: -71.51, mean reward: -1.00


 15%|█▌        | 302/2000 [1:00:21<11:39:23, 24.71s/it]

Epoch: 301, mean loss: -66.47, mean reward: -1.75


 15%|█▌        | 303/2000 [1:00:42<11:06:25, 23.56s/it]

Epoch: 302, mean loss: -103.85, mean reward: -2.75


 15%|█▌        | 304/2000 [1:01:02<10:38:54, 22.60s/it]

Epoch: 303, mean loss: -79.81, mean reward: 0.50


 15%|█▌        | 305/2000 [1:01:25<10:37:33, 22.57s/it]

Epoch: 304, mean loss: -101.41, mean reward: -2.75


 15%|█▌        | 306/2000 [1:01:46<10:25:41, 22.16s/it]

Epoch: 305, mean loss: -76.79, mean reward: 2.00


 15%|█▌        | 307/2000 [1:02:06<10:10:18, 21.63s/it]

Epoch: 306, mean loss: -79.94, mean reward: 5.00


 15%|█▌        | 308/2000 [1:02:28<10:12:22, 21.72s/it]

Epoch: 307, mean loss: -114.17, mean reward: -2.25


 15%|█▌        | 309/2000 [1:02:51<10:22:03, 22.07s/it]

Epoch: 308, mean loss: -67.90, mean reward: 0.00


 16%|█▌        | 310/2000 [1:03:16<10:41:47, 22.79s/it]

Epoch: 309, mean loss: -76.68, mean reward: -1.25


 16%|█▌        | 311/2000 [1:03:38<10:40:13, 22.74s/it]

Epoch: 310, mean loss: -83.55, mean reward: 4.50


 16%|█▌        | 312/2000 [1:04:00<10:30:25, 22.41s/it]

Epoch: 311, mean loss: -77.70, mean reward: -3.25


 16%|█▌        | 313/2000 [1:04:24<10:40:09, 22.77s/it]

Epoch: 312, mean loss: -124.28, mean reward: 0.00


 16%|█▌        | 314/2000 [1:04:47<10:43:39, 22.91s/it]

Epoch: 313, mean loss: -99.37, mean reward: 0.25


 16%|█▌        | 315/2000 [1:05:13<11:07:03, 23.75s/it]

Epoch: 314, mean loss: -135.49, mean reward: -1.25


 16%|█▌        | 316/2000 [1:05:38<11:18:00, 24.16s/it]

Epoch: 315, mean loss: -99.54, mean reward: 0.25


 16%|█▌        | 317/2000 [1:06:01<11:12:20, 23.97s/it]

Epoch: 316, mean loss: -90.19, mean reward: 3.50


 16%|█▌        | 318/2000 [1:06:25<11:09:23, 23.88s/it]

Epoch: 317, mean loss: -97.63, mean reward: 3.25


 16%|█▌        | 319/2000 [1:06:47<10:51:38, 23.26s/it]

Epoch: 318, mean loss: -78.44, mean reward: 2.00


 16%|█▌        | 320/2000 [1:07:09<10:39:58, 22.86s/it]

Epoch: 319, mean loss: -103.91, mean reward: 4.00


 16%|█▌        | 321/2000 [1:07:30<10:24:31, 22.32s/it]

Epoch: 320, mean loss: -92.69, mean reward: 6.75


 16%|█▌        | 322/2000 [1:07:56<10:59:35, 23.59s/it]

Epoch: 321, mean loss: -58.32, mean reward: 5.00


 16%|█▌        | 323/2000 [1:08:17<10:40:05, 22.90s/it]

Epoch: 322, mean loss: -91.66, mean reward: 5.00


 16%|█▌        | 324/2000 [1:08:38<10:23:07, 22.31s/it]

Epoch: 323, mean loss: -111.82, mean reward: 7.50


 16%|█▋        | 325/2000 [1:09:01<10:23:44, 22.34s/it]

Epoch: 324, mean loss: -112.89, mean reward: 6.25


 16%|█▋        | 326/2000 [1:09:26<10:43:34, 23.07s/it]

Epoch: 325, mean loss: -97.47, mean reward: 5.00


 16%|█▋        | 327/2000 [1:09:49<10:46:30, 23.19s/it]

Epoch: 326, mean loss: -82.20, mean reward: 3.00


 16%|█▋        | 328/2000 [1:10:14<11:01:27, 23.74s/it]

Epoch: 327, mean loss: -111.32, mean reward: 11.50


 16%|█▋        | 329/2000 [1:10:38<10:59:52, 23.69s/it]

Epoch: 328, mean loss: -90.85, mean reward: 7.25


 16%|█▋        | 330/2000 [1:11:03<11:16:29, 24.31s/it]

Epoch: 329, mean loss: -95.10, mean reward: 2.50


 17%|█▋        | 331/2000 [1:11:27<11:12:01, 24.16s/it]

Epoch: 330, mean loss: -58.44, mean reward: 3.00


 17%|█▋        | 332/2000 [1:11:54<11:35:58, 25.04s/it]

Epoch: 331, mean loss: -79.41, mean reward: 4.50


 17%|█▋        | 333/2000 [1:12:24<12:16:41, 26.52s/it]

Epoch: 332, mean loss: -72.42, mean reward: 3.25


 17%|█▋        | 334/2000 [1:12:53<12:31:08, 27.05s/it]

Epoch: 333, mean loss: -89.50, mean reward: 2.25


 17%|█▋        | 335/2000 [1:13:19<12:25:29, 26.86s/it]

Epoch: 334, mean loss: -77.39, mean reward: 5.75


 17%|█▋        | 336/2000 [1:13:43<12:03:12, 26.08s/it]

Epoch: 335, mean loss: -79.04, mean reward: 6.00


 17%|█▋        | 337/2000 [1:14:09<12:01:18, 26.02s/it]

Epoch: 336, mean loss: -114.85, mean reward: 5.75


 17%|█▋        | 338/2000 [1:14:32<11:35:22, 25.10s/it]

Epoch: 337, mean loss: -98.30, mean reward: 7.75


 17%|█▋        | 339/2000 [1:14:57<11:29:42, 24.91s/it]

Epoch: 338, mean loss: -47.92, mean reward: 6.75


 17%|█▋        | 340/2000 [1:15:20<11:15:08, 24.40s/it]

Epoch: 339, mean loss: -67.53, mean reward: 6.75


 17%|█▋        | 341/2000 [1:15:42<10:59:54, 23.87s/it]

Epoch: 340, mean loss: -116.31, mean reward: 6.75


 17%|█▋        | 342/2000 [1:16:03<10:34:14, 22.95s/it]

Epoch: 341, mean loss: -124.44, mean reward: 9.00


 17%|█▋        | 343/2000 [1:16:26<10:35:20, 23.01s/it]

Epoch: 342, mean loss: -113.80, mean reward: 7.00


 17%|█▋        | 344/2000 [1:16:45<10:00:32, 21.76s/it]

Epoch: 343, mean loss: -108.97, mean reward: 9.75


 17%|█▋        | 345/2000 [1:17:04<9:39:54, 21.02s/it] 

Epoch: 344, mean loss: -66.46, mean reward: 12.50


 17%|█▋        | 346/2000 [1:17:28<9:57:27, 21.67s/it]

Epoch: 345, mean loss: -114.77, mean reward: 7.00


 17%|█▋        | 347/2000 [1:17:50<10:02:44, 21.88s/it]

Epoch: 346, mean loss: -108.01, mean reward: 7.25


 17%|█▋        | 348/2000 [1:18:11<9:53:57, 21.57s/it] 

Epoch: 347, mean loss: -94.53, mean reward: 8.75


 17%|█▋        | 349/2000 [1:18:32<9:49:47, 21.43s/it]

Epoch: 348, mean loss: -102.50, mean reward: 10.75


 18%|█▊        | 350/2000 [1:18:55<10:02:37, 21.91s/it]

Epoch: 349, mean loss: -93.26, mean reward: 7.00


 18%|█▊        | 351/2000 [1:19:18<10:12:33, 22.29s/it]

Epoch: 350, mean loss: -115.30, mean reward: 9.25


 18%|█▊        | 352/2000 [1:19:39<10:02:28, 21.93s/it]

Epoch: 351, mean loss: -94.12, mean reward: 10.00


 18%|█▊        | 353/2000 [1:19:57<9:30:13, 20.77s/it] 

Epoch: 352, mean loss: -127.22, mean reward: 11.50


 18%|█▊        | 354/2000 [1:20:17<9:19:59, 20.41s/it]

Epoch: 353, mean loss: -106.15, mean reward: 11.75


 18%|█▊        | 355/2000 [1:20:39<9:37:15, 21.06s/it]

Epoch: 354, mean loss: -88.85, mean reward: 8.75


 18%|█▊        | 356/2000 [1:21:01<9:44:18, 21.33s/it]

Epoch: 355, mean loss: -135.80, mean reward: 8.50


 18%|█▊        | 357/2000 [1:21:21<9:26:08, 20.67s/it]

Epoch: 356, mean loss: -85.09, mean reward: 11.25


 18%|█▊        | 358/2000 [1:21:42<9:33:47, 20.97s/it]

Epoch: 357, mean loss: -93.09, mean reward: 7.00


 18%|█▊        | 359/2000 [1:22:02<9:24:32, 20.64s/it]

Epoch: 358, mean loss: -116.52, mean reward: 10.00


 18%|█▊        | 360/2000 [1:22:23<9:27:42, 20.77s/it]

Epoch: 359, mean loss: -122.58, mean reward: 8.25


 18%|█▊        | 361/2000 [1:22:42<9:11:51, 20.20s/it]

Epoch: 360, mean loss: -90.52, mean reward: 12.25


 18%|█▊        | 362/2000 [1:22:59<8:45:42, 19.26s/it]

Epoch: 361, mean loss: -92.02, mean reward: 13.75


 18%|█▊        | 363/2000 [1:23:16<8:22:38, 18.42s/it]

Epoch: 362, mean loss: -66.93, mean reward: 15.25


 18%|█▊        | 364/2000 [1:23:36<8:37:52, 18.99s/it]

Epoch: 363, mean loss: -82.01, mean reward: 6.50


 18%|█▊        | 365/2000 [1:23:54<8:33:11, 18.83s/it]

Epoch: 364, mean loss: -88.04, mean reward: 13.75


 18%|█▊        | 366/2000 [1:24:16<8:52:33, 19.56s/it]

Epoch: 365, mean loss: -92.51, mean reward: 10.00


 18%|█▊        | 367/2000 [1:24:38<9:13:59, 20.36s/it]

Epoch: 366, mean loss: -123.03, mean reward: 8.50


 18%|█▊        | 368/2000 [1:24:58<9:15:59, 20.44s/it]

Epoch: 367, mean loss: -95.44, mean reward: 13.75


 18%|█▊        | 369/2000 [1:25:19<9:19:06, 20.57s/it]

Epoch: 368, mean loss: -92.81, mean reward: 13.75


 18%|█▊        | 370/2000 [1:25:38<9:01:55, 19.95s/it]

Epoch: 369, mean loss: -84.96, mean reward: 10.75


 19%|█▊        | 371/2000 [1:26:00<9:22:28, 20.72s/it]

Epoch: 370, mean loss: -100.05, mean reward: 13.50


 19%|█▊        | 372/2000 [1:26:17<8:52:31, 19.63s/it]

Epoch: 371, mean loss: -80.58, mean reward: 14.75


 19%|█▊        | 373/2000 [1:26:35<8:36:56, 19.06s/it]

Epoch: 372, mean loss: -121.16, mean reward: 15.00


 19%|█▊        | 374/2000 [1:26:52<8:19:02, 18.41s/it]

Epoch: 373, mean loss: -63.58, mean reward: 15.00


 19%|█▉        | 375/2000 [1:27:11<8:25:14, 18.66s/it]

Epoch: 374, mean loss: -85.82, mean reward: 13.00


 19%|█▉        | 376/2000 [1:27:28<8:05:36, 17.94s/it]

Epoch: 375, mean loss: -77.55, mean reward: 16.50


 19%|█▉        | 377/2000 [1:27:46<8:13:09, 18.23s/it]

Epoch: 376, mean loss: -154.51, mean reward: 12.25


 19%|█▉        | 378/2000 [1:28:06<8:22:28, 18.59s/it]

Epoch: 377, mean loss: -63.21, mean reward: 14.25


 19%|█▉        | 379/2000 [1:28:22<7:59:10, 17.74s/it]

Epoch: 378, mean loss: -50.81, mean reward: 17.25


 19%|█▉        | 380/2000 [1:28:39<7:53:39, 17.54s/it]

Epoch: 379, mean loss: -78.72, mean reward: 16.25


 19%|█▉        | 381/2000 [1:29:08<9:27:35, 21.03s/it]

Epoch: 380, mean loss: -86.42, mean reward: 15.50


 19%|█▉        | 382/2000 [1:29:27<9:07:16, 20.29s/it]

Epoch: 381, mean loss: -63.09, mean reward: 16.00


 19%|█▉        | 383/2000 [1:29:45<8:52:46, 19.77s/it]

Epoch: 382, mean loss: -56.30, mean reward: 16.50


 19%|█▉        | 384/2000 [1:30:05<8:56:32, 19.92s/it]

Epoch: 383, mean loss: -114.50, mean reward: 15.00


 19%|█▉        | 385/2000 [1:30:22<8:32:26, 19.04s/it]

Epoch: 384, mean loss: -112.25, mean reward: 15.00


 19%|█▉        | 386/2000 [1:30:42<8:34:13, 19.12s/it]

Epoch: 385, mean loss: -86.09, mean reward: 15.50


 19%|█▉        | 387/2000 [1:30:59<8:18:49, 18.56s/it]

Epoch: 386, mean loss: -74.55, mean reward: 16.75


 19%|█▉        | 388/2000 [1:31:17<8:11:29, 18.29s/it]

Epoch: 387, mean loss: -90.08, mean reward: 15.25


 19%|█▉        | 389/2000 [1:31:35<8:10:02, 18.25s/it]

Epoch: 388, mean loss: -84.85, mean reward: 12.50


 20%|█▉        | 390/2000 [1:31:52<8:05:38, 18.10s/it]

Epoch: 389, mean loss: -122.52, mean reward: 15.75


 20%|█▉        | 391/2000 [1:32:09<7:53:04, 17.64s/it]

Epoch: 390, mean loss: -87.08, mean reward: 17.00


 20%|█▉        | 392/2000 [1:32:25<7:36:21, 17.03s/it]

Epoch: 391, mean loss: -75.47, mean reward: 17.25


 20%|█▉        | 393/2000 [1:32:40<7:21:59, 16.50s/it]

Epoch: 392, mean loss: -83.57, mean reward: 17.00


 20%|█▉        | 394/2000 [1:32:56<7:16:00, 16.29s/it]

Epoch: 393, mean loss: -88.11, mean reward: 16.00


 20%|█▉        | 395/2000 [1:33:12<7:14:27, 16.24s/it]

Epoch: 394, mean loss: -102.01, mean reward: 16.50


 20%|█▉        | 396/2000 [1:33:28<7:15:48, 16.30s/it]

Epoch: 395, mean loss: -96.12, mean reward: 15.25


 20%|█▉        | 397/2000 [1:33:46<7:28:37, 16.79s/it]

Epoch: 396, mean loss: -76.10, mean reward: 16.25


 20%|█▉        | 398/2000 [1:34:04<7:38:25, 17.17s/it]

Epoch: 397, mean loss: -65.42, mean reward: 15.50


 20%|█▉        | 399/2000 [1:34:20<7:29:27, 16.84s/it]

Epoch: 398, mean loss: -86.36, mean reward: 16.00


 20%|██        | 400/2000 [1:34:37<7:28:32, 16.82s/it]

Epoch: 399, mean loss: -79.92, mean reward: 17.25


 20%|██        | 401/2000 [1:34:53<7:17:07, 16.40s/it]

Epoch: 400, mean loss: -75.70, mean reward: 17.25


 20%|██        | 402/2000 [1:35:10<7:22:58, 16.63s/it]

Epoch: 401, mean loss: -97.17, mean reward: 14.50


 20%|██        | 403/2000 [1:35:27<7:32:09, 16.99s/it]

Epoch: 402, mean loss: -91.55, mean reward: 14.50


 20%|██        | 404/2000 [1:35:43<7:22:26, 16.63s/it]

Epoch: 403, mean loss: -61.42, mean reward: 15.25


 20%|██        | 405/2000 [1:36:00<7:20:28, 16.57s/it]

Epoch: 404, mean loss: -86.82, mean reward: 15.75


 20%|██        | 406/2000 [1:36:15<7:13:18, 16.31s/it]

Epoch: 405, mean loss: -98.88, mean reward: 15.75


 20%|██        | 407/2000 [1:36:32<7:17:01, 16.46s/it]

Epoch: 406, mean loss: -97.29, mean reward: 16.00


 20%|██        | 408/2000 [1:36:49<7:15:34, 16.42s/it]

Epoch: 407, mean loss: -62.75, mean reward: 18.00


 20%|██        | 409/2000 [1:37:04<7:08:42, 16.17s/it]

Epoch: 408, mean loss: -79.75, mean reward: 17.00


 20%|██        | 410/2000 [1:37:20<7:02:29, 15.94s/it]

Epoch: 409, mean loss: -69.23, mean reward: 19.25


 21%|██        | 411/2000 [1:37:38<7:20:39, 16.64s/it]

Epoch: 410, mean loss: -69.38, mean reward: 16.75


 21%|██        | 412/2000 [1:37:59<7:54:17, 17.92s/it]

Epoch: 411, mean loss: -76.37, mean reward: 18.00


 21%|██        | 413/2000 [1:38:17<7:56:32, 18.02s/it]

Epoch: 412, mean loss: -101.37, mean reward: 16.00


 21%|██        | 414/2000 [1:38:36<8:02:26, 18.25s/it]

Epoch: 413, mean loss: -76.79, mean reward: 14.75


 21%|██        | 415/2000 [1:38:53<7:57:07, 18.06s/it]

Epoch: 414, mean loss: -91.86, mean reward: 17.25


 21%|██        | 416/2000 [1:39:13<8:05:11, 18.38s/it]

Epoch: 415, mean loss: -101.19, mean reward: 17.25


 21%|██        | 417/2000 [1:39:32<8:10:55, 18.61s/it]

Epoch: 416, mean loss: -65.59, mean reward: 18.75


 21%|██        | 418/2000 [1:39:51<8:13:01, 18.70s/it]

Epoch: 417, mean loss: -68.92, mean reward: 17.25


 21%|██        | 419/2000 [1:40:08<8:03:54, 18.36s/it]

Epoch: 418, mean loss: -78.57, mean reward: 16.75


 21%|██        | 420/2000 [1:40:23<7:39:15, 17.44s/it]

Epoch: 419, mean loss: -71.06, mean reward: 19.25


 21%|██        | 421/2000 [1:40:41<7:38:18, 17.41s/it]

Epoch: 420, mean loss: -69.12, mean reward: 19.25


 21%|██        | 422/2000 [1:40:58<7:35:53, 17.33s/it]

Epoch: 421, mean loss: -83.45, mean reward: 17.50


 21%|██        | 423/2000 [1:41:13<7:19:40, 16.73s/it]

Epoch: 422, mean loss: -56.76, mean reward: 19.00


 21%|██        | 424/2000 [1:41:31<7:28:50, 17.09s/it]

Epoch: 423, mean loss: -82.97, mean reward: 18.00


 21%|██▏       | 425/2000 [1:41:46<7:10:49, 16.41s/it]

Epoch: 424, mean loss: -49.00, mean reward: 18.75


 21%|██▏       | 426/2000 [1:42:03<7:11:35, 16.45s/it]

Epoch: 425, mean loss: -86.86, mean reward: 18.00


 21%|██▏       | 427/2000 [1:42:18<7:04:16, 16.18s/it]

Epoch: 426, mean loss: -75.19, mean reward: 18.75


 21%|██▏       | 428/2000 [1:42:32<6:45:59, 15.50s/it]

Epoch: 427, mean loss: -54.75, mean reward: 19.75


 21%|██▏       | 429/2000 [1:42:47<6:43:55, 15.43s/it]

Epoch: 428, mean loss: -47.59, mean reward: 19.50


 22%|██▏       | 430/2000 [1:43:03<6:49:50, 15.66s/it]

Epoch: 429, mean loss: -95.68, mean reward: 19.25


 22%|██▏       | 431/2000 [1:43:25<7:33:26, 17.34s/it]

Epoch: 430, mean loss: -92.76, mean reward: 16.25


 22%|██▏       | 432/2000 [1:43:42<7:33:24, 17.35s/it]

Epoch: 431, mean loss: -97.72, mean reward: 15.75


 22%|██▏       | 433/2000 [1:43:55<7:01:57, 16.16s/it]

Epoch: 432, mean loss: -49.78, mean reward: 19.50


 22%|██▏       | 434/2000 [1:44:13<7:10:27, 16.49s/it]

Epoch: 433, mean loss: -77.56, mean reward: 19.00


 22%|██▏       | 435/2000 [1:44:29<7:04:24, 16.27s/it]

Epoch: 434, mean loss: -69.48, mean reward: 17.75


 22%|██▏       | 436/2000 [1:44:46<7:14:39, 16.68s/it]

Epoch: 435, mean loss: -112.67, mean reward: 15.25


 22%|██▏       | 437/2000 [1:45:00<6:55:31, 15.95s/it]

Epoch: 436, mean loss: -61.79, mean reward: 18.75


 22%|██▏       | 438/2000 [1:45:21<7:32:31, 17.38s/it]

Epoch: 437, mean loss: -76.83, mean reward: 19.75


 22%|██▏       | 439/2000 [1:45:38<7:27:15, 17.19s/it]

Epoch: 438, mean loss: -73.19, mean reward: 18.75


 22%|██▏       | 440/2000 [1:45:52<7:01:40, 16.22s/it]

Epoch: 439, mean loss: -64.56, mean reward: 18.75


 22%|██▏       | 441/2000 [1:46:08<6:59:24, 16.14s/it]

Epoch: 440, mean loss: -65.70, mean reward: 18.50


 22%|██▏       | 442/2000 [1:46:24<7:00:32, 16.20s/it]

Epoch: 441, mean loss: -62.99, mean reward: 18.00


 22%|██▏       | 443/2000 [1:46:39<6:50:34, 15.82s/it]

Epoch: 442, mean loss: -67.77, mean reward: 19.00


 22%|██▏       | 444/2000 [1:46:52<6:30:01, 15.04s/it]

Epoch: 443, mean loss: -20.78, mean reward: 20.50


 22%|██▏       | 445/2000 [1:47:07<6:25:18, 14.87s/it]

Epoch: 444, mean loss: -67.11, mean reward: 18.25


 22%|██▏       | 446/2000 [1:47:22<6:28:00, 14.98s/it]

Epoch: 445, mean loss: -66.98, mean reward: 19.25


 22%|██▏       | 447/2000 [1:47:38<6:37:45, 15.37s/it]

Epoch: 446, mean loss: -95.78, mean reward: 15.25


 22%|██▏       | 448/2000 [1:47:53<6:35:59, 15.31s/it]

Epoch: 447, mean loss: -83.37, mean reward: 17.75


 22%|██▏       | 449/2000 [1:48:08<6:27:55, 15.01s/it]

Epoch: 448, mean loss: -67.06, mean reward: 20.00


 22%|██▎       | 450/2000 [1:48:24<6:35:54, 15.33s/it]

Epoch: 449, mean loss: -76.47, mean reward: 18.75


 23%|██▎       | 451/2000 [1:48:40<6:45:38, 15.71s/it]

Epoch: 450, mean loss: -103.22, mean reward: 17.00


 23%|██▎       | 452/2000 [1:48:56<6:45:44, 15.73s/it]

Epoch: 451, mean loss: -57.08, mean reward: 19.00


 23%|██▎       | 453/2000 [1:49:12<6:46:49, 15.78s/it]

Epoch: 452, mean loss: -79.40, mean reward: 18.50


 23%|██▎       | 454/2000 [1:49:27<6:42:28, 15.62s/it]

Epoch: 453, mean loss: -70.98, mean reward: 19.00


 23%|██▎       | 455/2000 [1:49:42<6:36:15, 15.39s/it]

Epoch: 454, mean loss: -66.26, mean reward: 19.75


 23%|██▎       | 456/2000 [1:49:55<6:18:06, 14.69s/it]

Epoch: 455, mean loss: -40.73, mean reward: 19.50


 23%|██▎       | 457/2000 [1:50:09<6:08:58, 14.35s/it]

Epoch: 456, mean loss: -39.31, mean reward: 20.00


 23%|██▎       | 458/2000 [1:50:23<6:07:19, 14.29s/it]

Epoch: 457, mean loss: -69.15, mean reward: 19.00


 23%|██▎       | 459/2000 [1:50:39<6:19:14, 14.77s/it]

Epoch: 458, mean loss: -108.96, mean reward: 17.75


 23%|██▎       | 460/2000 [1:50:53<6:15:33, 14.63s/it]

Epoch: 459, mean loss: -51.84, mean reward: 19.75


 23%|██▎       | 461/2000 [1:51:09<6:24:33, 14.99s/it]

Epoch: 460, mean loss: -51.11, mean reward: 18.75


 23%|██▎       | 462/2000 [1:51:25<6:33:15, 15.34s/it]

Epoch: 461, mean loss: -64.70, mean reward: 18.25


 23%|██▎       | 463/2000 [1:51:42<6:46:43, 15.88s/it]

Epoch: 462, mean loss: -53.67, mean reward: 19.75


 23%|██▎       | 464/2000 [1:51:59<6:52:17, 16.11s/it]

Epoch: 463, mean loss: -48.58, mean reward: 17.50


 23%|██▎       | 465/2000 [1:52:18<7:18:37, 17.14s/it]

Epoch: 464, mean loss: -74.24, mean reward: 17.25


 23%|██▎       | 466/2000 [1:52:37<7:31:25, 17.66s/it]

Epoch: 465, mean loss: -61.35, mean reward: 20.25


 23%|██▎       | 467/2000 [1:52:55<7:30:14, 17.62s/it]

Epoch: 466, mean loss: -54.46, mean reward: 20.00


 23%|██▎       | 468/2000 [1:53:12<7:22:52, 17.35s/it]

Epoch: 467, mean loss: -45.85, mean reward: 19.75


 23%|██▎       | 469/2000 [1:53:32<7:47:32, 18.32s/it]

Epoch: 468, mean loss: -59.15, mean reward: 17.00


 24%|██▎       | 470/2000 [1:53:51<7:54:03, 18.59s/it]

Epoch: 469, mean loss: -58.98, mean reward: 19.00


 24%|██▎       | 471/2000 [1:54:10<7:54:05, 18.60s/it]

Epoch: 470, mean loss: -46.66, mean reward: 19.75


 24%|██▎       | 472/2000 [1:54:30<8:02:43, 18.96s/it]

Epoch: 471, mean loss: -44.97, mean reward: 19.25


 24%|██▎       | 473/2000 [1:54:46<7:42:01, 18.15s/it]

Epoch: 472, mean loss: -62.61, mean reward: 18.75


 24%|██▎       | 474/2000 [1:55:02<7:26:00, 17.54s/it]

Epoch: 473, mean loss: -62.26, mean reward: 19.50


 24%|██▍       | 475/2000 [1:55:20<7:27:30, 17.61s/it]

Epoch: 474, mean loss: -71.49, mean reward: 19.25


 24%|██▍       | 476/2000 [1:55:37<7:26:04, 17.56s/it]

Epoch: 475, mean loss: -60.26, mean reward: 19.75


 24%|██▍       | 477/2000 [1:55:53<7:13:21, 17.07s/it]

Epoch: 476, mean loss: -45.44, mean reward: 19.50


 24%|██▍       | 478/2000 [1:56:11<7:14:43, 17.14s/it]

Epoch: 477, mean loss: -50.62, mean reward: 18.25


 24%|██▍       | 479/2000 [1:56:25<6:57:29, 16.47s/it]

Epoch: 478, mean loss: -58.67, mean reward: 20.00


 24%|██▍       | 480/2000 [1:56:41<6:53:04, 16.31s/it]

Epoch: 479, mean loss: -32.48, mean reward: 20.25


 24%|██▍       | 481/2000 [1:56:59<7:00:00, 16.59s/it]

Epoch: 480, mean loss: -67.82, mean reward: 18.50


 24%|██▍       | 482/2000 [1:57:15<7:00:09, 16.61s/it]

Epoch: 481, mean loss: -34.12, mean reward: 20.00


 24%|██▍       | 483/2000 [1:57:31<6:51:58, 16.29s/it]

Epoch: 482, mean loss: -56.62, mean reward: 19.50


 24%|██▍       | 484/2000 [1:57:51<7:22:24, 17.51s/it]

Epoch: 483, mean loss: -72.88, mean reward: 19.75


 24%|██▍       | 485/2000 [1:58:08<7:16:53, 17.30s/it]

Epoch: 484, mean loss: -55.90, mean reward: 20.00


 24%|██▍       | 486/2000 [1:58:25<7:12:24, 17.14s/it]

Epoch: 485, mean loss: -47.38, mean reward: 20.50


 24%|██▍       | 487/2000 [1:58:44<7:25:32, 17.67s/it]

Epoch: 486, mean loss: -54.19, mean reward: 19.75


 24%|██▍       | 488/2000 [1:58:59<7:05:04, 16.87s/it]

Epoch: 487, mean loss: -59.63, mean reward: 20.00


 24%|██▍       | 489/2000 [1:59:14<6:55:50, 16.51s/it]

Epoch: 488, mean loss: -48.89, mean reward: 18.25


 24%|██▍       | 490/2000 [1:59:32<7:03:53, 16.84s/it]

Epoch: 489, mean loss: -74.87, mean reward: 18.00


 25%|██▍       | 491/2000 [1:59:49<7:05:31, 16.92s/it]

Epoch: 490, mean loss: -31.40, mean reward: 18.75


 25%|██▍       | 492/2000 [2:00:08<7:17:37, 17.41s/it]

Epoch: 491, mean loss: -48.10, mean reward: 18.50


 25%|██▍       | 493/2000 [2:00:23<7:04:14, 16.89s/it]

Epoch: 492, mean loss: -26.69, mean reward: 19.75


 25%|██▍       | 494/2000 [2:00:43<7:28:11, 17.86s/it]

Epoch: 493, mean loss: -61.20, mean reward: 19.00


 25%|██▍       | 495/2000 [2:01:00<7:19:15, 17.51s/it]

Epoch: 494, mean loss: -70.94, mean reward: 19.00


 25%|██▍       | 496/2000 [2:01:16<7:05:57, 16.99s/it]

Epoch: 495, mean loss: -52.05, mean reward: 18.75


 25%|██▍       | 497/2000 [2:01:33<7:08:46, 17.12s/it]

Epoch: 496, mean loss: -47.84, mean reward: 18.50


 25%|██▍       | 498/2000 [2:01:50<7:05:45, 17.01s/it]

Epoch: 497, mean loss: -27.71, mean reward: 20.25


 25%|██▍       | 499/2000 [2:02:06<6:55:33, 16.61s/it]

Epoch: 498, mean loss: -47.91, mean reward: 20.00


 25%|██▌       | 500/2000 [2:02:20<6:34:31, 15.78s/it]

Epoch: 499, mean loss: -36.74, mean reward: 19.25


 25%|██▌       | 501/2000 [2:02:39<6:59:34, 16.79s/it]

Epoch: 500, mean loss: -40.49, mean reward: 19.25


 25%|██▌       | 502/2000 [2:02:56<7:02:02, 16.90s/it]

Epoch: 501, mean loss: -40.14, mean reward: 20.00


 25%|██▌       | 503/2000 [2:03:13<7:03:41, 16.98s/it]

Epoch: 502, mean loss: -60.01, mean reward: 19.25


 25%|██▌       | 504/2000 [2:03:29<6:52:42, 16.55s/it]

Epoch: 503, mean loss: -54.91, mean reward: 20.25


 25%|██▌       | 505/2000 [2:03:44<6:46:47, 16.33s/it]

Epoch: 504, mean loss: -22.98, mean reward: 20.50


 25%|██▌       | 506/2000 [2:04:01<6:49:02, 16.43s/it]

Epoch: 505, mean loss: -43.69, mean reward: 19.00


 25%|██▌       | 507/2000 [2:04:19<7:01:40, 16.95s/it]

Epoch: 506, mean loss: -73.44, mean reward: 19.25


 25%|██▌       | 508/2000 [2:04:40<7:26:02, 17.94s/it]

Epoch: 507, mean loss: -20.12, mean reward: 19.50


 25%|██▌       | 509/2000 [2:04:57<7:24:59, 17.91s/it]

Epoch: 508, mean loss: -64.50, mean reward: 18.00


 26%|██▌       | 510/2000 [2:05:14<7:18:31, 17.66s/it]

Epoch: 509, mean loss: -42.27, mean reward: 19.50


 26%|██▌       | 511/2000 [2:05:31<7:09:23, 17.30s/it]

Epoch: 510, mean loss: -44.35, mean reward: 19.00


 26%|██▌       | 512/2000 [2:05:48<7:06:19, 17.19s/it]

Epoch: 511, mean loss: -44.26, mean reward: 18.50


 26%|██▌       | 513/2000 [2:06:04<6:54:40, 16.73s/it]

Epoch: 512, mean loss: -48.75, mean reward: 19.75


 26%|██▌       | 514/2000 [2:06:20<6:52:55, 16.67s/it]

Epoch: 513, mean loss: -65.07, mean reward: 19.75


 26%|██▌       | 515/2000 [2:06:37<6:55:50, 16.80s/it]

Epoch: 514, mean loss: -40.91, mean reward: 19.25


 26%|██▌       | 516/2000 [2:06:52<6:39:32, 16.15s/it]

Epoch: 515, mean loss: -43.14, mean reward: 20.00


 26%|██▌       | 517/2000 [2:07:07<6:34:24, 15.96s/it]

Epoch: 516, mean loss: -32.05, mean reward: 20.25


 26%|██▌       | 518/2000 [2:07:23<6:29:03, 15.75s/it]

Epoch: 517, mean loss: -33.74, mean reward: 19.50


 26%|██▌       | 519/2000 [2:07:39<6:37:31, 16.11s/it]

Epoch: 518, mean loss: -44.72, mean reward: 17.25


 26%|██▌       | 520/2000 [2:07:55<6:31:35, 15.88s/it]

Epoch: 519, mean loss: -38.53, mean reward: 19.50


 26%|██▌       | 521/2000 [2:08:09<6:20:56, 15.45s/it]

Epoch: 520, mean loss: -33.72, mean reward: 20.25


 26%|██▌       | 522/2000 [2:08:25<6:25:38, 15.66s/it]

Epoch: 521, mean loss: -57.67, mean reward: 20.00


 26%|██▌       | 523/2000 [2:08:43<6:42:30, 16.35s/it]

Epoch: 522, mean loss: -42.39, mean reward: 19.75


 26%|██▌       | 524/2000 [2:09:00<6:43:56, 16.42s/it]

Epoch: 523, mean loss: -57.84, mean reward: 19.00


 26%|██▋       | 525/2000 [2:09:17<6:51:25, 16.74s/it]

Epoch: 524, mean loss: -44.17, mean reward: 19.00


 26%|██▋       | 526/2000 [2:09:34<6:46:06, 16.53s/it]

Epoch: 525, mean loss: -37.27, mean reward: 20.25


 26%|██▋       | 527/2000 [2:09:48<6:33:38, 16.03s/it]

Epoch: 526, mean loss: -60.79, mean reward: 20.00


 26%|██▋       | 528/2000 [2:10:02<6:15:15, 15.30s/it]

Epoch: 527, mean loss: -39.98, mean reward: 20.75


 26%|██▋       | 529/2000 [2:10:17<6:16:50, 15.37s/it]

Epoch: 528, mean loss: -54.54, mean reward: 20.00


 26%|██▋       | 530/2000 [2:10:33<6:15:15, 15.32s/it]

Epoch: 529, mean loss: -52.08, mean reward: 20.00


 27%|██▋       | 531/2000 [2:10:48<6:18:27, 15.46s/it]

Epoch: 530, mean loss: -22.58, mean reward: 20.75


 27%|██▋       | 532/2000 [2:11:05<6:29:10, 15.91s/it]

Epoch: 531, mean loss: -53.37, mean reward: 17.75


 27%|██▋       | 533/2000 [2:11:21<6:28:44, 15.90s/it]

Epoch: 532, mean loss: -44.30, mean reward: 19.25


 27%|██▋       | 534/2000 [2:11:35<6:15:54, 15.38s/it]

Epoch: 533, mean loss: -27.53, mean reward: 20.50


 27%|██▋       | 535/2000 [2:11:51<6:15:35, 15.38s/it]

Epoch: 534, mean loss: -36.46, mean reward: 20.50


 27%|██▋       | 536/2000 [2:12:06<6:13:52, 15.32s/it]

Epoch: 535, mean loss: -37.33, mean reward: 20.50


 27%|██▋       | 537/2000 [2:12:21<6:08:00, 15.09s/it]

Epoch: 536, mean loss: -48.50, mean reward: 20.00


 27%|██▋       | 538/2000 [2:12:35<6:02:51, 14.89s/it]

Epoch: 537, mean loss: -42.20, mean reward: 19.50


 27%|██▋       | 539/2000 [2:12:49<5:58:48, 14.74s/it]

Epoch: 538, mean loss: -40.48, mean reward: 20.25


 27%|██▋       | 540/2000 [2:13:04<6:00:26, 14.81s/it]

Epoch: 539, mean loss: -30.59, mean reward: 19.75


 27%|██▋       | 541/2000 [2:13:20<6:08:56, 15.17s/it]

Epoch: 540, mean loss: -51.59, mean reward: 19.50


 27%|██▋       | 542/2000 [2:13:37<6:15:25, 15.45s/it]

Epoch: 541, mean loss: -53.45, mean reward: 20.75


 27%|██▋       | 543/2000 [2:13:52<6:13:53, 15.40s/it]

Epoch: 542, mean loss: -35.15, mean reward: 20.25


 27%|██▋       | 544/2000 [2:14:08<6:20:25, 15.68s/it]

Epoch: 543, mean loss: -50.36, mean reward: 20.25


 27%|██▋       | 545/2000 [2:14:23<6:13:39, 15.41s/it]

Epoch: 544, mean loss: -37.17, mean reward: 20.00


 27%|██▋       | 546/2000 [2:14:39<6:17:23, 15.57s/it]

Epoch: 545, mean loss: -33.93, mean reward: 19.50


 27%|██▋       | 547/2000 [2:14:55<6:23:30, 15.84s/it]

Epoch: 546, mean loss: -39.79, mean reward: 19.75


 27%|██▋       | 548/2000 [2:15:12<6:29:52, 16.11s/it]

Epoch: 547, mean loss: -59.10, mean reward: 19.50


 27%|██▋       | 549/2000 [2:15:29<6:35:37, 16.36s/it]

Epoch: 548, mean loss: -43.84, mean reward: 19.25


 28%|██▊       | 550/2000 [2:15:44<6:26:22, 15.99s/it]

Epoch: 549, mean loss: -52.57, mean reward: 20.50


 28%|██▊       | 551/2000 [2:16:00<6:23:06, 15.86s/it]

Epoch: 550, mean loss: -65.19, mean reward: 20.00


 28%|██▊       | 552/2000 [2:16:16<6:23:30, 15.89s/it]

Epoch: 551, mean loss: -39.08, mean reward: 20.00


 28%|██▊       | 553/2000 [2:16:30<6:15:17, 15.56s/it]

Epoch: 552, mean loss: -23.05, mean reward: 20.75


 28%|██▊       | 554/2000 [2:16:45<6:06:21, 15.20s/it]

Epoch: 553, mean loss: -37.46, mean reward: 20.25


 28%|██▊       | 555/2000 [2:17:00<6:08:06, 15.28s/it]

Epoch: 554, mean loss: -55.50, mean reward: 19.50


 28%|██▊       | 556/2000 [2:17:15<6:03:29, 15.10s/it]

Epoch: 555, mean loss: -29.04, mean reward: 21.00


 28%|██▊       | 557/2000 [2:17:29<5:56:28, 14.82s/it]

Epoch: 556, mean loss: -34.83, mean reward: 20.50


 28%|██▊       | 558/2000 [2:17:43<5:48:25, 14.50s/it]

Epoch: 557, mean loss: -30.44, mean reward: 20.00


 28%|██▊       | 559/2000 [2:17:58<5:50:23, 14.59s/it]

Epoch: 558, mean loss: -43.13, mean reward: 20.75


 28%|██▊       | 560/2000 [2:18:13<5:58:04, 14.92s/it]

Epoch: 559, mean loss: -26.22, mean reward: 20.75


 28%|██▊       | 561/2000 [2:18:29<6:03:33, 15.16s/it]

Epoch: 560, mean loss: -32.78, mean reward: 20.25


 28%|██▊       | 562/2000 [2:18:44<6:04:40, 15.22s/it]

Epoch: 561, mean loss: -47.41, mean reward: 19.25


 28%|██▊       | 563/2000 [2:19:00<6:04:02, 15.20s/it]

Epoch: 562, mean loss: -41.60, mean reward: 20.00


 28%|██▊       | 564/2000 [2:19:20<6:43:34, 16.86s/it]

Epoch: 563, mean loss: -38.62, mean reward: 20.50


 28%|██▊       | 565/2000 [2:19:37<6:41:22, 16.78s/it]

Epoch: 564, mean loss: -24.57, mean reward: 20.75


 28%|██▊       | 566/2000 [2:19:54<6:45:30, 16.97s/it]

Epoch: 565, mean loss: -56.61, mean reward: 20.50


 28%|██▊       | 567/2000 [2:20:10<6:38:29, 16.69s/it]

Epoch: 566, mean loss: -52.90, mean reward: 20.75


 28%|██▊       | 568/2000 [2:20:26<6:29:29, 16.32s/it]

Epoch: 567, mean loss: -50.47, mean reward: 20.75


 28%|██▊       | 569/2000 [2:20:42<6:30:25, 16.37s/it]

Epoch: 568, mean loss: -49.32, mean reward: 19.75


 28%|██▊       | 570/2000 [2:20:56<6:14:27, 15.71s/it]

Epoch: 569, mean loss: -33.12, mean reward: 20.75


 29%|██▊       | 571/2000 [2:21:12<6:12:38, 15.65s/it]

Epoch: 570, mean loss: -32.06, mean reward: 20.50


 29%|██▊       | 572/2000 [2:21:29<6:19:06, 15.93s/it]

Epoch: 571, mean loss: -59.71, mean reward: 19.25


 29%|██▊       | 573/2000 [2:21:43<6:06:37, 15.42s/it]

Epoch: 572, mean loss: -34.48, mean reward: 20.75


 29%|██▊       | 574/2000 [2:21:57<6:00:25, 15.16s/it]

Epoch: 573, mean loss: -46.34, mean reward: 21.00


 29%|██▉       | 575/2000 [2:22:12<5:57:50, 15.07s/it]

Epoch: 574, mean loss: -36.37, mean reward: 19.25


 29%|██▉       | 576/2000 [2:22:27<5:58:56, 15.12s/it]

Epoch: 575, mean loss: -40.60, mean reward: 18.75


 29%|██▉       | 577/2000 [2:22:42<5:54:51, 14.96s/it]

Epoch: 576, mean loss: -47.37, mean reward: 19.75


 29%|██▉       | 578/2000 [2:22:58<6:03:44, 15.35s/it]

Epoch: 577, mean loss: -62.90, mean reward: 20.25


 29%|██▉       | 579/2000 [2:23:14<6:03:21, 15.34s/it]

Epoch: 578, mean loss: -33.58, mean reward: 21.00


 29%|██▉       | 580/2000 [2:23:30<6:08:06, 15.55s/it]

Epoch: 579, mean loss: -33.86, mean reward: 19.75


 29%|██▉       | 581/2000 [2:23:46<6:16:24, 15.92s/it]

Epoch: 580, mean loss: -53.69, mean reward: 19.75


 29%|██▉       | 582/2000 [2:24:03<6:17:23, 15.97s/it]

Epoch: 581, mean loss: -56.35, mean reward: 20.25


 29%|██▉       | 583/2000 [2:24:18<6:13:50, 15.83s/it]

Epoch: 582, mean loss: -39.89, mean reward: 20.00


 29%|██▉       | 584/2000 [2:24:33<6:09:41, 15.66s/it]

Epoch: 583, mean loss: -27.11, mean reward: 21.00


 29%|██▉       | 585/2000 [2:24:48<5:59:54, 15.26s/it]

Epoch: 584, mean loss: -59.25, mean reward: 20.50


 29%|██▉       | 586/2000 [2:25:03<6:01:46, 15.35s/it]

Epoch: 585, mean loss: -28.96, mean reward: 21.00


 29%|██▉       | 587/2000 [2:25:20<6:09:51, 15.71s/it]

Epoch: 586, mean loss: -24.46, mean reward: 21.00


 29%|██▉       | 588/2000 [2:25:35<6:04:42, 15.50s/it]

Epoch: 587, mean loss: -36.23, mean reward: 20.75


 29%|██▉       | 589/2000 [2:25:50<6:06:15, 15.57s/it]

Epoch: 588, mean loss: -40.14, mean reward: 20.75


 30%|██▉       | 590/2000 [2:26:06<6:02:59, 15.45s/it]

Epoch: 589, mean loss: -53.65, mean reward: 20.50


 30%|██▉       | 591/2000 [2:26:22<6:06:35, 15.61s/it]

Epoch: 590, mean loss: -49.84, mean reward: 19.75


 30%|██▉       | 592/2000 [2:26:35<5:52:42, 15.03s/it]

Epoch: 591, mean loss: -43.69, mean reward: 20.25


 30%|██▉       | 593/2000 [2:26:50<5:51:57, 15.01s/it]

Epoch: 592, mean loss: -44.34, mean reward: 20.50


 30%|██▉       | 594/2000 [2:27:05<5:49:38, 14.92s/it]

Epoch: 593, mean loss: -60.31, mean reward: 19.00


 30%|██▉       | 595/2000 [2:27:20<5:52:01, 15.03s/it]

Epoch: 594, mean loss: -50.23, mean reward: 20.50


 30%|██▉       | 596/2000 [2:27:36<5:56:41, 15.24s/it]

Epoch: 595, mean loss: -42.52, mean reward: 20.75


 30%|██▉       | 597/2000 [2:27:52<6:00:02, 15.40s/it]

Epoch: 596, mean loss: -61.41, mean reward: 19.25


 30%|██▉       | 598/2000 [2:28:06<5:52:33, 15.09s/it]

Epoch: 597, mean loss: -57.82, mean reward: 20.00


 30%|██▉       | 599/2000 [2:28:21<5:52:57, 15.12s/it]

Epoch: 598, mean loss: -51.20, mean reward: 20.50


 30%|███       | 600/2000 [2:28:37<5:55:55, 15.25s/it]

Epoch: 599, mean loss: -48.51, mean reward: 20.75


 30%|███       | 601/2000 [2:28:54<6:09:20, 15.84s/it]

Epoch: 600, mean loss: -61.75, mean reward: 20.25


 30%|███       | 602/2000 [2:29:10<6:08:17, 15.81s/it]

Epoch: 601, mean loss: -56.00, mean reward: 20.75


 30%|███       | 603/2000 [2:29:25<6:02:27, 15.57s/it]

Epoch: 602, mean loss: -52.64, mean reward: 20.25


 30%|███       | 604/2000 [2:29:42<6:13:35, 16.06s/it]

Epoch: 603, mean loss: -62.82, mean reward: 20.00


 30%|███       | 605/2000 [2:29:57<6:04:33, 15.68s/it]

Epoch: 604, mean loss: -42.21, mean reward: 20.75


 30%|███       | 606/2000 [2:30:11<5:56:49, 15.36s/it]

Epoch: 605, mean loss: -64.24, mean reward: 20.50


 30%|███       | 607/2000 [2:30:26<5:52:12, 15.17s/it]

Epoch: 606, mean loss: -55.18, mean reward: 20.50


 30%|███       | 608/2000 [2:30:41<5:51:33, 15.15s/it]

Epoch: 607, mean loss: -58.42, mean reward: 20.00


 30%|███       | 609/2000 [2:30:56<5:49:43, 15.09s/it]

Epoch: 608, mean loss: -40.48, mean reward: 20.50


 30%|███       | 610/2000 [2:31:11<5:44:01, 14.85s/it]

Epoch: 609, mean loss: -54.01, mean reward: 19.75


 31%|███       | 611/2000 [2:31:25<5:38:55, 14.64s/it]

Epoch: 610, mean loss: -31.37, mean reward: 20.50


 31%|███       | 612/2000 [2:31:40<5:43:40, 14.86s/it]

Epoch: 611, mean loss: -38.00, mean reward: 21.00


 31%|███       | 613/2000 [2:31:55<5:44:40, 14.91s/it]

Epoch: 612, mean loss: -43.99, mean reward: 21.00


 31%|███       | 614/2000 [2:32:10<5:45:30, 14.96s/it]

Epoch: 613, mean loss: -44.51, mean reward: 21.00


 31%|███       | 615/2000 [2:32:25<5:43:29, 14.88s/it]

Epoch: 614, mean loss: -30.76, mean reward: 21.00


 31%|███       | 616/2000 [2:32:38<5:29:19, 14.28s/it]

Epoch: 615, mean loss: -42.47, mean reward: 20.25


 31%|███       | 617/2000 [2:32:55<5:46:50, 15.05s/it]

Epoch: 616, mean loss: -60.77, mean reward: 20.25


 31%|███       | 618/2000 [2:33:11<5:59:05, 15.59s/it]

Epoch: 617, mean loss: -41.20, mean reward: 20.75


 31%|███       | 619/2000 [2:33:28<6:05:19, 15.87s/it]

Epoch: 618, mean loss: -58.04, mean reward: 20.00


 31%|███       | 620/2000 [2:33:45<6:14:17, 16.27s/it]

Epoch: 619, mean loss: -60.85, mean reward: 20.25


 31%|███       | 621/2000 [2:34:00<6:01:32, 15.73s/it]

Epoch: 620, mean loss: -39.89, mean reward: 21.00


 31%|███       | 622/2000 [2:34:16<6:03:51, 15.84s/it]

Epoch: 621, mean loss: -53.23, mean reward: 21.00


 31%|███       | 623/2000 [2:34:32<6:06:19, 15.96s/it]

Epoch: 622, mean loss: -60.54, mean reward: 20.25


 31%|███       | 624/2000 [2:34:47<6:01:41, 15.77s/it]

Epoch: 623, mean loss: -45.85, mean reward: 20.50


 31%|███▏      | 625/2000 [2:35:03<5:59:35, 15.69s/it]

Epoch: 624, mean loss: -34.10, mean reward: 20.00


 31%|███▏      | 626/2000 [2:35:19<6:01:32, 15.79s/it]

Epoch: 625, mean loss: -41.21, mean reward: 19.50


 31%|███▏      | 627/2000 [2:35:34<5:57:41, 15.63s/it]

Epoch: 626, mean loss: -54.00, mean reward: 20.75


 31%|███▏      | 628/2000 [2:35:49<5:55:14, 15.54s/it]

Epoch: 627, mean loss: -49.36, mean reward: 20.25


 31%|███▏      | 629/2000 [2:36:04<5:48:33, 15.25s/it]

Epoch: 628, mean loss: -41.57, mean reward: 21.00


 32%|███▏      | 630/2000 [2:36:19<5:43:57, 15.06s/it]

Epoch: 629, mean loss: -41.09, mean reward: 21.00


 32%|███▏      | 631/2000 [2:36:34<5:47:25, 15.23s/it]

Epoch: 630, mean loss: -67.70, mean reward: 20.50


 32%|███▏      | 632/2000 [2:36:49<5:42:34, 15.03s/it]

Epoch: 631, mean loss: -43.55, mean reward: 21.00


 32%|███▏      | 633/2000 [2:37:04<5:44:39, 15.13s/it]

Epoch: 632, mean loss: -58.47, mean reward: 20.75


 32%|███▏      | 634/2000 [2:37:19<5:45:34, 15.18s/it]

Epoch: 633, mean loss: -53.17, mean reward: 20.50


 32%|███▏      | 635/2000 [2:37:35<5:45:06, 15.17s/it]

Epoch: 634, mean loss: -40.11, mean reward: 21.00


 32%|███▏      | 636/2000 [2:37:51<5:52:57, 15.53s/it]

Epoch: 635, mean loss: -69.79, mean reward: 19.50


 32%|███▏      | 637/2000 [2:38:06<5:50:00, 15.41s/it]

Epoch: 636, mean loss: -48.54, mean reward: 21.00


 32%|███▏      | 638/2000 [2:38:21<5:45:21, 15.21s/it]

Epoch: 637, mean loss: -46.53, mean reward: 21.00


 32%|███▏      | 639/2000 [2:38:37<5:50:11, 15.44s/it]

Epoch: 638, mean loss: -55.87, mean reward: 20.00


 32%|███▏      | 640/2000 [2:38:53<5:52:03, 15.53s/it]

Epoch: 639, mean loss: -58.08, mean reward: 20.75


 32%|███▏      | 641/2000 [2:39:08<5:50:04, 15.46s/it]

Epoch: 640, mean loss: -39.97, mean reward: 20.75


 32%|███▏      | 642/2000 [2:39:24<5:53:29, 15.62s/it]

Epoch: 641, mean loss: -73.11, mean reward: 20.50


 32%|███▏      | 643/2000 [2:39:40<5:57:25, 15.80s/it]

Epoch: 642, mean loss: -57.81, mean reward: 20.00


 32%|███▏      | 644/2000 [2:39:55<5:50:54, 15.53s/it]

Epoch: 643, mean loss: -28.50, mean reward: 20.50


 32%|███▏      | 645/2000 [2:40:10<5:50:57, 15.54s/it]

Epoch: 644, mean loss: -70.78, mean reward: 20.25


 32%|███▏      | 646/2000 [2:40:26<5:53:43, 15.67s/it]

Epoch: 645, mean loss: -46.56, mean reward: 20.25


 32%|███▏      | 647/2000 [2:40:42<5:51:06, 15.57s/it]

Epoch: 646, mean loss: -73.39, mean reward: 20.50


 32%|███▏      | 648/2000 [2:40:57<5:51:26, 15.60s/it]

Epoch: 647, mean loss: -51.89, mean reward: 20.50


 32%|███▏      | 649/2000 [2:41:13<5:52:31, 15.66s/it]

Epoch: 648, mean loss: -52.83, mean reward: 20.75


 32%|███▎      | 650/2000 [2:41:29<5:49:36, 15.54s/it]

Epoch: 649, mean loss: -60.94, mean reward: 20.75


 33%|███▎      | 651/2000 [2:41:44<5:48:15, 15.49s/it]

Epoch: 650, mean loss: -42.69, mean reward: 20.25


 33%|███▎      | 652/2000 [2:41:59<5:47:09, 15.45s/it]

Epoch: 651, mean loss: -53.17, mean reward: 20.25


 33%|███▎      | 653/2000 [2:42:16<5:52:28, 15.70s/it]

Epoch: 652, mean loss: -68.16, mean reward: 20.25


 33%|███▎      | 654/2000 [2:42:31<5:53:13, 15.75s/it]

Epoch: 653, mean loss: -55.06, mean reward: 20.25


 33%|███▎      | 655/2000 [2:42:47<5:50:06, 15.62s/it]

Epoch: 654, mean loss: -63.43, mean reward: 20.25


 33%|███▎      | 656/2000 [2:43:03<5:52:22, 15.73s/it]

Epoch: 655, mean loss: -44.52, mean reward: 20.50


 33%|███▎      | 657/2000 [2:43:18<5:47:29, 15.52s/it]

Epoch: 656, mean loss: -48.63, mean reward: 20.25


 33%|███▎      | 658/2000 [2:43:33<5:43:01, 15.34s/it]

Epoch: 657, mean loss: -64.10, mean reward: 20.50


 33%|███▎      | 659/2000 [2:43:51<6:01:26, 16.17s/it]

Epoch: 658, mean loss: -60.40, mean reward: 19.50


 33%|███▎      | 660/2000 [2:44:07<6:02:45, 16.24s/it]

Epoch: 659, mean loss: -54.45, mean reward: 20.50


 33%|███▎      | 661/2000 [2:44:25<6:14:07, 16.76s/it]

Epoch: 660, mean loss: -66.20, mean reward: 20.50


 33%|███▎      | 662/2000 [2:44:41<6:10:59, 16.64s/it]

Epoch: 661, mean loss: -42.39, mean reward: 21.00


 33%|███▎      | 663/2000 [2:44:58<6:10:57, 16.65s/it]

Epoch: 662, mean loss: -75.70, mean reward: 19.75


 33%|███▎      | 664/2000 [2:45:13<6:00:08, 16.17s/it]

Epoch: 663, mean loss: -57.89, mean reward: 20.50


 33%|███▎      | 665/2000 [2:45:29<5:55:34, 15.98s/it]

Epoch: 664, mean loss: -47.48, mean reward: 21.00


 33%|███▎      | 666/2000 [2:45:44<5:49:39, 15.73s/it]

Epoch: 665, mean loss: -48.11, mean reward: 20.75


 33%|███▎      | 667/2000 [2:46:00<5:54:15, 15.95s/it]

Epoch: 666, mean loss: -64.02, mean reward: 20.00


 33%|███▎      | 668/2000 [2:46:16<5:52:25, 15.87s/it]

Epoch: 667, mean loss: -35.37, mean reward: 20.75


 33%|███▎      | 669/2000 [2:46:31<5:48:22, 15.70s/it]

Epoch: 668, mean loss: -42.67, mean reward: 20.75


 34%|███▎      | 670/2000 [2:46:46<5:42:49, 15.47s/it]

Epoch: 669, mean loss: -64.25, mean reward: 20.25


 34%|███▎      | 671/2000 [2:47:02<5:42:01, 15.44s/it]

Epoch: 670, mean loss: -29.49, mean reward: 20.50


 34%|███▎      | 672/2000 [2:47:17<5:41:30, 15.43s/it]

Epoch: 671, mean loss: -28.29, mean reward: 20.50


 34%|███▎      | 673/2000 [2:47:34<5:52:30, 15.94s/it]

Epoch: 672, mean loss: -36.60, mean reward: 19.75


 34%|███▎      | 674/2000 [2:47:49<5:46:12, 15.67s/it]

Epoch: 673, mean loss: -55.55, mean reward: 20.75


 34%|███▍      | 675/2000 [2:48:04<5:40:34, 15.42s/it]

Epoch: 674, mean loss: -24.80, mean reward: 20.75


 34%|███▍      | 676/2000 [2:48:19<5:37:00, 15.27s/it]

Epoch: 675, mean loss: -43.07, mean reward: 20.75


 34%|███▍      | 677/2000 [2:48:36<5:49:09, 15.83s/it]

Epoch: 676, mean loss: -68.92, mean reward: 20.25


 34%|███▍      | 678/2000 [2:48:50<5:38:21, 15.36s/it]

Epoch: 677, mean loss: -15.80, mean reward: 21.00


 34%|███▍      | 679/2000 [2:49:07<5:46:55, 15.76s/it]

Epoch: 678, mean loss: -43.63, mean reward: 20.50


 34%|███▍      | 680/2000 [2:49:24<5:51:56, 16.00s/it]

Epoch: 679, mean loss: -42.35, mean reward: 20.50


 34%|███▍      | 681/2000 [2:49:39<5:44:36, 15.68s/it]

Epoch: 680, mean loss: -40.78, mean reward: 20.50


 34%|███▍      | 682/2000 [2:49:54<5:40:53, 15.52s/it]

Epoch: 681, mean loss: -30.06, mean reward: 20.75


 34%|███▍      | 683/2000 [2:50:09<5:36:44, 15.34s/it]

Epoch: 682, mean loss: -39.19, mean reward: 20.50


 34%|███▍      | 684/2000 [2:50:23<5:31:31, 15.11s/it]

Epoch: 683, mean loss: -21.85, mean reward: 21.00


 34%|███▍      | 685/2000 [2:50:39<5:38:31, 15.45s/it]

Epoch: 684, mean loss: -15.60, mean reward: 20.75


 34%|███▍      | 686/2000 [2:50:54<5:34:38, 15.28s/it]

Epoch: 685, mean loss: -40.00, mean reward: 20.75


 34%|███▍      | 687/2000 [2:51:11<5:40:59, 15.58s/it]

Epoch: 686, mean loss: -34.37, mean reward: 20.50


 34%|███▍      | 688/2000 [2:51:26<5:42:03, 15.64s/it]

Epoch: 687, mean loss: -42.08, mean reward: 20.00


 34%|███▍      | 689/2000 [2:51:43<5:46:11, 15.84s/it]

Epoch: 688, mean loss: -8.27, mean reward: 21.00


 34%|███▍      | 690/2000 [2:51:59<5:47:35, 15.92s/it]

Epoch: 689, mean loss: -26.27, mean reward: 20.50


 35%|███▍      | 691/2000 [2:52:14<5:42:05, 15.68s/it]

Epoch: 690, mean loss: -20.63, mean reward: 20.75


 35%|███▍      | 692/2000 [2:52:29<5:39:29, 15.57s/it]

Epoch: 691, mean loss: -28.86, mean reward: 21.00


 35%|███▍      | 693/2000 [2:52:45<5:37:25, 15.49s/it]

Epoch: 692, mean loss: -44.55, mean reward: 20.25


 35%|███▍      | 694/2000 [2:53:00<5:37:51, 15.52s/it]

Epoch: 693, mean loss: -31.13, mean reward: 20.75


 35%|███▍      | 695/2000 [2:53:17<5:47:50, 15.99s/it]

Epoch: 694, mean loss: -26.09, mean reward: 20.50


 35%|███▍      | 696/2000 [2:53:33<5:45:28, 15.90s/it]

Epoch: 695, mean loss: -13.39, mean reward: 20.75


 35%|███▍      | 697/2000 [2:53:49<5:48:46, 16.06s/it]

Epoch: 696, mean loss: -34.41, mean reward: 20.50


 35%|███▍      | 698/2000 [2:54:05<5:47:26, 16.01s/it]

Epoch: 697, mean loss: -47.33, mean reward: 20.50


 35%|███▍      | 699/2000 [2:54:22<5:51:07, 16.19s/it]

Epoch: 698, mean loss: -41.33, mean reward: 20.25


 35%|███▌      | 700/2000 [2:54:38<5:47:21, 16.03s/it]

Epoch: 699, mean loss: -26.71, mean reward: 20.25


 35%|███▌      | 701/2000 [2:54:53<5:44:13, 15.90s/it]

Epoch: 700, mean loss: -47.95, mean reward: 20.50


 35%|███▌      | 702/2000 [2:55:09<5:41:09, 15.77s/it]

Epoch: 701, mean loss: -67.22, mean reward: 19.75


 35%|███▌      | 703/2000 [2:55:24<5:39:55, 15.72s/it]

Epoch: 702, mean loss: -41.17, mean reward: 20.50


 35%|███▌      | 704/2000 [2:55:40<5:38:56, 15.69s/it]

Epoch: 703, mean loss: -41.18, mean reward: 20.50


 35%|███▌      | 705/2000 [2:55:54<5:31:23, 15.35s/it]

Epoch: 704, mean loss: -26.21, mean reward: 20.50


 35%|███▌      | 706/2000 [2:56:10<5:30:40, 15.33s/it]

Epoch: 705, mean loss: -41.34, mean reward: 20.00


 35%|███▌      | 707/2000 [2:56:26<5:34:23, 15.52s/it]

Epoch: 706, mean loss: -37.12, mean reward: 20.75


 35%|███▌      | 708/2000 [2:56:44<5:52:52, 16.39s/it]

Epoch: 707, mean loss: -15.42, mean reward: 20.75


 35%|███▌      | 709/2000 [2:56:59<5:46:07, 16.09s/it]

Epoch: 708, mean loss: -26.04, mean reward: 20.50


 36%|███▌      | 710/2000 [2:57:16<5:51:31, 16.35s/it]

Epoch: 709, mean loss: -58.73, mean reward: 19.75


 36%|███▌      | 711/2000 [2:57:32<5:43:35, 15.99s/it]

Epoch: 710, mean loss: -17.02, mean reward: 20.75


 36%|███▌      | 712/2000 [2:57:47<5:39:54, 15.83s/it]

Epoch: 711, mean loss: -11.64, mean reward: 21.00


 36%|███▌      | 713/2000 [2:58:03<5:37:48, 15.75s/it]

Epoch: 712, mean loss: -31.81, mean reward: 21.00


 36%|███▌      | 714/2000 [2:58:19<5:43:37, 16.03s/it]

Epoch: 713, mean loss: -56.19, mean reward: 19.75


 36%|███▌      | 715/2000 [2:58:35<5:43:18, 16.03s/it]

Epoch: 714, mean loss: -33.50, mean reward: 20.75


 36%|███▌      | 716/2000 [2:58:52<5:46:43, 16.20s/it]

Epoch: 715, mean loss: -39.15, mean reward: 20.25


 36%|███▌      | 717/2000 [2:59:06<5:35:09, 15.67s/it]

Epoch: 716, mean loss: -18.30, mean reward: 20.50


 36%|███▌      | 718/2000 [2:59:23<5:39:02, 15.87s/it]

Epoch: 717, mean loss: -44.00, mean reward: 19.75


 36%|███▌      | 719/2000 [2:59:39<5:42:04, 16.02s/it]

Epoch: 718, mean loss: -24.20, mean reward: 21.00


 36%|███▌      | 720/2000 [2:59:56<5:46:18, 16.23s/it]

Epoch: 719, mean loss: -58.88, mean reward: 19.50


 36%|███▌      | 721/2000 [3:00:11<5:39:19, 15.92s/it]

Epoch: 720, mean loss: -19.72, mean reward: 20.50


 36%|███▌      | 722/2000 [3:00:27<5:41:38, 16.04s/it]

Epoch: 721, mean loss: -47.85, mean reward: 20.25


 36%|███▌      | 723/2000 [3:00:44<5:44:01, 16.16s/it]

Epoch: 722, mean loss: -23.09, mean reward: 21.00


 36%|███▌      | 724/2000 [3:01:00<5:42:10, 16.09s/it]

Epoch: 723, mean loss: -37.03, mean reward: 20.25


 36%|███▋      | 725/2000 [3:01:16<5:41:11, 16.06s/it]

Epoch: 724, mean loss: -25.54, mean reward: 20.25


 36%|███▋      | 726/2000 [3:01:30<5:28:38, 15.48s/it]

Epoch: 725, mean loss: -20.89, mean reward: 21.00


 36%|███▋      | 727/2000 [3:01:45<5:27:45, 15.45s/it]

Epoch: 726, mean loss: -38.75, mean reward: 20.75


 36%|███▋      | 728/2000 [3:02:01<5:27:46, 15.46s/it]

Epoch: 727, mean loss: -34.59, mean reward: 20.25


 36%|███▋      | 729/2000 [3:02:18<5:39:49, 16.04s/it]

Epoch: 728, mean loss: -42.53, mean reward: 20.25


 36%|███▋      | 730/2000 [3:02:35<5:44:50, 16.29s/it]

Epoch: 729, mean loss: -41.57, mean reward: 20.25


 37%|███▋      | 731/2000 [3:02:51<5:41:49, 16.16s/it]

Epoch: 730, mean loss: -29.95, mean reward: 20.75


 37%|███▋      | 732/2000 [3:03:07<5:41:06, 16.14s/it]

Epoch: 731, mean loss: -24.38, mean reward: 21.00


 37%|███▋      | 733/2000 [3:03:23<5:39:12, 16.06s/it]

Epoch: 732, mean loss: -3.94, mean reward: 20.75


 37%|███▋      | 734/2000 [3:03:39<5:38:45, 16.06s/it]

Epoch: 733, mean loss: -34.93, mean reward: 20.25


 37%|███▋      | 735/2000 [3:03:55<5:37:00, 15.98s/it]

Epoch: 734, mean loss: -4.11, mean reward: 20.75


 37%|███▋      | 736/2000 [3:04:10<5:30:51, 15.70s/it]

Epoch: 735, mean loss: -25.26, mean reward: 21.00


 37%|███▋      | 737/2000 [3:04:27<5:38:37, 16.09s/it]

Epoch: 736, mean loss: -47.47, mean reward: 19.50


 37%|███▋      | 738/2000 [3:04:43<5:43:20, 16.32s/it]

Epoch: 737, mean loss: -35.08, mean reward: 21.00


 37%|███▋      | 739/2000 [3:05:01<5:49:25, 16.63s/it]

Epoch: 738, mean loss: -27.50, mean reward: 20.50


 37%|███▋      | 740/2000 [3:05:18<5:55:12, 16.91s/it]

Epoch: 739, mean loss: -36.00, mean reward: 20.25


 37%|███▋      | 741/2000 [3:05:33<5:40:16, 16.22s/it]

Epoch: 740, mean loss: -7.74, mean reward: 21.00


 37%|███▋      | 742/2000 [3:05:49<5:36:59, 16.07s/it]

Epoch: 741, mean loss: -22.69, mean reward: 20.75


 37%|███▋      | 743/2000 [3:06:05<5:37:50, 16.13s/it]

Epoch: 742, mean loss: -43.68, mean reward: 19.75


 37%|███▋      | 744/2000 [3:06:21<5:37:31, 16.12s/it]

Epoch: 743, mean loss: -25.12, mean reward: 20.75


 37%|███▋      | 745/2000 [3:06:37<5:39:04, 16.21s/it]

Epoch: 744, mean loss: -13.31, mean reward: 20.50


 37%|███▋      | 746/2000 [3:06:56<5:50:49, 16.79s/it]

Epoch: 745, mean loss: -15.12, mean reward: 20.75


 37%|███▋      | 747/2000 [3:07:12<5:46:12, 16.58s/it]

Epoch: 746, mean loss: -27.82, mean reward: 20.75


 37%|███▋      | 748/2000 [3:07:29<5:48:23, 16.70s/it]

Epoch: 747, mean loss: -24.54, mean reward: 21.00


 37%|███▋      | 749/2000 [3:07:45<5:47:18, 16.66s/it]

Epoch: 748, mean loss: -27.20, mean reward: 21.00


 38%|███▊      | 750/2000 [3:08:01<5:41:14, 16.38s/it]

Epoch: 749, mean loss: -47.22, mean reward: 21.00


 38%|███▊      | 751/2000 [3:08:18<5:45:15, 16.59s/it]

Epoch: 750, mean loss: -36.43, mean reward: 20.75


 38%|███▊      | 752/2000 [3:08:34<5:38:59, 16.30s/it]

Epoch: 751, mean loss: -27.68, mean reward: 21.00


 38%|███▊      | 753/2000 [3:08:49<5:34:15, 16.08s/it]

Epoch: 752, mean loss: -36.29, mean reward: 20.75


 38%|███▊      | 754/2000 [3:09:05<5:31:07, 15.95s/it]

Epoch: 753, mean loss: -29.19, mean reward: 21.00


 38%|███▊      | 755/2000 [3:09:21<5:31:25, 15.97s/it]

Epoch: 754, mean loss: -44.76, mean reward: 21.00


 38%|███▊      | 756/2000 [3:09:36<5:25:59, 15.72s/it]

Epoch: 755, mean loss: -28.17, mean reward: 21.00


 38%|███▊      | 757/2000 [3:09:55<5:44:01, 16.61s/it]

Epoch: 756, mean loss: -48.89, mean reward: 20.25


 38%|███▊      | 758/2000 [3:10:15<6:04:47, 17.62s/it]

Epoch: 757, mean loss: -41.21, mean reward: 20.00


 38%|███▊      | 759/2000 [3:10:32<5:59:44, 17.39s/it]

Epoch: 758, mean loss: -36.25, mean reward: 20.25


 38%|███▊      | 760/2000 [3:10:48<5:53:46, 17.12s/it]

Epoch: 759, mean loss: -28.30, mean reward: 20.75


 38%|███▊      | 761/2000 [3:11:04<5:43:23, 16.63s/it]

Epoch: 760, mean loss: -31.67, mean reward: 21.00


 38%|███▊      | 762/2000 [3:11:18<5:30:28, 16.02s/it]

Epoch: 761, mean loss: -6.75, mean reward: 21.00


 38%|███▊      | 763/2000 [3:11:33<5:22:02, 15.62s/it]

Epoch: 762, mean loss: -44.80, mean reward: 20.75


 38%|███▊      | 764/2000 [3:11:49<5:26:19, 15.84s/it]

Epoch: 763, mean loss: -39.92, mean reward: 20.50


 38%|███▊      | 765/2000 [3:12:05<5:24:22, 15.76s/it]

Epoch: 764, mean loss: -47.31, mean reward: 20.50


 38%|███▊      | 766/2000 [3:12:19<5:14:13, 15.28s/it]

Epoch: 765, mean loss: -29.77, mean reward: 20.75


 38%|███▊      | 767/2000 [3:12:36<5:22:51, 15.71s/it]

Epoch: 766, mean loss: -67.52, mean reward: 20.50


 38%|███▊      | 768/2000 [3:12:51<5:20:58, 15.63s/it]

Epoch: 767, mean loss: -39.62, mean reward: 20.75


 38%|███▊      | 769/2000 [3:13:05<5:12:21, 15.22s/it]

Epoch: 768, mean loss: -56.07, mean reward: 20.75


 38%|███▊      | 770/2000 [3:13:20<5:11:12, 15.18s/it]

Epoch: 769, mean loss: -78.13, mean reward: 20.25


 39%|███▊      | 771/2000 [3:13:36<5:13:08, 15.29s/it]

Epoch: 770, mean loss: -51.55, mean reward: 20.75


 39%|███▊      | 772/2000 [3:13:51<5:14:00, 15.34s/it]

Epoch: 771, mean loss: -55.94, mean reward: 20.75


 39%|███▊      | 773/2000 [3:14:07<5:17:00, 15.50s/it]

Epoch: 772, mean loss: -66.68, mean reward: 21.00


 39%|███▊      | 774/2000 [3:14:23<5:17:04, 15.52s/it]

Epoch: 773, mean loss: -70.91, mean reward: 20.75


 39%|███▉      | 775/2000 [3:14:38<5:17:19, 15.54s/it]

Epoch: 774, mean loss: -69.42, mean reward: 20.50


 39%|███▉      | 776/2000 [3:14:53<5:12:45, 15.33s/it]

Epoch: 775, mean loss: -72.40, mean reward: 21.00


 39%|███▉      | 777/2000 [3:15:09<5:14:45, 15.44s/it]

Epoch: 776, mean loss: -89.27, mean reward: 20.50


 39%|███▉      | 778/2000 [3:15:23<5:08:14, 15.13s/it]

Epoch: 777, mean loss: -57.11, mean reward: 20.50


 39%|███▉      | 779/2000 [3:15:39<5:08:50, 15.18s/it]

Epoch: 778, mean loss: -64.52, mean reward: 20.25


 39%|███▉      | 780/2000 [3:15:53<5:05:38, 15.03s/it]

Epoch: 779, mean loss: -40.49, mean reward: 20.50


 39%|███▉      | 781/2000 [3:16:08<5:04:16, 14.98s/it]

Epoch: 780, mean loss: -43.82, mean reward: 20.75


 39%|███▉      | 782/2000 [3:16:23<5:04:37, 15.01s/it]

Epoch: 781, mean loss: -83.34, mean reward: 20.00


 39%|███▉      | 783/2000 [3:16:38<5:04:19, 15.00s/it]

Epoch: 782, mean loss: -47.21, mean reward: 20.00


 39%|███▉      | 784/2000 [3:16:53<5:01:50, 14.89s/it]

Epoch: 783, mean loss: -59.88, mean reward: 20.25


 39%|███▉      | 785/2000 [3:17:08<5:03:31, 14.99s/it]

Epoch: 784, mean loss: -46.16, mean reward: 20.75


 39%|███▉      | 786/2000 [3:17:23<5:03:26, 15.00s/it]

Epoch: 785, mean loss: -61.51, mean reward: 20.75


 39%|███▉      | 787/2000 [3:17:38<5:00:43, 14.87s/it]

Epoch: 786, mean loss: -88.55, mean reward: 20.75


 39%|███▉      | 788/2000 [3:17:53<5:01:29, 14.93s/it]

Epoch: 787, mean loss: -90.73, mean reward: 20.50


 39%|███▉      | 789/2000 [3:18:08<5:04:23, 15.08s/it]

Epoch: 788, mean loss: -60.55, mean reward: 20.75


 40%|███▉      | 790/2000 [3:18:23<5:04:27, 15.10s/it]

Epoch: 789, mean loss: -44.96, mean reward: 20.50


 40%|███▉      | 791/2000 [3:18:39<5:07:43, 15.27s/it]

Epoch: 790, mean loss: -73.89, mean reward: 20.50


 40%|███▉      | 792/2000 [3:18:53<5:02:14, 15.01s/it]

Epoch: 791, mean loss: -28.23, mean reward: 21.00


 40%|███▉      | 793/2000 [3:19:09<5:03:47, 15.10s/it]

Epoch: 792, mean loss: -66.32, mean reward: 20.00


 40%|███▉      | 794/2000 [3:19:24<5:03:49, 15.12s/it]

Epoch: 793, mean loss: -52.24, mean reward: 20.75


 40%|███▉      | 795/2000 [3:19:40<5:06:38, 15.27s/it]

Epoch: 794, mean loss: -76.91, mean reward: 19.50


 40%|███▉      | 796/2000 [3:19:54<5:03:08, 15.11s/it]

Epoch: 795, mean loss: -45.65, mean reward: 20.75


 40%|███▉      | 797/2000 [3:20:09<5:02:20, 15.08s/it]

Epoch: 796, mean loss: -56.27, mean reward: 20.00


 40%|███▉      | 798/2000 [3:20:24<5:00:08, 14.98s/it]

Epoch: 797, mean loss: -53.86, mean reward: 20.75


 40%|███▉      | 799/2000 [3:20:39<5:00:01, 14.99s/it]

Epoch: 798, mean loss: -42.43, mean reward: 20.75


 40%|████      | 800/2000 [3:20:55<5:02:31, 15.13s/it]

Epoch: 799, mean loss: -52.02, mean reward: 20.50


 40%|████      | 801/2000 [3:21:09<4:59:14, 14.97s/it]

Epoch: 800, mean loss: -37.07, mean reward: 20.75


 40%|████      | 802/2000 [3:21:23<4:53:18, 14.69s/it]

Epoch: 801, mean loss: -49.99, mean reward: 20.50


 40%|████      | 803/2000 [3:21:40<5:07:48, 15.43s/it]

Epoch: 802, mean loss: -57.57, mean reward: 20.25


 40%|████      | 803/2000 [3:21:43<5:00:42, 15.07s/it]


KeyboardInterrupt: 

In [9]:
# Persist the policy's current parameters (its state_dict) to "pong_policy.pth".
# NOTE(review): the training output above ends with a KeyboardInterrupt at
# 803/2000 iterations, so these are presumably weights from a partial run.
torch.save(policy.state_dict(), "pong_policy.pth")

In [10]:
def create_gameplay_gif(policy, env, file_name="pong_gameplay.gif"):
    """Play one episode with ``policy`` and save the raw frames as a GIF.

    The policy is fed the difference between consecutive preprocessed
    frames (on the first step, the preprocessed frame itself). Actions are
    sampled rather than taken greedily: UP with the probability returned by
    the policy, DOWN otherwise. The episode runs until the environment
    reports ``terminated`` or ``truncated``.

    Args:
        policy: Module that maps a preprocessed frame (difference) to a
            single-element tensor holding the probability of choosing UP.
        env: Environment whose ``reset()`` returns ``(observation, info)``
            and whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        file_name: Path of the GIF file to write.

    Side effects:
        Switches ``policy`` to eval mode (it is not switched back) and
        writes ``file_name`` via ``imageio.mimsave`` at 30 fps.
    """
    UP = 2
    DOWN = 3
    policy.eval()
    observation, _ = env.reset()
    frames = [observation]
    prev_state = None
    state = preprocess_frame(observation)
    # This rollout is inference only; disabling autograd avoids building a
    # computation graph for every step of the episode.
    with torch.no_grad():
        while True:
            # The network sees motion (frame difference), except on the very
            # first step where no previous frame exists yet.
            if prev_state is not None:
                change = state - prev_state
            else:
                change = state
            up_probability = policy(change).item()
            action = UP if random.random() < up_probability else DOWN
            observation, _, terminated, truncated, _ = env.step(action)
            prev_state = state
            state = preprocess_frame(observation)
            # Store the unprocessed observation so the GIF shows the real game.
            frames.append(observation)
            if terminated or truncated:
                break
    imageio.mimsave(file_name, frames, fps=30)

In [11]:
# Play one full episode with the current policy and write the recorded
# frames to "pong_gameplay1.gif".
create_gameplay_gif(policy,env,"pong_gameplay1.gif")

In [9]:
# Close the Gym environment now that no further episodes are played.
env.close()