## Training and evaluation of decision transformer

---

> Internship neural networks
>
> Group 4: Reinforcement learning
>
> Deadline 28.02.23 23:59

---

In [1]:
import csv
from datetime import datetime
import numpy as np
import os
import random
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [2]:
%run "../Environment/Connect4.ipynb"
%run "../utils/utils.ipynb"
%run "../OtherAgents/Agents.ipynb"
%run "utils.ipynb"
%run "DecisionTransformer.ipynb"

# Training parameters and evaluation device

The used hyperparameters in this notebook are examples for the code submission. In our paper we present the results for different parameters.

In [3]:
rtg_target = 10            # target return to go (decision transformer looks for trajectories with this rtg)
env_name = 'Connect4'      # name of environment  
vocab_size = 7             # number of actions
state_dim = 42             # dimension of the observations
act_dim = 1                # dimension of the actions
max_timestep=21            # maximum length of a game

num_eval_ep = 100           # num of evaluation episodes per iteration

batch_size = 64            # training batch size
lr = 1e-4                   # learning rate
wt_decay = 1e-4             # weight decay
warmup_steps = 10000        # warmup steps for lr scheduler

# total updates of dt = max_train_iters x num_updates_per_iter
num_updates_per_iter = 1000

context_len = 10        # K in decision transformer
n_blocks = 4            # num of transformer blocks
hidden_dim = 128         # hidden dim of transformer
n_heads = 2             # num of transformer heads
dropout_p = 0.1         # dropout probability

# saves model and csv in this directory
log_dir = "./dt_training/"

if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# evaluation device
device_name = 'cuda'
device = torch.device(device_name)
print("device set to: ", device)

device set to:  cuda


# Create evaluation file + path

In [4]:
start_time = datetime.now().replace(microsecond=0)

start_time_str = start_time.strftime("%y-%m-%d-%H-%M-%S")

prefix = "dt_" + env_name + "_batch_size=" + str(batch_size) + "_context_len=" + str(context_len) + "_n_blocks=" + str(n_blocks) + "_hidden_dim=" + str(hidden_dim) + "_n_heads=" + str(n_heads)

save_model_name =  prefix + "_model_" + start_time_str + ".pt"
save_model_path = os.path.join(log_dir, save_model_name)
save_best_model_path = save_model_path[:-3] + "_best.pt"
save_best_model_path_NM = save_model_path[:-3] + "_best_againstNM.pt"

log_name = prefix + "_log_" + start_time_str + ".csv"
log_path = os.path.join(log_dir, log_name)


csv_writer = csv.writer(open(log_path, 'a', 1))
csv_header = (["duration", "num_updates", "action_loss", 
               "eval_avg_reward", "eval_avg_ep_len", "eval_win_rate",
               "eval_avg_rewardP2", "eval_avg_ep_lenP2", "eval_win_rateP2",
               "eval_avg_rewardNM", "eval_avg_ep_lenNM", "eval_win_rateNM",
               "eval_avg_rewardP2NM", "eval_avg_ep_lenP2NM", "eval_win_rateP2NM"])

csv_writer.writerow(csv_header)


print("=" * 60)
print("start time: " + start_time_str)
print("=" * 60)

print("device set to: " + str(device))
print("model save path: " + save_model_path)
print("log csv save path: " + log_path)

max_avg_reward = -1.0
max_avg_reward_NM = -1.0
total_updates = 0

start time: 23-02-28-17-54-34
device set to: cuda
model save path: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34.pt
log csv save path: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_log_23-02-28-17-54-34.csv


# Define agent and optimizer

In [5]:
torch.manual_seed(42)
random.seed(42)

agent = DTAgent(state_dim=state_dim,
            act_dim=act_dim,
            n_blocks=n_blocks,
            hidden_dim=hidden_dim,
            context_len=context_len,
            n_heads=n_heads,
            drop_p=dropout_p,
            rtg_target = rtg_target,
            vocab_size = vocab_size)

optimizer = torch.optim.AdamW(agent.model.parameters(), lr=lr, weight_decay=wt_decay)

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda steps: min((steps+1)/warmup_steps, 1))

env = Connect4()

randomPlayer = RandomAgent()
negaMax = NegaMaxAgent(env, 4)

# Training and evaluation

In [6]:
datasets = [read_list('Dataset/RandomAgentVsRandomAgentDatasetWithoutDups'),
            read_list('Dataset/NegaMaxAgentVsRandomAgentDatasetWithoutDups'), 
            read_list('Dataset/NegaMaxAgentVsNegaMaxAgentDatasetWithoutDups')]

max_train_iters = [20, 30, 30]

# start training
for i in range(len(datasets)):
    # update dataset
    traj_dataset = TrajectoryDataset(datasets[i], context_len)
    
    # Create DataLoader for dataset
    traj_data_loader = DataLoader(traj_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            pin_memory=True,
                            drop_last=True) 

    data_iter = iter(traj_data_loader)

    for i_train_iter in tqdm(range(max_train_iters[i])):

        log_action_losses = []
        agent.model.train()

        for _ in range(num_updates_per_iter):
            try:
                timesteps, states, actions, returns_to_go, traj_mask = next(data_iter)
            except StopIteration:
                data_iter = iter(traj_data_loader)
                timesteps, states, actions, returns_to_go, traj_mask = next(data_iter)

            timesteps = timesteps.to(device)    # batch_size x traj_length
            states = states.to(device)          # batch_size x traj_length x state_dim
            actions = actions.to(device)        # batch_size x traj_length x act_dim
            returns_to_go = returns_to_go.to(device).unsqueeze(dim=-1) # batch_size x traj_length x 1
            traj_mask = traj_mask.to(device)    # B x T

            _, loss = agent.model.forward(timesteps=timesteps, 
                                                            states=states, 
                                                            actions=actions, 
                                                            returns_to_go=returns_to_go,
                                                            traj_mask = traj_mask)
            
            # optimize loss
            optimizer.zero_grad()
            loss.requires_grad_()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(agent.model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            log_action_losses.append(loss.detach().cpu().item())

        # evaluate on env
        results = evaluate_on_env(agent, randomPlayer, "p1", env, num_eval_ep, max_timestep)

        resultsPlayer2 = evaluate_on_env(agent, randomPlayer, "p2", env, num_eval_ep, max_timestep)

        resultsAgainstNM = evaluate_on_env(agent, negaMax, "p1", env, num_eval_ep, max_timestep)

        resultsPlayer2AgainstNM = evaluate_on_env(agent, negaMax, "p2", env, num_eval_ep, max_timestep)

        # get the results from the evaluation
        eval_avg_reward = results['avg_reward']
        eval_avg_ep_len = results['avg_ep_len']
        eval_win_rate = results['win_rate']

        eval_avg_reward_player2 = resultsPlayer2['avg_reward']
        eval_avg_ep_len_player2 = resultsPlayer2['avg_ep_len']
        eval_win_rate_player2 = resultsPlayer2['win_rate']

        eval_avg_reward_AgainstNM = resultsAgainstNM['avg_reward']
        eval_avg_ep_len_AgainstNM = resultsAgainstNM['avg_ep_len']
        eval_win_rate_AgainstNM = resultsAgainstNM['win_rate']

        eval_avg_reward_player2_AgainstNM = resultsPlayer2AgainstNM['avg_reward']
        eval_avg_ep_len_player2_AgainstNM = resultsPlayer2AgainstNM['avg_ep_len']
        eval_win_rate_player2_AgainstNM = resultsPlayer2AgainstNM['win_rate']


        mean_action_loss = np.mean(log_action_losses)
        time_elapsed = str(datetime.now().replace(microsecond=0) - start_time)

        total_updates += num_updates_per_iter
        
        # print results
        log_str = ("=" * 60 + '\n' +
                "time elapsed: " + time_elapsed  + '\n' +
                "num of updates: " + str(total_updates) + '\n' +
                "action loss: " +  format(mean_action_loss, ".5f") + '\n' +
                "eval avg reward: " + format(eval_avg_reward, ".5f") + '\n' +
                "eval avg ep len: " + format(eval_avg_ep_len, ".5f") + '\n' +
                "eval_win_rate: " + format(eval_win_rate, ".2f") + '\n'
                "eval avg reward as player2: " + format(eval_avg_reward_player2, ".5f") + '\n' +
                "eval avg ep len as player2: " + format(eval_avg_ep_len_player2, ".5f") + '\n' +
                "eval_win_rate as player2: " + format(eval_win_rate_player2, ".2f") + '\n' +
                "eval avg reward against NM: " + format(eval_avg_reward_AgainstNM, ".5f") + '\n' +
                "eval avg ep len against NM: " + format(eval_avg_ep_len_AgainstNM, ".5f") + '\n' +
                "eval_win_rate against NM: " + format(eval_win_rate_AgainstNM, ".2f") + '\n' + 
                "eval avg reward as player2 against NM: " + format(eval_avg_reward_player2_AgainstNM, ".5f") + '\n' +
                "eval avg ep len as player2 against NM: " + format(eval_avg_ep_len_player2_AgainstNM, ".5f") + '\n' +
                "eval_win_rate as player2 against NM: " + format(eval_win_rate_player2_AgainstNM, ".2f"))

        print(log_str)
        
        # save results in csv
        log_data = [time_elapsed, total_updates, mean_action_loss,
                    eval_avg_reward, eval_avg_ep_len, 
                    eval_win_rate, eval_avg_reward_player2,
                    eval_avg_ep_len_player2, 
                    eval_win_rate_player2,
                    eval_avg_reward_AgainstNM, eval_avg_ep_len_AgainstNM,
                    eval_win_rate_AgainstNM,
                    eval_avg_reward_player2_AgainstNM, eval_avg_ep_len_player2_AgainstNM,
                    eval_win_rate_player2_AgainstNM]

        csv_writer.writerow(log_data)

        # save best model against random
        print("max avg reward: " + format(max_avg_reward, ".5f"))
        if eval_avg_reward >= max_avg_reward:
            print("saving max avg reward model at: " + save_best_model_path)
            torch.save(agent.model.state_dict(), save_best_model_path)
            max_avg_reward = eval_avg_reward
        # save best model against negaMax
        if eval_avg_reward_AgainstNM >= max_avg_reward_NM:
            print("saving max avg reward model at: " + save_best_model_path_NM)
            torch.save(agent.model.state_dict(), save_best_model_path_NM)
            max_avg_reward_NM = eval_avg_reward_AgainstNM

    
    currentModelPath = save_model_path[:-3] + "_" + str(total_updates) + ".pt"
    print("saving current model at: " + currentModelPath)
    torch.save(agent.model.state_dict(), currentModelPath)


final_model_path = save_model_path      
torch.save(agent.model.state_dict(), final_model_path)
                                             
print("=" * 60)
print("finished training!")
print("=" * 60)
end_time = datetime.now().replace(microsecond=0)
time_elapsed = str(end_time - start_time)
end_time_str = end_time.strftime("%y-%m-%d-%H-%M-%S")
print("started training at: " + start_time_str)
print("finished training at: " + end_time_str)
print("total training time: " + time_elapsed)
print("max avg reward: " + format(max_avg_reward, ".5f"))
print("saved max avg reward model at: " + save_best_model_path)
print("saved last updated model at: " + save_model_path)
print("=" * 60)

csv_writer = open(log_path, 'a', 1).close()

  5%|██                                        | 1/20 [01:54<36:14, 114.46s/it]

time elapsed: 0:02:03
num of updates: 1000
action loss: 1.98899
eval avg reward: 4.20000
eval avg ep len: 8.32000
eval_win_rate: 0.71
eval avg reward as player2: 4.60000
eval avg ep len as player2: 8.97000
eval_win_rate as player2: 0.73
eval avg reward against NM: -10.00000
eval avg ep len against NM: 10.52000
eval_win_rate against NM: 0.00
eval avg reward as player2 against NM: -8.20000
eval avg ep len as player2 against NM: 8.87000
eval_win_rate as player2 against NM: 0.09
max avg reward: -1.00000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best.pt


 10%|████▏                                     | 2/20 [03:34<31:43, 105.73s/it]

time elapsed: 0:03:42
num of updates: 2000
action loss: 1.95200
eval avg reward: 8.00000
eval avg ep len: 7.38000
eval_win_rate: 0.90
eval avg reward as player2: 7.40000
eval avg ep len as player2: 6.23000
eval_win_rate as player2: 0.87
eval avg reward against NM: -4.80000
eval avg ep len against NM: 9.97000
eval_win_rate against NM: 0.26
eval avg reward as player2 against NM: -9.40000
eval avg ep len as player2 against NM: 6.83000
eval_win_rate as player2 against NM: 0.03
max avg reward: 4.20000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best.pt


 15%|██████▎                                   | 3/20 [05:21<30:11, 106.54s/it]

time elapsed: 0:05:30
num of updates: 3000
action loss: 1.94827
eval avg reward: 6.20000
eval avg ep len: 9.69000
eval_win_rate: 0.81
eval avg reward as player2: 6.60000
eval avg ep len as player2: 7.58000
eval_win_rate as player2: 0.83
eval avg reward against NM: -6.20000
eval avg ep len against NM: 9.34000
eval_win_rate against NM: 0.19
eval avg reward as player2 against NM: -9.60000
eval avg ep len as player2 against NM: 6.48000
eval_win_rate as player2 against NM: 0.02
max avg reward: 8.00000


 20%|████████▍                                 | 4/20 [06:59<27:31, 103.23s/it]

time elapsed: 0:07:08
num of updates: 4000
action loss: 1.94685
eval avg reward: 7.60000
eval avg ep len: 6.30000
eval_win_rate: 0.88
eval avg reward as player2: 7.20000
eval avg ep len as player2: 6.33000
eval_win_rate as player2: 0.86
eval avg reward against NM: -10.00000
eval avg ep len against NM: 10.68000
eval_win_rate against NM: 0.00
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 6.36000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.00000


 25%|██████████▌                               | 5/20 [08:44<25:54, 103.64s/it]

time elapsed: 0:08:52
num of updates: 5000
action loss: 1.94612
eval avg reward: 7.60000
eval avg ep len: 7.13000
eval_win_rate: 0.88
eval avg reward as player2: 6.80000
eval avg ep len as player2: 6.05000
eval_win_rate as player2: 0.84
eval avg reward against NM: -9.40000
eval avg ep len against NM: 10.01000
eval_win_rate against NM: 0.03
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 7.29000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.00000


 30%|████████████▌                             | 6/20 [10:31<24:26, 104.77s/it]

time elapsed: 0:10:39
num of updates: 6000
action loss: 1.94528
eval avg reward: 7.20000
eval avg ep len: 8.46000
eval_win_rate: 0.86
eval avg reward as player2: 5.20000
eval avg ep len as player2: 8.20000
eval_win_rate as player2: 0.76
eval avg reward against NM: -5.60000
eval avg ep len against NM: 9.57000
eval_win_rate against NM: 0.22
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 7.95000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.00000


 35%|██████████████▋                           | 7/20 [12:21<23:04, 106.51s/it]

time elapsed: 0:12:29
num of updates: 7000
action loss: 1.94484
eval avg reward: 6.40000
eval avg ep len: 8.71000
eval_win_rate: 0.82
eval avg reward as player2: 5.20000
eval avg ep len as player2: 7.88000
eval_win_rate as player2: 0.76
eval avg reward against NM: -10.00000
eval avg ep len against NM: 9.07000
eval_win_rate against NM: 0.00
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 7.57000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.00000


 40%|████████████████▊                         | 8/20 [14:02<20:58, 104.86s/it]

time elapsed: 0:14:11
num of updates: 8000
action loss: 1.94445
eval avg reward: 7.00000
eval avg ep len: 7.13000
eval_win_rate: 0.85
eval avg reward as player2: 7.40000
eval avg ep len as player2: 5.90000
eval_win_rate as player2: 0.87
eval avg reward against NM: -4.00000
eval avg ep len against NM: 11.29000
eval_win_rate against NM: 0.30
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 7.04000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.00000


 45%|██████████████████▉                       | 9/20 [15:44<19:04, 104.06s/it]

time elapsed: 0:15:53
num of updates: 9000
action loss: 1.94409
eval avg reward: 8.80000
eval avg ep len: 6.92000
eval_win_rate: 0.94
eval avg reward as player2: 5.60000
eval avg ep len as player2: 7.69000
eval_win_rate as player2: 0.78
eval avg reward against NM: -10.00000
eval avg ep len against NM: 8.00000
eval_win_rate against NM: 0.00
eval avg reward as player2 against NM: -9.20000
eval avg ep len as player2 against NM: 6.70000
eval_win_rate as player2 against NM: 0.04
max avg reward: 8.00000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best.pt


 50%|████████████████████▌                    | 10/20 [17:34<17:38, 105.84s/it]

time elapsed: 0:17:43
num of updates: 10000
action loss: 1.94337
eval avg reward: 4.60000
eval avg ep len: 9.05000
eval_win_rate: 0.73
eval avg reward as player2: 5.60000
eval avg ep len as player2: 8.32000
eval_win_rate as player2: 0.78
eval avg reward against NM: -6.40000
eval avg ep len against NM: 10.66000
eval_win_rate against NM: 0.18
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 7.55000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.80000


 55%|██████████████████████▌                  | 11/20 [19:34<16:30, 110.10s/it]

time elapsed: 0:19:42
num of updates: 11000
action loss: 1.94299
eval avg reward: 2.00000
eval avg ep len: 10.31000
eval_win_rate: 0.60
eval avg reward as player2: 0.40000
eval avg ep len as player2: 10.20000
eval_win_rate as player2: 0.52
eval avg reward against NM: -5.20000
eval avg ep len against NM: 13.36000
eval_win_rate against NM: 0.24
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 7.50000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.80000


 60%|████████████████████████▌                | 12/20 [21:32<15:00, 112.51s/it]

time elapsed: 0:21:41
num of updates: 12000
action loss: 1.94262
eval avg reward: 3.40000
eval avg ep len: 10.67000
eval_win_rate: 0.67
eval avg reward as player2: 1.40000
eval avg ep len as player2: 10.46000
eval_win_rate as player2: 0.57
eval avg reward against NM: -2.40000
eval avg ep len against NM: 11.13000
eval_win_rate against NM: 0.38
eval avg reward as player2 against NM: -9.40000
eval avg ep len as player2 against NM: 8.07000
eval_win_rate as player2 against NM: 0.03
max avg reward: 8.80000


 65%|██████████████████████████▋              | 13/20 [23:17<12:52, 110.29s/it]

time elapsed: 0:23:26
num of updates: 13000
action loss: 1.94208
eval avg reward: 4.80000
eval avg ep len: 9.92000
eval_win_rate: 0.74
eval avg reward as player2: 2.00000
eval avg ep len as player2: 9.94000
eval_win_rate as player2: 0.60
eval avg reward against NM: -5.60000
eval avg ep len against NM: 8.54000
eval_win_rate against NM: 0.22
eval avg reward as player2 against NM: -9.60000
eval avg ep len as player2 against NM: 7.62000
eval_win_rate as player2 against NM: 0.02
max avg reward: 8.80000


 70%|████████████████████████████▋            | 14/20 [24:56<10:41, 106.90s/it]

time elapsed: 0:25:05
num of updates: 14000
action loss: 1.94181
eval avg reward: 5.00000
eval avg ep len: 8.27000
eval_win_rate: 0.75
eval avg reward as player2: 5.40000
eval avg ep len as player2: 7.56000
eval_win_rate as player2: 0.77
eval avg reward against NM: -10.00000
eval avg ep len against NM: 8.50000
eval_win_rate against NM: 0.00
eval avg reward as player2 against NM: -9.80000
eval avg ep len as player2 against NM: 6.30000
eval_win_rate as player2 against NM: 0.01
max avg reward: 8.80000


 75%|██████████████████████████████▊          | 15/20 [26:39<08:47, 105.53s/it]

time elapsed: 0:26:47
num of updates: 15000
action loss: 1.94109
eval avg reward: 5.20000
eval avg ep len: 7.93000
eval_win_rate: 0.76
eval avg reward as player2: 3.80000
eval avg ep len as player2: 8.25000
eval_win_rate as player2: 0.69
eval avg reward against NM: -10.00000
eval avg ep len against NM: 7.54000
eval_win_rate against NM: 0.00
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 6.84000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.80000


 80%|████████████████████████████████▊        | 16/20 [28:17<06:53, 103.29s/it]

time elapsed: 0:28:25
num of updates: 16000
action loss: 1.94046
eval avg reward: 2.60000
eval avg ep len: 9.77000
eval_win_rate: 0.63
eval avg reward as player2: 5.20000
eval avg ep len as player2: 8.41000
eval_win_rate as player2: 0.76
eval avg reward against NM: -10.00000
eval avg ep len against NM: 6.00000
eval_win_rate against NM: 0.00
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 6.48000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.80000


 85%|██████████████████████████████████▊      | 17/20 [29:56<05:06, 102.19s/it]

time elapsed: 0:30:05
num of updates: 17000
action loss: 1.94045
eval avg reward: 6.20000
eval avg ep len: 8.61000
eval_win_rate: 0.81
eval avg reward as player2: 5.60000
eval avg ep len as player2: 7.12000
eval_win_rate as player2: 0.78
eval avg reward against NM: -8.80000
eval avg ep len against NM: 9.47000
eval_win_rate against NM: 0.06
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 6.26000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.80000


 90%|████████████████████████████████████▉    | 18/20 [31:41<03:26, 103.11s/it]

time elapsed: 0:31:50
num of updates: 18000
action loss: 1.93986
eval avg reward: 6.80000
eval avg ep len: 8.17000
eval_win_rate: 0.84
eval avg reward as player2: 5.00000
eval avg ep len as player2: 8.13000
eval_win_rate as player2: 0.75
eval avg reward against NM: -9.80000
eval avg ep len against NM: 8.35000
eval_win_rate against NM: 0.01
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 6.81000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.80000


 95%|██████████████████████████████████████▉  | 19/20 [33:16<01:40, 100.49s/it]

time elapsed: 0:33:24
num of updates: 19000
action loss: 1.93921
eval avg reward: 7.60000
eval avg ep len: 8.60000
eval_win_rate: 0.88
eval avg reward as player2: 5.80000
eval avg ep len as player2: 7.42000
eval_win_rate as player2: 0.79
eval avg reward against NM: -9.80000
eval avg ep len against NM: 7.50000
eval_win_rate against NM: 0.01
eval avg reward as player2 against NM: -9.70000
eval avg ep len as player2 against NM: 6.91000
eval_win_rate as player2 against NM: 0.03
max avg reward: 8.80000


100%|█████████████████████████████████████████| 20/20 [35:00<00:00, 105.03s/it]

time elapsed: 0:35:09
num of updates: 20000
action loss: 1.93874
eval avg reward: 7.60000
eval avg ep len: 6.74000
eval_win_rate: 0.88
eval avg reward as player2: 6.40000
eval avg ep len as player2: 6.56000
eval_win_rate as player2: 0.82
eval avg reward against NM: -10.00000
eval avg ep len against NM: 8.00000
eval_win_rate against NM: 0.00
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 7.41000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.80000
saving current model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_20000.pt



  3%|█▍                                        | 1/30 [01:52<54:23, 112.54s/it]

time elapsed: 0:37:02
num of updates: 21000
action loss: 1.72372
eval avg reward: 6.60000
eval avg ep len: 8.44000
eval_win_rate: 0.83
eval avg reward as player2: 6.60000
eval avg ep len as player2: 7.53000
eval_win_rate as player2: 0.83
eval avg reward against NM: -7.80000
eval avg ep len against NM: 9.73000
eval_win_rate against NM: 0.11
eval avg reward as player2 against NM: -9.80000
eval avg ep len as player2 against NM: 7.50000
eval_win_rate as player2 against NM: 0.01
max avg reward: 8.80000


  7%|██▊                                       | 2/30 [03:44<52:28, 112.43s/it]

time elapsed: 0:38:54
num of updates: 22000
action loss: 1.51932
eval avg reward: 7.20000
eval avg ep len: 7.66000
eval_win_rate: 0.86
eval avg reward as player2: 6.60000
eval avg ep len as player2: 7.96000
eval_win_rate as player2: 0.83
eval avg reward against NM: -6.60000
eval avg ep len against NM: 9.56000
eval_win_rate against NM: 0.17
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 9.68000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.80000


 10%|████▏                                     | 3/30 [05:47<52:45, 117.25s/it]

time elapsed: 0:40:57
num of updates: 23000
action loss: 1.38264
eval avg reward: 7.60000
eval avg ep len: 7.81000
eval_win_rate: 0.88
eval avg reward as player2: 8.40000
eval avg ep len as player2: 7.41000
eval_win_rate as player2: 0.92
eval avg reward against NM: -9.40000
eval avg ep len against NM: 9.86000
eval_win_rate against NM: 0.03
eval avg reward as player2 against NM: -9.00000
eval avg ep len as player2 against NM: 11.46000
eval_win_rate as player2 against NM: 0.05
max avg reward: 8.80000


 13%|█████▌                                    | 4/30 [07:51<51:49, 119.59s/it]

time elapsed: 0:43:00
num of updates: 24000
action loss: 1.28816
eval avg reward: 8.00000
eval avg ep len: 8.16000
eval_win_rate: 0.90
eval avg reward as player2: 8.40000
eval avg ep len as player2: 7.50000
eval_win_rate as player2: 0.92
eval avg reward against NM: -7.20000
eval avg ep len against NM: 10.04000
eval_win_rate against NM: 0.14
eval avg reward as player2 against NM: -9.80000
eval avg ep len as player2 against NM: 9.86000
eval_win_rate as player2 against NM: 0.01
max avg reward: 8.80000


 17%|███████                                   | 5/30 [09:55<50:31, 121.26s/it]

time elapsed: 0:45:05
num of updates: 25000
action loss: 1.21891
eval avg reward: 8.40000
eval avg ep len: 8.00000
eval_win_rate: 0.92
eval avg reward as player2: 8.60000
eval avg ep len as player2: 7.24000
eval_win_rate as player2: 0.93
eval avg reward against NM: -8.60000
eval avg ep len against NM: 11.59000
eval_win_rate against NM: 0.07
eval avg reward as player2 against NM: -9.80000
eval avg ep len as player2 against NM: 9.71000
eval_win_rate as player2 against NM: 0.01
max avg reward: 8.80000


 20%|████████▍                                 | 6/30 [12:00<49:03, 122.66s/it]

time elapsed: 0:47:10
num of updates: 26000
action loss: 1.16735
eval avg reward: 8.40000
eval avg ep len: 7.19000
eval_win_rate: 0.92
eval avg reward as player2: 9.00000
eval avg ep len as player2: 6.88000
eval_win_rate as player2: 0.95
eval avg reward against NM: -9.20000
eval avg ep len against NM: 11.81000
eval_win_rate against NM: 0.04
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 10.23000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.80000


 23%|█████████▊                                | 7/30 [14:05<47:17, 123.39s/it]

time elapsed: 0:49:15
num of updates: 27000
action loss: 1.12619
eval avg reward: 8.80000
eval avg ep len: 8.06000
eval_win_rate: 0.94
eval avg reward as player2: 8.60000
eval avg ep len as player2: 7.01000
eval_win_rate as player2: 0.93
eval avg reward against NM: -7.00000
eval avg ep len against NM: 9.93000
eval_win_rate against NM: 0.15
eval avg reward as player2 against NM: -10.00000
eval avg ep len as player2 against NM: 10.82000
eval_win_rate as player2 against NM: 0.00
max avg reward: 8.80000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best.pt


 27%|███████████▏                              | 8/30 [16:09<45:20, 123.67s/it]

time elapsed: 0:51:19
num of updates: 28000
action loss: 1.09437
eval avg reward: 9.40000
eval avg ep len: 7.57000
eval_win_rate: 0.97
eval avg reward as player2: 9.20000
eval avg ep len as player2: 6.68000
eval_win_rate as player2: 0.96
eval avg reward against NM: 0.60000
eval avg ep len against NM: 9.73000
eval_win_rate against NM: 0.53
eval avg reward as player2 against NM: -9.80000
eval avg ep len as player2 against NM: 10.58000
eval_win_rate as player2 against NM: 0.01
max avg reward: 8.80000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best.pt
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best_againstNM.pt


 30%|████████████▌                             | 9/30 [18:11<43:01, 122.95s/it]

time elapsed: 0:53:20
num of updates: 29000
action loss: 1.06794
eval avg reward: 8.60000
eval avg ep len: 7.94000
eval_win_rate: 0.93
eval avg reward as player2: 9.20000
eval avg ep len as player2: 7.11000
eval_win_rate as player2: 0.96
eval avg reward against NM: 2.60000
eval avg ep len against NM: 9.50000
eval_win_rate against NM: 0.63
eval avg reward as player2 against NM: -9.40000
eval avg ep len as player2 against NM: 10.13000
eval_win_rate as player2 against NM: 0.04
max avg reward: 9.40000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best_againstNM.pt


 33%|█████████████▋                           | 10/30 [20:27<42:21, 127.10s/it]

time elapsed: 0:55:37
num of updates: 30000
action loss: 1.04736
eval avg reward: 8.80000
eval avg ep len: 7.24000
eval_win_rate: 0.94
eval avg reward as player2: 8.80000
eval avg ep len as player2: 7.37000
eval_win_rate as player2: 0.94
eval avg reward against NM: -7.00000
eval avg ep len against NM: 12.16000
eval_win_rate against NM: 0.15
eval avg reward as player2 against NM: -9.80000
eval avg ep len as player2 against NM: 10.40000
eval_win_rate as player2 against NM: 0.01
max avg reward: 9.40000


 37%|███████████████                          | 11/30 [22:36<40:28, 127.79s/it]

time elapsed: 0:57:46
num of updates: 31000
action loss: 1.02589
eval avg reward: 9.40000
eval avg ep len: 7.81000
eval_win_rate: 0.97
eval avg reward as player2: 9.60000
eval avg ep len as player2: 6.88000
eval_win_rate as player2: 0.98
eval avg reward against NM: -6.80000
eval avg ep len against NM: 10.42000
eval_win_rate against NM: 0.16
eval avg reward as player2 against NM: -9.00000
eval avg ep len as player2 against NM: 12.55000
eval_win_rate as player2 against NM: 0.05
max avg reward: 9.40000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best.pt


 40%|████████████████▍                        | 12/30 [24:51<38:55, 129.77s/it]

time elapsed: 1:00:01
num of updates: 32000
action loss: 1.01185
eval avg reward: 9.20000
eval avg ep len: 8.08000
eval_win_rate: 0.96
eval avg reward as player2: 9.20000
eval avg ep len as player2: 6.89000
eval_win_rate as player2: 0.96
eval avg reward against NM: -6.80000
eval avg ep len against NM: 13.36000
eval_win_rate against NM: 0.16
eval avg reward as player2 against NM: -9.60000
eval avg ep len as player2 against NM: 10.65000
eval_win_rate as player2 against NM: 0.02
max avg reward: 9.40000


 43%|█████████████████▊                       | 13/30 [27:05<37:10, 131.18s/it]

time elapsed: 1:02:15
num of updates: 33000
action loss: 0.99330
eval avg reward: 9.40000
eval avg ep len: 7.41000
eval_win_rate: 0.97
eval avg reward as player2: 9.00000
eval avg ep len as player2: 6.94000
eval_win_rate as player2: 0.95
eval avg reward against NM: -7.20000
eval avg ep len against NM: 11.06000
eval_win_rate against NM: 0.14
eval avg reward as player2 against NM: -9.80000
eval avg ep len as player2 against NM: 11.00000
eval_win_rate as player2 against NM: 0.01
max avg reward: 9.40000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best.pt


 47%|███████████████████▏                     | 14/30 [29:15<34:53, 130.82s/it]

time elapsed: 1:04:25
num of updates: 34000
action loss: 0.97882
eval avg reward: 9.60000
eval avg ep len: 7.34000
eval_win_rate: 0.98
eval avg reward as player2: 9.20000
eval avg ep len as player2: 7.16000
eval_win_rate as player2: 0.96
eval avg reward against NM: -1.20000
eval avg ep len against NM: 10.01000
eval_win_rate against NM: 0.44
eval avg reward as player2 against NM: -9.30000
eval avg ep len as player2 against NM: 11.11000
eval_win_rate as player2 against NM: 0.04
max avg reward: 9.40000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best.pt


 50%|████████████████████▌                    | 15/30 [31:41<33:51, 135.47s/it]

time elapsed: 1:06:51
num of updates: 35000
action loss: 0.96749
eval avg reward: 9.00000
eval avg ep len: 7.73000
eval_win_rate: 0.95
eval avg reward as player2: 9.80000
eval avg ep len as player2: 6.67000
eval_win_rate as player2: 0.99
eval avg reward against NM: -7.00000
eval avg ep len against NM: 10.31000
eval_win_rate against NM: 0.15
eval avg reward as player2 against NM: -9.00000
eval avg ep len as player2 against NM: 12.81000
eval_win_rate as player2 against NM: 0.07
max avg reward: 9.60000


 53%|█████████████████████▊                   | 16/30 [34:01<31:52, 136.63s/it]

time elapsed: 1:09:11
num of updates: 36000
action loss: 0.95747
eval avg reward: 9.40000
eval avg ep len: 8.02000
eval_win_rate: 0.97
eval avg reward as player2: 9.20000
eval avg ep len as player2: 7.41000
eval_win_rate as player2: 0.96
eval avg reward against NM: -5.60000
eval avg ep len against NM: 10.30000
eval_win_rate against NM: 0.22
eval avg reward as player2 against NM: -9.10000
eval avg ep len as player2 against NM: 10.64000
eval_win_rate as player2 against NM: 0.05
max avg reward: 9.60000


 57%|███████████████████████▏                 | 17/30 [36:17<29:33, 136.42s/it]

time elapsed: 1:11:26
num of updates: 37000
action loss: 0.94237
eval avg reward: 9.40000
eval avg ep len: 7.45000
eval_win_rate: 0.97
eval avg reward as player2: 8.60000
eval avg ep len as player2: 6.92000
eval_win_rate as player2: 0.93
eval avg reward against NM: -4.40000
eval avg ep len against NM: 11.60000
eval_win_rate against NM: 0.28
eval avg reward as player2 against NM: -9.00000
eval avg ep len as player2 against NM: 11.01000
eval_win_rate as player2 against NM: 0.05
max avg reward: 9.60000


 60%|████████████████████████▌                | 18/30 [38:40<27:42, 138.58s/it]

time elapsed: 1:13:50
num of updates: 38000
action loss: 0.93581
eval avg reward: 9.40000
eval avg ep len: 8.37000
eval_win_rate: 0.97
eval avg reward as player2: 9.40000
eval avg ep len as player2: 6.86000
eval_win_rate as player2: 0.97
eval avg reward against NM: -4.60000
eval avg ep len against NM: 12.11000
eval_win_rate against NM: 0.27
eval avg reward as player2 against NM: -8.20000
eval avg ep len as player2 against NM: 11.10000
eval_win_rate as player2 against NM: 0.10
max avg reward: 9.60000


 63%|█████████████████████████▉               | 19/30 [41:01<25:31, 139.22s/it]

time elapsed: 1:16:11
num of updates: 39000
action loss: 0.92617
eval avg reward: 9.00000
eval avg ep len: 7.86000
eval_win_rate: 0.95
eval avg reward as player2: 9.20000
eval avg ep len as player2: 7.33000
eval_win_rate as player2: 0.96
eval avg reward against NM: -5.00000
eval avg ep len against NM: 10.89000
eval_win_rate against NM: 0.25
eval avg reward as player2 against NM: -9.10000
eval avg ep len as player2 against NM: 11.08000
eval_win_rate as player2 against NM: 0.05
max avg reward: 9.60000


 67%|███████████████████████████▎             | 20/30 [43:23<23:20, 140.01s/it]

time elapsed: 1:18:33
num of updates: 40000
action loss: 0.91727
eval avg reward: 8.80000
eval avg ep len: 7.84000
eval_win_rate: 0.94
eval avg reward as player2: 8.60000
eval avg ep len as player2: 6.94000
eval_win_rate as player2: 0.93
eval avg reward against NM: -5.80000
eval avg ep len against NM: 10.41000
eval_win_rate against NM: 0.21
eval avg reward as player2 against NM: -9.60000
eval avg ep len as player2 against NM: 10.98000
eval_win_rate as player2 against NM: 0.02
max avg reward: 9.60000


 70%|████████████████████████████▋            | 21/30 [45:43<21:00, 140.02s/it]

time elapsed: 1:20:53
num of updates: 41000
action loss: 0.90819
eval avg reward: 9.40000
eval avg ep len: 7.80000
eval_win_rate: 0.97
eval avg reward as player2: 10.00000
eval avg ep len as player2: 7.09000
eval_win_rate as player2: 1.00
eval avg reward against NM: -6.40000
eval avg ep len against NM: 11.64000
eval_win_rate against NM: 0.18
eval avg reward as player2 against NM: -9.60000
eval avg ep len as player2 against NM: 10.89000
eval_win_rate as player2 against NM: 0.03
max avg reward: 9.60000


 73%|██████████████████████████████           | 22/30 [48:02<18:38, 139.82s/it]

time elapsed: 1:23:12
num of updates: 42000
action loss: 0.90015
eval avg reward: 9.80000
eval avg ep len: 7.99000
eval_win_rate: 0.99
eval avg reward as player2: 8.60000
eval avg ep len as player2: 7.38000
eval_win_rate as player2: 0.93
eval avg reward against NM: -5.00000
eval avg ep len against NM: 11.27000
eval_win_rate against NM: 0.25
eval avg reward as player2 against NM: -8.20000
eval avg ep len as player2 against NM: 11.97000
eval_win_rate as player2 against NM: 0.09
max avg reward: 9.60000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best.pt


 77%|███████████████████████████████▍         | 23/30 [50:18<16:10, 138.65s/it]

time elapsed: 1:25:28
num of updates: 43000
action loss: 0.89316
eval avg reward: 9.40000
eval avg ep len: 7.36000
eval_win_rate: 0.97
eval avg reward as player2: 9.20000
eval avg ep len as player2: 7.22000
eval_win_rate as player2: 0.96
eval avg reward against NM: -5.40000
eval avg ep len against NM: 10.24000
eval_win_rate against NM: 0.23
eval avg reward as player2 against NM: -8.60000
eval avg ep len as player2 against NM: 10.86000
eval_win_rate as player2 against NM: 0.07
max avg reward: 9.80000


 80%|████████████████████████████████▊        | 24/30 [52:40<13:57, 139.60s/it]

time elapsed: 1:27:50
num of updates: 44000
action loss: 0.88630
eval avg reward: 9.40000
eval avg ep len: 7.93000
eval_win_rate: 0.97
eval avg reward as player2: 8.80000
eval avg ep len as player2: 6.89000
eval_win_rate as player2: 0.94
eval avg reward against NM: -3.40000
eval avg ep len against NM: 11.45000
eval_win_rate against NM: 0.33
eval avg reward as player2 against NM: -8.40000
eval avg ep len as player2 against NM: 11.91000
eval_win_rate as player2 against NM: 0.08
max avg reward: 9.80000


 83%|██████████████████████████████████▏      | 25/30 [55:03<11:43, 140.69s/it]

time elapsed: 1:30:13
num of updates: 45000
action loss: 0.87915
eval avg reward: 9.40000
eval avg ep len: 8.13000
eval_win_rate: 0.97
eval avg reward as player2: 9.80000
eval avg ep len as player2: 6.66000
eval_win_rate as player2: 0.99
eval avg reward against NM: -7.20000
eval avg ep len against NM: 12.27000
eval_win_rate against NM: 0.14
eval avg reward as player2 against NM: -8.50000
eval avg ep len as player2 against NM: 12.22000
eval_win_rate as player2 against NM: 0.08
max avg reward: 9.80000


 87%|███████████████████████████████████▌     | 26/30 [57:20<09:17, 139.44s/it]

time elapsed: 1:32:30
num of updates: 46000
action loss: 0.87167
eval avg reward: 9.40000
eval avg ep len: 7.57000
eval_win_rate: 0.97
eval avg reward as player2: 9.60000
eval avg ep len as player2: 7.56000
eval_win_rate as player2: 0.98
eval avg reward against NM: -2.60000
eval avg ep len against NM: 12.45000
eval_win_rate against NM: 0.37
eval avg reward as player2 against NM: -8.20000
eval avg ep len as player2 against NM: 10.31000
eval_win_rate as player2 against NM: 0.09
max avg reward: 9.80000


 90%|████████████████████████████████████▉    | 27/30 [59:37<06:56, 138.90s/it]

time elapsed: 1:34:47
num of updates: 47000
action loss: 0.86688
eval avg reward: 9.60000
eval avg ep len: 8.46000
eval_win_rate: 0.98
eval avg reward as player2: 9.40000
eval avg ep len as player2: 7.55000
eval_win_rate as player2: 0.97
eval avg reward against NM: -5.00000
eval avg ep len against NM: 11.63000
eval_win_rate against NM: 0.25
eval avg reward as player2 against NM: -4.70000
eval avg ep len as player2 against NM: 11.25000
eval_win_rate as player2 against NM: 0.29
max avg reward: 9.80000


 93%|████████████████████████████████████▍  | 28/30 [1:01:58<04:38, 139.30s/it]

time elapsed: 1:37:07
num of updates: 48000
action loss: 0.86063
eval avg reward: 9.80000
eval avg ep len: 7.17000
eval_win_rate: 0.99
eval avg reward as player2: 9.80000
eval avg ep len as player2: 7.03000
eval_win_rate as player2: 0.99
eval avg reward against NM: 0.00000
eval avg ep len against NM: 12.82000
eval_win_rate against NM: 0.50
eval avg reward as player2 against NM: -9.00000
eval avg ep len as player2 against NM: 11.78000
eval_win_rate as player2 against NM: 0.05
max avg reward: 9.80000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best.pt


 97%|█████████████████████████████████████▋ | 29/30 [1:04:23<02:21, 141.14s/it]

time elapsed: 1:39:33
num of updates: 49000
action loss: 0.85507
eval avg reward: 9.40000
eval avg ep len: 7.80000
eval_win_rate: 0.97
eval avg reward as player2: 9.60000
eval avg ep len as player2: 7.09000
eval_win_rate as player2: 0.98
eval avg reward against NM: -0.60000
eval avg ep len against NM: 11.08000
eval_win_rate against NM: 0.47
eval avg reward as player2 against NM: -7.20000
eval avg ep len as player2 against NM: 12.32000
eval_win_rate as player2 against NM: 0.17
max avg reward: 9.80000


100%|███████████████████████████████████████| 30/30 [1:06:46<00:00, 133.55s/it]

time elapsed: 1:41:56
num of updates: 50000
action loss: 0.84728
eval avg reward: 9.20000
eval avg ep len: 8.09000
eval_win_rate: 0.96
eval avg reward as player2: 9.80000
eval avg ep len as player2: 6.99000
eval_win_rate as player2: 0.99
eval avg reward against NM: -3.00000
eval avg ep len against NM: 11.66000
eval_win_rate against NM: 0.35
eval avg reward as player2 against NM: -7.20000
eval avg ep len as player2 against NM: 12.52000
eval_win_rate as player2 against NM: 0.17
max avg reward: 9.80000
saving current model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_50000.pt



  3%|█▎                                      | 1/30 [02:26<1:10:40, 146.22s/it]

time elapsed: 1:44:23
num of updates: 51000
action loss: 1.33397
eval avg reward: 9.20000
eval avg ep len: 7.78000
eval_win_rate: 0.96
eval avg reward as player2: 8.60000
eval avg ep len as player2: 7.42000
eval_win_rate as player2: 0.93
eval avg reward against NM: -2.20000
eval avg ep len against NM: 13.11000
eval_win_rate against NM: 0.39
eval avg reward as player2 against NM: -2.20000
eval avg ep len as player2 against NM: 16.22000
eval_win_rate as player2 against NM: 0.43
max avg reward: 9.80000


  7%|██▋                                     | 2/30 [04:51<1:07:58, 145.65s/it]

time elapsed: 1:46:48
num of updates: 52000
action loss: 1.18811
eval avg reward: 8.40000
eval avg ep len: 9.55000
eval_win_rate: 0.92
eval avg reward as player2: 8.40000
eval avg ep len as player2: 8.39000
eval_win_rate as player2: 0.92
eval avg reward against NM: -2.60000
eval avg ep len against NM: 12.78000
eval_win_rate against NM: 0.37
eval avg reward as player2 against NM: -4.10000
eval avg ep len as player2 against NM: 15.66000
eval_win_rate as player2 against NM: 0.33
max avg reward: 9.80000


 10%|████                                    | 3/30 [07:17<1:05:41, 146.00s/it]

time elapsed: 1:49:14
num of updates: 53000
action loss: 1.12247
eval avg reward: 9.00000
eval avg ep len: 7.19000
eval_win_rate: 0.95
eval avg reward as player2: 7.30000
eval avg ep len as player2: 9.05000
eval_win_rate as player2: 0.87
eval avg reward against NM: -0.80000
eval avg ep len against NM: 11.68000
eval_win_rate against NM: 0.46
eval avg reward as player2 against NM: -3.90000
eval avg ep len as player2 against NM: 16.05000
eval_win_rate as player2 against NM: 0.35
max avg reward: 9.80000


 13%|█████▎                                  | 4/30 [09:35<1:01:49, 142.69s/it]

time elapsed: 1:51:32
num of updates: 54000
action loss: 1.07708
eval avg reward: 8.60000
eval avg ep len: 7.75000
eval_win_rate: 0.93
eval avg reward as player2: 7.80000
eval avg ep len as player2: 8.71000
eval_win_rate as player2: 0.89
eval avg reward against NM: -2.00000
eval avg ep len against NM: 12.22000
eval_win_rate against NM: 0.40
eval avg reward as player2 against NM: -3.40000
eval avg ep len as player2 against NM: 16.42000
eval_win_rate as player2 against NM: 0.39
max avg reward: 9.80000


 17%|███████                                   | 5/30 [11:50<58:21, 140.05s/it]

time elapsed: 1:53:47
num of updates: 55000
action loss: 1.04265
eval avg reward: 8.40000
eval avg ep len: 8.55000
eval_win_rate: 0.92
eval avg reward as player2: 6.30000
eval avg ep len as player2: 8.97000
eval_win_rate as player2: 0.82
eval avg reward against NM: 0.60000
eval avg ep len against NM: 12.86000
eval_win_rate against NM: 0.53
eval avg reward as player2 against NM: -5.30000
eval avg ep len as player2 against NM: 16.01000
eval_win_rate as player2 against NM: 0.26
max avg reward: 9.80000


 20%|████████▍                                 | 6/30 [14:05<55:20, 138.37s/it]

time elapsed: 1:56:02
num of updates: 56000
action loss: 1.01261
eval avg reward: 9.00000
eval avg ep len: 8.69000
eval_win_rate: 0.95
eval avg reward as player2: 8.10000
eval avg ep len as player2: 8.83000
eval_win_rate as player2: 0.91
eval avg reward against NM: 0.80000
eval avg ep len against NM: 12.51000
eval_win_rate against NM: 0.54
eval avg reward as player2 against NM: -0.30000
eval avg ep len as player2 against NM: 15.57000
eval_win_rate as player2 against NM: 0.50
max avg reward: 9.80000


 23%|█████████▊                                | 7/30 [16:24<53:00, 138.29s/it]

time elapsed: 1:58:21
num of updates: 57000
action loss: 0.98481
eval avg reward: 9.00000
eval avg ep len: 7.89000
eval_win_rate: 0.95
eval avg reward as player2: 8.80000
eval avg ep len as player2: 7.97000
eval_win_rate as player2: 0.94
eval avg reward against NM: -2.40000
eval avg ep len against NM: 12.84000
eval_win_rate against NM: 0.38
eval avg reward as player2 against NM: -3.00000
eval avg ep len as player2 against NM: 14.92000
eval_win_rate as player2 against NM: 0.37
max avg reward: 9.80000


 27%|███████████▏                              | 8/30 [18:58<52:32, 143.30s/it]

time elapsed: 2:00:55
num of updates: 58000
action loss: 0.96286
eval avg reward: 9.40000
eval avg ep len: 7.82000
eval_win_rate: 0.97
eval avg reward as player2: 8.20000
eval avg ep len as player2: 8.68000
eval_win_rate as player2: 0.91
eval avg reward against NM: -1.20000
eval avg ep len against NM: 14.90000
eval_win_rate against NM: 0.44
eval avg reward as player2 against NM: -3.20000
eval avg ep len as player2 against NM: 15.73000
eval_win_rate as player2 against NM: 0.37
max avg reward: 9.80000


 30%|████████████▌                             | 9/30 [21:35<51:44, 147.84s/it]

time elapsed: 2:03:32
num of updates: 59000
action loss: 0.94002
eval avg reward: 9.20000
eval avg ep len: 8.22000
eval_win_rate: 0.96
eval avg reward as player2: 8.40000
eval avg ep len as player2: 8.42000
eval_win_rate as player2: 0.92
eval avg reward against NM: 0.60000
eval avg ep len against NM: 12.97000
eval_win_rate against NM: 0.53
eval avg reward as player2 against NM: -2.50000
eval avg ep len as player2 against NM: 15.81000
eval_win_rate as player2 against NM: 0.41
max avg reward: 9.80000


 33%|█████████████▋                           | 10/30 [24:43<53:23, 160.16s/it]

time elapsed: 2:06:40
num of updates: 60000
action loss: 0.92603
eval avg reward: 8.80000
eval avg ep len: 8.36000
eval_win_rate: 0.94
eval avg reward as player2: 8.10000
eval avg ep len as player2: 8.09000
eval_win_rate as player2: 0.91
eval avg reward against NM: -1.00000
eval avg ep len against NM: 14.37000
eval_win_rate against NM: 0.45
eval avg reward as player2 against NM: -3.80000
eval avg ep len as player2 against NM: 16.19000
eval_win_rate as player2 against NM: 0.35
max avg reward: 9.80000


 37%|███████████████                          | 11/30 [28:28<56:58, 179.91s/it]

time elapsed: 2:10:25
num of updates: 61000
action loss: 0.90783
eval avg reward: 9.00000
eval avg ep len: 8.72000
eval_win_rate: 0.95
eval avg reward as player2: 6.60000
eval avg ep len as player2: 8.73000
eval_win_rate as player2: 0.83
eval avg reward against NM: 0.60000
eval avg ep len against NM: 13.51000
eval_win_rate against NM: 0.53
eval avg reward as player2 against NM: -2.60000
eval avg ep len as player2 against NM: 15.88000
eval_win_rate as player2 against NM: 0.40
max avg reward: 9.80000


 40%|████████████████▍                        | 12/30 [32:24<59:04, 196.90s/it]

time elapsed: 2:14:21
num of updates: 62000
action loss: 0.89315
eval avg reward: 9.20000
eval avg ep len: 7.61000
eval_win_rate: 0.96
eval avg reward as player2: 7.40000
eval avg ep len as player2: 8.95000
eval_win_rate as player2: 0.87
eval avg reward against NM: 4.00000
eval avg ep len against NM: 12.90000
eval_win_rate against NM: 0.70
eval avg reward as player2 against NM: -0.80000
eval avg ep len as player2 against NM: 16.33000
eval_win_rate as player2 against NM: 0.49
max avg reward: 9.80000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best_againstNM.pt


 43%|█████████████████▊                       | 13/30 [35:56<57:09, 201.72s/it]

time elapsed: 2:17:53
num of updates: 63000
action loss: 0.88017
eval avg reward: 9.20000
eval avg ep len: 8.58000
eval_win_rate: 0.96
eval avg reward as player2: 8.00000
eval avg ep len as player2: 8.66000
eval_win_rate as player2: 0.90
eval avg reward against NM: 1.60000
eval avg ep len against NM: 11.05000
eval_win_rate against NM: 0.58
eval avg reward as player2 against NM: -2.20000
eval avg ep len as player2 against NM: 15.26000
eval_win_rate as player2 against NM: 0.40
max avg reward: 9.80000


 47%|███████████████████▏                     | 14/30 [39:55<56:44, 212.78s/it]

time elapsed: 2:21:52
num of updates: 64000
action loss: 0.86454
eval avg reward: 8.40000
eval avg ep len: 8.83000
eval_win_rate: 0.92
eval avg reward as player2: 9.20000
eval avg ep len as player2: 8.40000
eval_win_rate as player2: 0.96
eval avg reward against NM: 0.80000
eval avg ep len against NM: 13.99000
eval_win_rate against NM: 0.54
eval avg reward as player2 against NM: -2.00000
eval avg ep len as player2 against NM: 16.07000
eval_win_rate as player2 against NM: 0.44
max avg reward: 9.80000


 50%|████████████████████▌                    | 15/30 [43:40<54:08, 216.57s/it]

time elapsed: 2:25:37
num of updates: 65000
action loss: 0.85194
eval avg reward: 8.80000
eval avg ep len: 7.89000
eval_win_rate: 0.94
eval avg reward as player2: 7.80000
eval avg ep len as player2: 8.81000
eval_win_rate as player2: 0.89
eval avg reward against NM: 4.80000
eval avg ep len against NM: 12.07000
eval_win_rate against NM: 0.74
eval avg reward as player2 against NM: -1.80000
eval avg ep len as player2 against NM: 15.83000
eval_win_rate as player2 against NM: 0.42
max avg reward: 9.80000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best_againstNM.pt


 53%|█████████████████████▊                   | 16/30 [47:30<51:29, 220.67s/it]

time elapsed: 2:29:27
num of updates: 66000
action loss: 0.84419
eval avg reward: 9.00000
eval avg ep len: 7.57000
eval_win_rate: 0.95
eval avg reward as player2: 7.60000
eval avg ep len as player2: 8.69000
eval_win_rate as player2: 0.88
eval avg reward against NM: 0.80000
eval avg ep len against NM: 12.79000
eval_win_rate against NM: 0.54
eval avg reward as player2 against NM: 1.10000
eval avg ep len as player2 against NM: 16.06000
eval_win_rate as player2 against NM: 0.61
max avg reward: 9.80000


 57%|███████████████████████▏                 | 17/30 [51:24<48:39, 224.56s/it]

time elapsed: 2:33:21
num of updates: 67000
action loss: 0.83359
eval avg reward: 9.20000
eval avg ep len: 8.29000
eval_win_rate: 0.96
eval avg reward as player2: 6.60000
eval avg ep len as player2: 8.58000
eval_win_rate as player2: 0.83
eval avg reward against NM: 0.60000
eval avg ep len against NM: 13.42000
eval_win_rate against NM: 0.53
eval avg reward as player2 against NM: -0.80000
eval avg ep len as player2 against NM: 16.52000
eval_win_rate as player2 against NM: 0.52
max avg reward: 9.80000


 60%|████████████████████████▌                | 18/30 [55:15<45:18, 226.53s/it]

time elapsed: 2:37:12
num of updates: 68000
action loss: 0.82222
eval avg reward: 8.80000
eval avg ep len: 8.55000
eval_win_rate: 0.94
eval avg reward as player2: 7.00000
eval avg ep len as player2: 8.74000
eval_win_rate as player2: 0.85
eval avg reward against NM: -1.20000
eval avg ep len against NM: 14.45000
eval_win_rate against NM: 0.44
eval avg reward as player2 against NM: 0.00000
eval avg ep len as player2 against NM: 15.45000
eval_win_rate as player2 against NM: 0.53
max avg reward: 9.80000


 63%|█████████████████████████▉               | 19/30 [59:10<42:00, 229.15s/it]

time elapsed: 2:41:07
num of updates: 69000
action loss: 0.81388
eval avg reward: 9.80000
eval avg ep len: 8.24000
eval_win_rate: 0.99
eval avg reward as player2: 7.00000
eval avg ep len as player2: 9.29000
eval_win_rate as player2: 0.85
eval avg reward against NM: 2.80000
eval avg ep len against NM: 13.72000
eval_win_rate against NM: 0.64
eval avg reward as player2 against NM: -1.30000
eval avg ep len as player2 against NM: 15.87000
eval_win_rate as player2 against NM: 0.45
max avg reward: 9.80000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best.pt


 67%|██████████████████████████             | 20/30 [1:03:03<38:21, 230.13s/it]

time elapsed: 2:45:00
num of updates: 70000
action loss: 0.80859
eval avg reward: 8.20000
eval avg ep len: 8.78000
eval_win_rate: 0.91
eval avg reward as player2: 7.60000
eval avg ep len as player2: 8.21000
eval_win_rate as player2: 0.88
eval avg reward against NM: 5.20000
eval avg ep len against NM: 13.04000
eval_win_rate against NM: 0.76
eval avg reward as player2 against NM: -2.70000
eval avg ep len as player2 against NM: 15.85000
eval_win_rate as player2 against NM: 0.39
max avg reward: 9.80000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best_againstNM.pt


 70%|███████████████████████████▎           | 21/30 [1:06:49<34:19, 228.82s/it]

time elapsed: 2:48:45
num of updates: 71000
action loss: 0.79778
eval avg reward: 8.80000
eval avg ep len: 8.37000
eval_win_rate: 0.94
eval avg reward as player2: 8.70000
eval avg ep len as player2: 8.70000
eval_win_rate as player2: 0.94
eval avg reward against NM: 2.40000
eval avg ep len against NM: 11.82000
eval_win_rate against NM: 0.62
eval avg reward as player2 against NM: -1.80000
eval avg ep len as player2 against NM: 16.33000
eval_win_rate as player2 against NM: 0.42
max avg reward: 9.80000


 73%|████████████████████████████▌          | 22/30 [1:10:42<30:42, 230.33s/it]

time elapsed: 2:52:39
num of updates: 72000
action loss: 0.79058
eval avg reward: 9.40000
eval avg ep len: 7.65000
eval_win_rate: 0.97
eval avg reward as player2: 7.80000
eval avg ep len as player2: 7.63000
eval_win_rate as player2: 0.89
eval avg reward against NM: 2.20000
eval avg ep len against NM: 14.39000
eval_win_rate against NM: 0.61
eval avg reward as player2 against NM: -1.80000
eval avg ep len as player2 against NM: 15.61000
eval_win_rate as player2 against NM: 0.44
max avg reward: 9.80000


 77%|█████████████████████████████▉         | 23/30 [1:13:26<24:32, 210.39s/it]

time elapsed: 2:55:23
num of updates: 73000
action loss: 0.78732
eval avg reward: 9.00000
eval avg ep len: 7.44000
eval_win_rate: 0.95
eval avg reward as player2: 7.00000
eval avg ep len as player2: 8.33000
eval_win_rate as player2: 0.85
eval avg reward against NM: 3.00000
eval avg ep len against NM: 12.35000
eval_win_rate against NM: 0.65
eval avg reward as player2 against NM: 0.10000
eval avg ep len as player2 against NM: 15.19000
eval_win_rate as player2 against NM: 0.53
max avg reward: 9.80000


 80%|███████████████████████████████▏       | 24/30 [1:17:25<21:53, 218.89s/it]

time elapsed: 2:59:22
num of updates: 74000
action loss: 0.77647
eval avg reward: 8.60000
eval avg ep len: 7.74000
eval_win_rate: 0.93
eval avg reward as player2: 8.00000
eval avg ep len as player2: 8.17000
eval_win_rate as player2: 0.90
eval avg reward against NM: 1.20000
eval avg ep len against NM: 13.60000
eval_win_rate against NM: 0.56
eval avg reward as player2 against NM: 1.40000
eval avg ep len as player2 against NM: 17.23000
eval_win_rate as player2 against NM: 0.66
max avg reward: 9.80000


 83%|████████████████████████████████▌      | 25/30 [1:21:18<18:35, 223.16s/it]

time elapsed: 3:03:15
num of updates: 75000
action loss: 0.76983
eval avg reward: 9.20000
eval avg ep len: 8.77000
eval_win_rate: 0.96
eval avg reward as player2: 7.40000
eval avg ep len as player2: 8.74000
eval_win_rate as player2: 0.87
eval avg reward against NM: 3.60000
eval avg ep len against NM: 13.99000
eval_win_rate against NM: 0.68
eval avg reward as player2 against NM: -1.50000
eval avg ep len as player2 against NM: 15.63000
eval_win_rate as player2 against NM: 0.44
max avg reward: 9.80000


 87%|█████████████████████████████████▊     | 26/30 [1:25:12<15:05, 226.47s/it]

time elapsed: 3:07:09
num of updates: 76000
action loss: 0.76597
eval avg reward: 8.60000
eval avg ep len: 8.19000
eval_win_rate: 0.93
eval avg reward as player2: 7.80000
eval avg ep len as player2: 8.66000
eval_win_rate as player2: 0.89
eval avg reward against NM: 2.00000
eval avg ep len against NM: 15.14000
eval_win_rate against NM: 0.60
eval avg reward as player2 against NM: -1.20000
eval avg ep len as player2 against NM: 15.87000
eval_win_rate as player2 against NM: 0.48
max avg reward: 9.80000


 90%|███████████████████████████████████    | 27/30 [1:28:50<11:11, 223.98s/it]

time elapsed: 3:10:47
num of updates: 77000
action loss: 0.75737
eval avg reward: 8.20000
eval avg ep len: 8.39000
eval_win_rate: 0.91
eval avg reward as player2: 7.80000
eval avg ep len as player2: 8.46000
eval_win_rate as player2: 0.89
eval avg reward against NM: 2.00000
eval avg ep len against NM: 13.32000
eval_win_rate against NM: 0.60
eval avg reward as player2 against NM: 1.20000
eval avg ep len as player2 against NM: 16.20000
eval_win_rate as player2 against NM: 0.58
max avg reward: 9.80000


 93%|████████████████████████████████████▍  | 28/30 [1:32:50<07:37, 228.53s/it]

time elapsed: 3:14:47
num of updates: 78000
action loss: 0.75151
eval avg reward: 8.40000
eval avg ep len: 8.81000
eval_win_rate: 0.92
eval avg reward as player2: 8.20000
eval avg ep len as player2: 8.52000
eval_win_rate as player2: 0.91
eval avg reward against NM: 0.00000
eval avg ep len against NM: 13.64000
eval_win_rate against NM: 0.50
eval avg reward as player2 against NM: 0.60000
eval avg ep len as player2 against NM: 16.77000
eval_win_rate as player2 against NM: 0.57
max avg reward: 9.80000


 97%|█████████████████████████████████████▋ | 29/30 [1:36:33<03:47, 227.01s/it]

time elapsed: 3:18:30
num of updates: 79000
action loss: 0.74519
eval avg reward: 9.40000
eval avg ep len: 8.46000
eval_win_rate: 0.97
eval avg reward as player2: 8.80000
eval avg ep len as player2: 7.95000
eval_win_rate as player2: 0.94
eval avg reward against NM: 2.60000
eval avg ep len against NM: 13.18000
eval_win_rate against NM: 0.63
eval avg reward as player2 against NM: 0.00000
eval avg ep len as player2 against NM: 16.42000
eval_win_rate as player2 against NM: 0.53
max avg reward: 9.80000


100%|███████████████████████████████████████| 30/30 [1:40:23<00:00, 200.79s/it]

time elapsed: 3:22:20
num of updates: 80000
action loss: 0.74033
eval avg reward: 8.60000
eval avg ep len: 8.75000
eval_win_rate: 0.93
eval avg reward as player2: 7.80000
eval avg ep len as player2: 8.47000
eval_win_rate as player2: 0.89
eval avg reward against NM: 5.20000
eval avg ep len against NM: 11.82000
eval_win_rate against NM: 0.76
eval avg reward as player2 against NM: -1.30000
eval avg ep len as player2 against NM: 16.24000
eval_win_rate as player2 against NM: 0.47
max avg reward: 9.80000
saving max avg reward model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_best_againstNM.pt
saving current model at: ./dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_80000.pt
finished training!
started training at: 23-02-28-17-54-34
finished training at: 23-02-28-21-16-54
total training time: 3:22:20
max avg reward: 9.80000
saved max avg reward model at: 


