# Self-Play

In [1]:
import sys
sys.path.insert(0, '../../src/')

import cmath
import pickle
import random
from copy import copy, deepcopy

import numpy as np
import matplotlib.pyplot as plt
import torch
import chess
from tqdm.notebook import tqdm

import config
from utils import *
from evals import *

from agents import *
from environments import *
from models import *
%matplotlib inline

# Print NumPy arrays with three digits after the decimal point.
np.set_printoptions(precision=3)

# Report the PyTorch / CUDA / cuDNN setup this kernel is running with.
_runtime_report = (
    ("PyTorch version:", torch.__version__),
    ("CUDA toolkit version PyTorch was built with:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
    ("cuda available:", torch.cuda.is_available()),
)
for _label, _value in _runtime_report:
    print(_label, _value)

# Select the 'high' float32 matmul precision mode.
torch.set_float32_matmul_precision('high')

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701
cuda available: True


In [2]:
# Seed PyTorch, Python's `random` and NumPy with the same value before
# anything is constructed.
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# One move cap shared by the agent's board logic and the training environment,
# so the two settings cannot drift apart.
MAX_NUM_MOVES = 100

agent = Agent(board_logic = BoardLogic(max_num_moves=MAX_NUM_MOVES),
              in_ch=20,
              ch=128,
              n_blocks=10,
              sample_policy=eps_greedy_policy,)

# Number of trainable parameters in the agent's first online network.
print(sum(p.numel() for p in agent.online_net1.parameters() if p.requires_grad))

environment = Environment(max_num_moves=MAX_NUM_MOVES,
                          # Blunder filtering stays off: the original note says it causes
                          # a lot of draws in early self-play (the note was cut off
                          # mid-sentence - TODO confirm the exact wording).
                          filter_blunder=False,
                          )

# The model receives this list before the optimizers exist; both slots are
# filled in place at the end of the cell (presumably Model keeps a reference
# to this same list object - TODO confirm).
opt_list = [None, None]

model = Model(agent = agent,
               environment = environment,
               mem_capacity = 1000000,
               init_mem = False,
               batch_size = 512,
               num_warmup = 100000,
               policy_update = 2,
               tau = 0.01,
               temp_scaler = TemperatureScaler(temp_start=0.5,
                                               temp_end=0.25,
                                               temp_min=5e-2,
                                               episode_decay=5000,
                                               transition_decay=0.9),
               opt_list=opt_list,
               scaler=torch.amp.GradScaler("cuda")
             )

# Optimizer settings applied identically to both online networks.
WEIGHT_DECAY = 1e-5
NO_DECAY_KEYS = ("bias", "GroupNorm.weight")
LEARNING_RATE = 1e-4

# Each call gets its own list of no-decay keys, as in the original cell.
optimizer_grouped_parameters1 = group_decay_parameters(
    agent.online_net1,
    weight_decay=WEIGHT_DECAY,
    no_decay=list(NO_DECAY_KEYS),
    )

optimizer_grouped_parameters2 = group_decay_parameters(
    agent.online_net2,
    weight_decay=WEIGHT_DECAY,
    no_decay=list(NO_DECAY_KEYS),
    )

# Fill the two optimizer slots that were passed to Model above.
opt_list[0] = torch.optim.AdamW(optimizer_grouped_parameters1, lr=LEARNING_RATE)
opt_list[1] = torch.optim.AdamW(optimizer_grouped_parameters2, lr=LEARNING_RATE)


2987212


In [5]:
# Store the model core under the 0-episode checkpoint name.
initial_core_path = "../models/" + "model_eps_0_episodes_core.pth"
save_core(model, filename=initial_core_path)

In [None]:
# Evaluation pairs the model's agent with a deep copy of itself.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_environment = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_environment,
    num_games=500,
    temp=0.2,
)

# Run 5000 training episodes, handing the evaluator to the training loop.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Write the core and memory files under the 5000-episode name.
checkpoint_dir = "../models/"
save_core(model, filename=checkpoint_dir + "model_eps_5000_episodes_core.pth")
save_memory(model, filename=checkpoint_dir + "model_eps_5000_episodes_memory.pth")

100%|██████████| 500/500 [05:55<00:00,  1.41it/s]it]


{1: 171, -1: 86, 0: 243} tensor(0.0009, device='cuda:0') 0.03234280625343643


100%|██████████| 500/500 [06:30<00:00,  1.28it/s]0s/it] 


{1: 134, -1: 79, 0: 287} tensor(0.0006, device='cuda:0') 0.02854610602298635


100%|██████████| 500/500 [06:03<00:00,  1.38it/s]7s/it]   


{1: 150, -1: 77, 0: 273} tensor(0.0005, device='cuda:0') 0.037198698933534544


100%|██████████| 500/500 [04:15<00:00,  1.96it/s]/it]    


{1: 273, -1: 62, 0: 165} tensor(0.0003, device='cuda:0') 0.04037661922812535


100%|██████████| 500/500 [04:09<00:00,  2.00it/s]/it]   


{1: 319, -1: 74, 0: 107} tensor(0.0003, device='cuda:0') 0.04886865392162635


100%|██████████| 5000/5000 [3:46:16<00:00,  2.72s/it]


In [None]:
# Evaluation pairs the model's agent with a deep copy of itself.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=100,
                                                       filter_blunder=False),
                             num_games=500,
                             temp = 0.25)

# Run 5000 more training episodes with the evaluator attached.
model.train(num_episodes = 5000,
            evaluate_agents = eval_agents,
            freq=1000)

# Save into ../models/ like the 0- and 5000-episode checkpoints of this
# notebook. The previous "models" + name concatenation had no path separator
# and wrote "modelsmodel_eps_..." into the working directory.
checkpoint_dir = "../models/"
save_core(model, filename=checkpoint_dir + "model_eps_10000_episodes_core.pth")
save_memory(model, filename=checkpoint_dir + "model_eps_10000_episodes_memory.pth")

In [None]:
# Evaluation pairs the model's agent with a deep copy of itself.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=100,
                                                       filter_blunder=False),
                             num_games=500,
                             temp = 0.25)

# Run 5000 more training episodes with the evaluator attached.
model.train(num_episodes = 5000,
            evaluate_agents = eval_agents,
            freq=1000)

# Save into ../models/ like the 0- and 5000-episode checkpoints of this
# notebook. The previous "models" + name concatenation had no path separator
# and wrote "modelsmodel_eps_..." into the working directory.
checkpoint_dir = "../models/"
save_core(model, filename=checkpoint_dir + "model_eps_15000_episodes_core.pth")
save_memory(model, filename=checkpoint_dir + "model_eps_15000_episodes_memory.pth")

In [None]:
# Evaluation pairs the model's agent with a deep copy of itself.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=100,
                                                       filter_blunder=False),
                             num_games=500,
                             temp = 0.25)

# Run 5000 more training episodes with the evaluator attached.
model.train(num_episodes = 5000,
            evaluate_agents = eval_agents,
            freq=1000)

# Save into ../models/ like the 0- and 5000-episode checkpoints of this
# notebook. The previous "models" + name concatenation had no path separator
# and wrote "modelsmodel_eps_..." into the working directory.
checkpoint_dir = "../models/"
save_core(model, filename=checkpoint_dir + "model_eps_20000_episodes_core.pth")
save_memory(model, filename=checkpoint_dir + "model_eps_20000_episodes_memory.pth")

In [None]:
# All checkpoints of this step live in ../models/, the directory used by the
# 0- and 5000-episode checkpoints of this notebook. The previous
# "models" + name concatenation had no path separator and resolved to
# "modelsmodel_eps_..." in the working directory.
checkpoint_dir = "../models/"

# Resume from the 20000-episode core and memory files.
model = load_checkpoint(checkpoint_dir + "model_eps_20000_episodes_core.pth",
                        checkpoint_dir + "model_eps_20000_episodes_memory.pth",
                        model)

# Evaluation pairs the (re)loaded model's agent with a deep copy of itself.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=100,
                                                       filter_blunder=False),
                             num_games=500,
                             temp = 0.25)

# Run 5000 more training episodes with the evaluator attached.
model.train(num_episodes = 5000,
            evaluate_agents = eval_agents,
            freq=1000)

save_core(model, filename=checkpoint_dir + "model_eps_25000_episodes_core.pth")
save_memory(model, filename=checkpoint_dir + "model_eps_25000_episodes_memory.pth")

In [None]:
# All checkpoints of this step live in ../models/, the directory used by the
# 0- and 5000-episode checkpoints of this notebook. Previously the load path
# was built as "models" + name (no separator) and the save calls used a bare
# file name, so the files ended up in the working directory.
checkpoint_dir = "../models/"

# Resume from the 25000-episode core and memory files.
model = load_checkpoint(checkpoint_dir + "model_eps_25000_episodes_core.pth",
                        checkpoint_dir + "model_eps_25000_episodes_memory.pth",
                        model)

# Evaluation pairs the (re)loaded model's agent with a deep copy of itself.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=100,
                                                       filter_blunder=False),
                             num_games=500,
                             temp = 0.25)

# Run 5000 more training episodes with the evaluator attached.
model.train(num_episodes = 5000,
            evaluate_agents = eval_agents,
            freq=1000)

save_core(model, filename=checkpoint_dir + "model_eps_30000_episodes_core.pth")
save_memory(model, filename=checkpoint_dir + "model_eps_30000_episodes_memory.pth")

In [None]:
# Evaluation pairs the model's agent with a deep copy of itself.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=100,
                                                       filter_blunder=False),
                             num_games=500,
                             temp = 0.25)

# Run 5000 more training episodes with the evaluator attached.
model.train(num_episodes = 5000,
            evaluate_agents = eval_agents,
            freq=1000)

# Save into ../models/ like the 0- and 5000-episode checkpoints of this
# notebook; the bare file name used before wrote into the working directory.
checkpoint_dir = "../models/"
save_core(model, filename=checkpoint_dir + "model_eps_35000_episodes_core.pth")
save_memory(model, filename=checkpoint_dir + "model_eps_35000_episodes_memory.pth")

In [None]:
# Evaluation pairs the model's agent with a deep copy of itself.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=100,
                                                       filter_blunder=False),
                             num_games=500,
                             temp = 0.25)

# Run 5000 more training episodes with the evaluator attached.
model.train(num_episodes = 5000,
            evaluate_agents = eval_agents,
            freq=1000)

# Save into ../models/ like the 0- and 5000-episode checkpoints of this
# notebook; the bare file name used before wrote into the working directory.
checkpoint_dir = "../models/"
save_core(model, filename=checkpoint_dir + "model_eps_40000_episodes_core.pth")
save_memory(model, filename=checkpoint_dir + "model_eps_40000_episodes_memory.pth")

In [None]:
# All checkpoints of this step live in ../models/, the directory used by the
# 0- and 5000-episode checkpoints of this notebook. Previously the load path
# was built as "models" + name (no separator) and the save calls used a bare
# file name, so load and save did not even agree with each other.
checkpoint_dir = "../models/"

# Resume from the 40000-episode core and memory files.
model = load_checkpoint(checkpoint_dir + "model_eps_40000_episodes_core.pth",
                        checkpoint_dir + "model_eps_40000_episodes_memory.pth",
                        model)

# Set the temperature scaler's temp_min to 1e-4 (the model was originally
# constructed with temp_min=5e-2).
model.temp_scaler.temp_min = 0.0001

# Evaluation pairs the (re)loaded model's agent with a deep copy of itself.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=100,
                                                       filter_blunder=False),
                             num_games=500,
                             temp = 0.25)

# Run 5000 more training episodes with the evaluator attached.
model.train(num_episodes = 5000,
            evaluate_agents = eval_agents,
            freq=1000)

save_core(model, filename=checkpoint_dir + "model_eps_45000_episodes_core.pth")
save_memory(model, filename=checkpoint_dir + "model_eps_45000_episodes_memory.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

{1: 176, -1: 167, 0: 157} tensor(0.0007, device='cuda:0') 0.06926783991142844


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 195, -1: 151, 0: 154} tensor(0.0007, device='cuda:0') 0.06784952987691713


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 202, -1: 151, 0: 147} tensor(0.0005, device='cuda:0') 0.06423700579040337


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 203, -1: 147, 0: 150} tensor(0.0004, device='cuda:0') 0.07187870628355691


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 188, -1: 173, 0: 139} tensor(0.0005, device='cuda:0') 0.069119074476118


In [None]:
# All checkpoints of this step live in ../models/, the directory from which
# the following cell of this notebook loads the 50000-episode files.
# Previously the load path was built as "models" + name (no separator) and
# the save calls used a bare file name.
checkpoint_dir = "../models/"

# Resume from the 45000-episode core and memory files.
model = load_checkpoint(checkpoint_dir + "model_eps_45000_episodes_core.pth",
                        checkpoint_dir + "model_eps_45000_episodes_memory.pth",
                        model)

# Evaluation pairs the (re)loaded model's agent with a deep copy of itself.
# (A stray "W" after the first argument used to make this call a syntax error.)
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=100,
                                                       filter_blunder=False),
                             num_games=500,
                             temp = 0.25)

# Run 5000 more training episodes with the evaluator attached.
model.train(num_episodes = 5000,
            evaluate_agents = eval_agents,
            freq=1000)

save_core(model, filename=checkpoint_dir + "model_eps_50000_episodes_core.pth")
save_memory(model, filename=checkpoint_dir + "model_eps_50000_episodes_memory.pth")

100%|██████████| 500/500 [05:10<00:00,  1.61it/s]it]


{1: 193, -1: 171, 0: 136} tensor(0.0005, device='cuda:0') 0.06802354746430468


100%|██████████| 500/500 [04:58<00:00,  1.67it/s]4s/it]


{1: 197, -1: 160, 0: 143} tensor(0.0004, device='cuda:0') 0.07134929691485531


100%|██████████| 500/500 [04:51<00:00,  1.72it/s]5s/it] 


{1: 178, -1: 163, 0: 159} tensor(0.0006, device='cuda:0') 0.07071662782956727


100%|██████████| 500/500 [04:52<00:00,  1.71it/s]/it]   


{1: 188, -1: 156, 0: 156} tensor(0.0005, device='cuda:0') 0.07188114238613141


100%|██████████| 500/500 [05:00<00:00,  1.66it/s]/it]   


{1: 193, -1: 168, 0: 139} tensor(0.0005, device='cuda:0') 0.07304825423487668


100%|██████████| 5000/5000 [4:31:04<00:00,  3.25s/it]


In [3]:
# Resume from the 50000-episode core and memory files.
checkpoint_dir = "../models/"
model = load_checkpoint(
    checkpoint_dir + "model_eps_50000_episodes_core.pth",
    checkpoint_dir + "model_eps_50000_episodes_memory.pth",
    model,
)

# Evaluation pairs the reloaded model's agent with a deep copy of itself.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_environment = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_environment,
    num_games=500,
    temp=0.25,
)

# Run 5000 more training episodes with the evaluator attached.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Write the core and memory files under the 55000-episode name.
save_core(model, filename=checkpoint_dir + "model_eps_55000_episodes_core.pth")
save_memory(model, filename=checkpoint_dir + "model_eps_55000_episodes_memory.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

{1: 192, -1: 167, 0: 141} tensor(0.0009, device='cuda:0') 0.07176791616644132


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 196, -1: 176, 0: 128} tensor(0.0005, device='cuda:0') 0.06464436638065105


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 198, -1: 159, 0: 143} tensor(0.0004, device='cuda:0') 0.07462190842409044


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 199, -1: 152, 0: 149} tensor(0.0006, device='cuda:0') 0.06705005466024958


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 226, -1: 144, 0: 130} tensor(0.0005, device='cuda:0') 0.06744239958622823


In [4]:
# Evaluation pairs the model's agent with a deep copy of itself.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_environment = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_environment,
    num_games=500,
    temp=0.25,
)

# Run 5000 more training episodes with the evaluator attached.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Write the core and memory files under the 60000-episode name.
checkpoint_dir = "../models/"
save_core(model, filename=checkpoint_dir + "model_eps_60000_episodes_core.pth")
save_memory(model, filename=checkpoint_dir + "model_eps_60000_episodes_memory.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

{1: 186, -1: 194, 0: 120} tensor(0.0003, device='cuda:0') 0.06649583072547673


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 196, -1: 175, 0: 129} tensor(0.0004, device='cuda:0') 0.06839923159709785


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 167, -1: 188, 0: 145} tensor(0.0004, device='cuda:0') 0.06945762031447641


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 201, -1: 179, 0: 120} tensor(0.0003, device='cuda:0') 0.0724520273100329


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 203, -1: 149, 0: 148} tensor(0.0004, device='cuda:0') 0.06931247224428636


In [5]:
# Evaluation pairs the model's agent with a deep copy of itself.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_environment = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_environment,
    num_games=500,
    temp=0.25,
)

# Run 5000 more training episodes with the evaluator attached.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Write the core and memory files under the 65000-episode name.
checkpoint_dir = "../models/"
save_core(model, filename=checkpoint_dir + "model_eps_65000_episodes_core.pth")
save_memory(model, filename=checkpoint_dir + "model_eps_65000_episodes_memory.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

{1: 202, -1: 167, 0: 131} tensor(0.0012, device='cuda:0') 0.06770572747712601


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 181, -1: 181, 0: 138} tensor(0.0004, device='cuda:0') 0.06766560005563069


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 198, -1: 162, 0: 140} tensor(0.0006, device='cuda:0') 0.06762453827783758


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 188, -1: 191, 0: 121} tensor(0.0004, device='cuda:0') 0.06452875930916371


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 184, -1: 176, 0: 140} tensor(0.0005, device='cuda:0') 0.06883065876464807


In [6]:
# Evaluation pairs the model's agent with a deep copy of itself.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_environment = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_environment,
    num_games=500,
    temp=0.25,
)

# Run 5000 more training episodes with the evaluator attached.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Write the core and memory files under the 70000-episode name.
checkpoint_dir = "../models/"
save_core(model, filename=checkpoint_dir + "model_eps_70000_episodes_core.pth")
save_memory(model, filename=checkpoint_dir + "model_eps_70000_episodes_memory.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

{1: 199, -1: 176, 0: 125} tensor(0.0005, device='cuda:0') 0.06535925589497292


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 198, -1: 159, 0: 143} tensor(0.0003, device='cuda:0') 0.06726790254059083


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 192, -1: 178, 0: 130} tensor(0.0003, device='cuda:0') 0.0711689161586916


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 202, -1: 184, 0: 114} tensor(0.0005, device='cuda:0') 0.06294757271631012


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 176, -1: 182, 0: 142} tensor(0.0004, device='cuda:0') 0.06489131975500188
