# Self-Play

In [1]:
import sys
sys.path.insert(0, '../../src/')  # must run before any import that may resolve from ../../src

# standard library
import cmath
import pickle
import random  # used by random.seed(...) in the setup cell; previously only reachable via a star import
from copy import copy, deepcopy

# third-party
import chess
import matplotlib.pyplot as plt
import numpy as np
import torch
from tqdm.notebook import tqdm

# project-local
import config
from utils import saver, loader

# star imports stay last and in their original order so name shadowing is unchanged
from agents import *
from environments import *
from models import *
%matplotlib inline

# Print NumPy arrays with three decimals.
np.set_printoptions(precision=3)

# Record the PyTorch / CUDA / cuDNN stack alongside the notebook outputs.
runtime_report = (
    ("PyTorch version:", torch.__version__),
    ("CUDA toolkit version PyTorch was built with:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
    ("cuda available:", torch.cuda.is_available()),
)
for label, value in runtime_report:
    print(label, value)

# Set the float32 matmul precision mode to 'high' for the rest of the session.
torch.set_float32_matmul_precision('high')

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701
cuda available: True


In [2]:
# --- Reproducibility: seed torch, random and NumPy with the same value --------
for seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    seed_rng(42)

# --- Agent ---------------------------------------------------------------------
board_logic = BoardLogic(max_num_moves=100)
agent = Agent(board_logic=board_logic, in_ch=18, ch=128, n_blocks=10)

# Number of trainable parameters in the first online network.
n_trainable = 0
for param in agent.online_net1.parameters():
    if param.requires_grad:
        n_trainable += param.numel()
print(n_trainable)

# --- Environment ---------------------------------------------------------------
environment = Environment(
    max_num_moves=100,
    filter_blunder=False,  # per the original note: filtering causes a lot of draws in early self-play
)

# --- Model -----------------------------------------------------------------------
# The optimizer list is handed to Model while still holding placeholders and is
# filled in place afterwards, so the same list object must be kept.
opt_list = [None, None]

temperature_schedule = TemperatureScaler(
    temp_start=0.6,
    temp_end=0.2,
    temp_min=1e-5,
    episode_decay=5000,
    transition_decay=0.95,
)

model = Model(
    agent=agent,
    environment=environment,
    mem_capacity=1000000,
    init_mem=False,
    batch_size=512,
    num_warmup=100000,
    policy_update=2,
    tau=0.01,
    temp_scaler=temperature_schedule,
    opt_list=opt_list,
    scaler=torch.amp.GradScaler("cuda"),
)

# --- Optimizers: one AdamW per online network, identical hyper-parameters -------
for slot, online_net in enumerate((agent.online_net1, agent.online_net2)):
    grouped_parameters = group_decay_parameters(
        online_net,
        weight_decay=1e-5,
        no_decay=["bias", "GroupNorm.weight"],
    )
    opt_list[slot] = torch.optim.AdamW(grouped_parameters, lr=1e-4)


2984908


In [None]:
# Stage 1: 5000 episodes of self-play.
# agent1 is the model's live agent; agent2 is a deep copy taken before this
# stage starts, so the evaluator holds a separate copy to compare against.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_environment = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_environment,
    num_games=500,
    temp=0.25,
)

# freq is forwarded to Model.train (presumably the evaluation interval in episodes; confirm there).
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Persist the result of this stage.
save_core(model, filename="model_conv_5000_episodes_core.pth")
save_memory(model, filename="model_conv_5000_episodes_memory.pth")

In [None]:
# Stage 2: another 5000 episodes, saved under the 10000-episode names.
# agent2 is a deep copy of the live agent taken before this stage starts.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_environment = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_environment,
    num_games=500,
    temp=0.25,
)

# Note: this stage uses freq=500, unlike the other stages (freq=1000).
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=500)

# Persist the result of this stage.
save_core(model, filename="model_conv_10000_episodes_core.pth")
save_memory(model, filename="model_conv_10000_episodes_memory.pth")

In [None]:
# Stage 3: resume from the 10000-episode checkpoint, train 5000 more episodes.
model = load_checkpoint(
    core_path="model_conv_10000_episodes_core.pth",
    memory_path="model_conv_10000_episodes_memory.pth",
    model=model,
)

# agent2 is a deep copy of the freshly restored agent, taken before training.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_environment = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_environment,
    num_games=500,
    temp=0.25,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Persist the result of this stage.
save_core(model, filename="model_conv_15000_episodes_core.pth")
save_memory(model, filename="model_conv_15000_episodes_memory.pth")

In [None]:
# Stage 4: another 5000 episodes, saved under the 20000-episode names.
# agent2 is a deep copy of the live agent taken before this stage starts.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_environment = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_environment,
    num_games=500,
    temp=0.25,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Persist the result of this stage.
save_core(model, filename="model_conv_20000_episodes_core.pth")
save_memory(model, filename="model_conv_20000_episodes_memory.pth")

In [None]:
# Stage 5: another 5000 episodes, saved under the 25000-episode names.
# agent2 is a deep copy of the live agent taken before this stage starts.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_environment = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_environment,
    num_games=500,
    temp=0.25,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Persist the result of this stage.
save_core(model, filename="model_conv_25000_episodes_core.pth")
save_memory(model, filename="model_conv_25000_episodes_memory.pth")

In [None]:
# Stage 6: resume from the 25000-episode checkpoint, train 5000 more episodes.
model = load_checkpoint(
    core_path="model_conv_25000_episodes_core.pth",
    memory_path="model_conv_25000_episodes_memory.pth",
    model=model,
)

# agent2 is a deep copy of the freshly restored agent, taken before training.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_environment = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_environment,
    num_games=500,
    temp=0.25,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Persist the result of this stage.
save_core(model, filename="model_conv_30000_episodes_core.pth")
save_memory(model, filename="model_conv_30000_episodes_memory.pth")

In [None]:
# Stage 7: resume from the 30000-episode checkpoint, train 5000 more episodes.
model = load_checkpoint(
    core_path="model_conv_30000_episodes_core.pth",
    memory_path="model_conv_30000_episodes_memory.pth",
    model=model,
)

# agent2 is a deep copy of the freshly restored agent, taken before training.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_environment = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_environment,
    num_games=500,
    temp=0.25,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Persist the result of this stage.
save_core(model, filename="model_conv_35000_episodes_core.pth")
save_memory(model, filename="model_conv_35000_episodes_memory.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

{1: 163, -1: 175, 0: 162} 0 0.07016114427480759


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 171, -1: 180, 0: 149} tensor(0.0005, device='cuda:0') 0.07321388461061686


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 194, -1: 173, 0: 133} tensor(0.0006, device='cuda:0') 0.06667578115045805


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 169, -1: 192, 0: 139} tensor(0.0004, device='cuda:0') 0.0669390497354827


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 189, -1: 152, 0: 159} tensor(0.0005, device='cuda:0') 0.06887233911315435


: 

In [None]:
# Stage 8: resume from the 35000-episode checkpoint, train 5000 more episodes.
model = load_checkpoint(
    core_path="model_conv_35000_episodes_core.pth",
    memory_path="model_conv_35000_episodes_memory.pth",
    model=model,
)

# agent2 is a deep copy of the freshly restored agent, taken before training.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_environment = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_environment,
    num_games=500,
    temp=0.25,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Persist the result of this stage.
save_core(model, filename="model_conv_40000_episodes_core.pth")
save_memory(model, filename="model_conv_40000_episodes_memory.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

{1: 176, -1: 195, 0: 129} 0 0.0747083069473578


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 173, -1: 179, 0: 148} tensor(0.0005, device='cuda:0') 0.06817331567826662


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 185, -1: 175, 0: 140} tensor(0.0004, device='cuda:0') 0.06968849941572149


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 177, -1: 191, 0: 132} tensor(0.0006, device='cuda:0') 0.06522818079207268


## Eval

In [None]:
# Head-to-head evaluation of two agents restored from checkpoint files.
#
# load_checkpoint is called with keywords (core_path, memory_path, model), as in
# the training cells. The previous two-argument positional form bound `model`
# to the second parameter (memory_path) instead of the model parameter.
# No replay memory is needed for evaluation, hence memory_path=None.
model = load_checkpoint(core_path="model_conv_35000_episodes_large.pth",
                        memory_path=None,
                        model=model)
# Deep copy taken right after the first load, so the second load below cannot
# change agent1.
agent1 = deepcopy(model.agent)

model = load_checkpoint(core_path="model_conv_35000_episodes_large.pth",
                        memory_path=None,
                        model=model)
agent2 = model.agent

# Both loads currently use the same file; change one path to compare two
# different checkpoints.
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=200),
                             num_games=100,
                             temp = 0.25)

results = eval_agents.evaluate()
print(results)

In [4]:
# Head-to-head evaluation with filter_blunder=True in the evaluation environment.
#
# Both load_checkpoint calls pass (core_path, memory_path, model) by keyword.
# Previously the second call used a two-argument positional form, which bound
# `model` to memory_path and disagreed with the first call in this cell.
#
# NOTE(review): the traceback recorded under this cell reports a checkpoint stem
# of shape [128, 14, 3, 3] and extra rank_plane/file_plane keys, while the agent
# built in the setup cell uses in_ch=18. The agent architecture must match the
# checkpoint before this load can succeed.
model = load_checkpoint(core_path="model_conv_35000_episodes_large.pth",
                        memory_path=None,
                        model=model)
# Deep copy taken right after the first load, so the second load below cannot
# change agent1.
agent1 = deepcopy(model.agent)

model = load_checkpoint(core_path="model_conv_35000_episodes_large.pth",
                        memory_path=None,
                        model=model)
agent2 = model.agent

eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=200,filter_blunder=True),
                             num_games=100,
                             temp = 0.25)

results = eval_agents.evaluate()
print(results)

RuntimeError: Error(s) in loading state_dict for Agent:
	Unexpected key(s) in state_dict: "online_net1.rank_plane", "online_net1.file_plane", "online_net2.rank_plane", "online_net2.file_plane", "target_net1.rank_plane", "target_net1.file_plane", "target_net2.rank_plane", "target_net2.file_plane". 
	size mismatch for online_net1.stem.weight: copying a param with shape torch.Size([128, 14, 3, 3]) from checkpoint, the shape in current model is torch.Size([128, 18, 3, 3]).
	size mismatch for online_net2.stem.weight: copying a param with shape torch.Size([128, 14, 3, 3]) from checkpoint, the shape in current model is torch.Size([128, 18, 3, 3]).
	size mismatch for target_net1.stem.weight: copying a param with shape torch.Size([128, 14, 3, 3]) from checkpoint, the shape in current model is torch.Size([128, 18, 3, 3]).
	size mismatch for target_net2.stem.weight: copying a param with shape torch.Size([128, 14, 3, 3]) from checkpoint, the shape in current model is torch.Size([128, 18, 3, 3]).

## Play

In [None]:
# Restore the weights used for interactive play. Arguments are passed by keyword
# (as in the training cells) so that `model` is not bound to memory_path, which
# is what the two-argument positional call did. No replay memory is loaded.
model = load_checkpoint(core_path="model_conv_35000_episodes_large.pth",
                        memory_path=None,
                        model=model)

In [None]:
# Start a fresh game for interactive play (max_num_moves=200, blunder filter off).
environment = Environment(
    filter_blunder=False,
    max_num_moves=200,
)
environment.reset()

# Seeding is intentionally disabled here; uncomment to replay the same game.
# random.seed(42)
# np.random.seed(42)
# torch.manual_seed(42)

# Initial temperature passed to agent.select_action; the play cell multiplies it
# by 0.95 each time it is run.
temp = 0.1

In [None]:
# Play one move and inspect the resulting position.
# Re-run this cell once per move: every run advances `environment` by one move
# and multiplies `temp` by 0.95.

# --- 1. Pick and play a move -----------------------------------------------------
action = agent.select_action(environment, temp=temp, greedy=True)
move = agent.action_to_move(action)

board, (reward, done) = environment.step(move)

# --- 2. Query both online networks on the new position ---------------------------
state = agent.board_logic.board_to_state(board).to(config.device)

# Pure inspection pass: no_grad() avoids building an autograd graph, which also
# makes the former .detach() calls unnecessary.
with torch.no_grad():
    Q1 = agent.online_net1(state)
    Q2 = agent.online_net2(state)

legal_moves = environment.get_legal_moves()
mask_legal = agent.get_mask_legal(legal_moves)

# With no legal move left, every statistic below would be NaN or an arbitrary
# index, so the Q-value analysis is skipped in that case.
has_legal_moves = bool(mask_legal.any())

if has_legal_moves:
    # Relative disagreement between the two networks over the legal actions.
    Q1_legal = Q1[mask_legal]
    Q2_legal = Q2[mask_legal]

    # Clamp the denominator to the smallest normal float to avoid 0/0 = NaN
    # when both networks output exactly zero for an action.
    denom = torch.max(torch.abs(Q1_legal), torch.abs(Q2_legal))
    denom = denom.clamp_min(torch.finfo(denom.dtype).tiny)
    diff = torch.abs(Q1_legal - Q2_legal) / denom

    print(f"{np.mean(diff.cpu().numpy()):.4f}")

    # Illegal actions are masked with -inf once; the same tensor serves both the
    # argmax here and the softmax further down.
    Q_legal = Q1.masked_fill(~mask_legal, -float('inf'))

    # Best legal action according to net 1, scored by net 2.
    action_star = torch.argmax(Q_legal, dim=1)
    score = Q2[0, action_star[0]]

# --- 3. Show the position ---------------------------------------------------------
# When environment.mirror is set, the board is mirrored back before printing.
if environment.mirror:
    print("Black:")
    if has_legal_moves:
        print(f"score: {score.item():.4f}")
    print(board.mirror())
else:
    print("White:")
    if has_legal_moves:
        print(f"score: {score.item():.4f}")
    print(board)


if board.is_checkmate():
    print("checkmate!")

# --- 4. Softmax over the legal actions at the current temperature ----------------
if has_legal_moves:
    q_max = Q_legal.max(dim=1, keepdim=True).values
    print(q_max)

    # Subtracting the row maximum keeps the largest logit at 0.
    logits = (Q_legal - q_max)/temp
    print(torch.sort(logits, descending=True).values[0,:10])
    probs = torch.softmax(logits, dim=1)
    print(probs[logits>-1])
else:
    print("no legal moves: game over")

# --- 5. Decay the temperature for the next run of this cell ----------------------
temp = temp*0.95
print(f"temp: {temp:.4f}")

In [None]:
# Encode the current board and display entry [0, 12] of the encoded state
# (presumably batch item 0, input plane 12; confirm against board_to_state).
current_board = environment.get_board()
encoded_state = agent.board_to_state(current_board)
encoded_state[0, 12]