# Self-Play

In [1]:
import sys
sys.path.insert(0, '../../src/')

import cmath
import pickle
import random
from copy import copy, deepcopy

import chess
import matplotlib.pyplot as plt
import numpy as np
import torch
from tqdm.notebook import tqdm

import config
from utils import *

from agents import *
from environments import *
from models import *
%matplotlib inline

# Show NumPy arrays with three digits after the decimal point.
np.set_printoptions(precision=3)

# Report the PyTorch / CUDA / cuDNN runtime this notebook is executing on,
# one "label value" line per entry.
_runtime_report = (
    ("PyTorch version:", torch.__version__),
    ("CUDA toolkit version PyTorch was built with:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
    ("cuda available:", torch.cuda.is_available()),
)
for _label, _value in _runtime_report:
    print(_label, _value)

# Select the 'high' float32 matmul precision mode for the rest of the session.
torch.set_float32_matmul_precision('high')

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701
cuda available: True


In [None]:
# Seed every RNG used here (PyTorch, Python's `random`, NumPy) from a single
# constant so the three calls cannot drift apart.
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# The same move cap is handed to the agent's board logic and to the training
# environment; keeping it in one place keeps the two in sync.
MAX_NUM_MOVES = 100

# Optimizer hyper-parameters shared by both online networks.
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-5
NO_DECAY_PARAMS = ("bias", "GroupNorm.weight")

# Agent under training; it starts out sampling with `eps_greedy_policy`.
agent = Agent(board_logic=BoardLogic(max_num_moves=MAX_NUM_MOVES),
              in_ch=20,
              ch=128,
              n_blocks=10,
              sample_policy=eps_greedy_policy)

# Number of trainable parameters in the first online network.
print(sum(p.numel() for p in agent.online_net1.parameters() if p.requires_grad))

environment = Environment(max_num_moves=MAX_NUM_MOVES,
                          # Left off: per the original note, blunder filtering
                          # causes a lot of draws in early self-play.
                          filter_blunder=False,
                          )

# Placeholder list: Model receives this list object now and the two optimizers
# are written into it further down, so it must be filled in place, not rebound.
# NOTE(review): this only works if Model keeps a reference to the list rather
# than copying it -- confirm against Model.__init__.
opt_list = [None, None]

model = Model(agent=agent,
              environment=environment,
              mem_capacity=1000000,
              init_mem=False,
              batch_size=512,
              num_warmup=100000,
              policy_update=2,
              tau=0.01,
              temp_scaler=TemperatureScaler(temp_start=0.5,
                                            temp_end=0.25,
                                            temp_min=5e-2,
                                            episode_decay=5000,
                                            transition_decay=0.9),
              opt_list=opt_list,
              scaler=torch.amp.GradScaler("cuda"),
              )

# Parameter groups for each online network: weight decay is applied except to
# parameters matched by NO_DECAY_PARAMS (matching is done by the helper).
optimizer_grouped_parameters1 = group_decay_parameters(
    agent.online_net1,
    weight_decay=WEIGHT_DECAY,
    no_decay=list(NO_DECAY_PARAMS),
    )

optimizer_grouped_parameters2 = group_decay_parameters(
    agent.online_net2,
    weight_decay=WEIGHT_DECAY,
    no_decay=list(NO_DECAY_PARAMS),
    )

# Fill the placeholder slots in place: one AdamW optimizer per online network.
opt_list[0] = torch.optim.AdamW(optimizer_grouped_parameters1, lr=LEARNING_RATE)
opt_list[1] = torch.optim.AdamW(optimizer_grouped_parameters2, lr=LEARNING_RATE)


2987212


In [None]:
# Training stage: 5000 self-play episodes on the model built above.
# The evaluation opponent is a deep copy of the agent taken before training.
# NOTE(review): this stage evaluates with temp=0.2 while the later stages use
# 0.25, and its checkpoint names lack the `eps` tag -- confirm both are intended.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_env = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_env,
    num_games=500,
    temp=0.2,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Two checkpoint files: "core" and replay "memory" (as named by the helpers).
save_core(model, filename="model_5000_episodes_core.pth")
save_memory(model, filename="model_5000_episodes_memory.pth")

In [None]:
# Training stage: another 5000 episodes; checkpoints are tagged 10000.
# The evaluation opponent is a deep copy of the agent as it is right now.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_env = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_env,
    num_games=500,
    temp=0.25,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Two checkpoint files: "core" and replay "memory" (as named by the helpers).
save_core(model, filename="model_eps_10000_episodes_core.pth")
save_memory(model, filename="model_eps_10000_episodes_memory.pth")

In [None]:
# Training stage: another 5000 episodes; checkpoints are tagged 15000.
# The evaluation opponent is a deep copy of the agent as it is right now.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_env = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_env,
    num_games=500,
    temp=0.25,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Two checkpoint files: "core" and replay "memory" (as named by the helpers).
save_core(model, filename="model_eps_15000_episodes_core.pth")
save_memory(model, filename="model_eps_15000_episodes_memory.pth")

In [None]:
# Training stage: another 5000 episodes; checkpoints are tagged 20000.
# The evaluation opponent is a deep copy of the agent as it is right now.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_env = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_env,
    num_games=500,
    temp=0.25,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Two checkpoint files: "core" and replay "memory" (as named by the helpers).
save_core(model, filename="model_eps_20000_episodes_core.pth")
save_memory(model, filename="model_eps_20000_episodes_memory.pth")

In [None]:
# With boltzmann
#
# Experiment branch: reload the 20000-episode checkpoint and train 5000 more
# episodes with `eps_softmax_policy` instead of the eps-greedy policy.

model = load_checkpoint("model_eps_20000_episodes_core.pth", "model_eps_20000_episodes_memory.pth", model)

# This assignment stays in effect for later cells until something sets
# `sample_policy` again.
model.agent.sample_policy = eps_softmax_policy

# Evaluation opponent: a deep copy of the agent as loaded from the checkpoint.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=100,
                                                       filter_blunder=False),
                             num_games=500,
                             temp = 0.25)

model.train(num_episodes = 5000,
            evaluate_agents = eval_agents,
            freq=1000)

# Saved under its own name: the eps-greedy branch in the next cell writes
# "model_eps_25000_episodes_*", which previously overwrote (and mislabelled)
# the result of this Boltzmann run.
save_core(model, filename="model_boltzmann_25000_episodes_core.pth")
save_memory(model, filename="model_boltzmann_25000_episodes_memory.pth")

In [None]:
# continue with eps
#
# Main branch: reload the 20000-episode checkpoint and train 5000 more
# episodes with the eps-greedy policy.

model = load_checkpoint("model_eps_20000_episodes_core.pth", "model_eps_20000_episodes_memory.pth", model)

# The Boltzmann cell above leaves `model.agent.sample_policy` set to
# eps_softmax_policy. Nothing visible here shows load_checkpoint restoring the
# policy, so set it explicitly to make this cell train with eps-greedy no
# matter which cells ran before it.
model.agent.sample_policy = eps_greedy_policy

# Evaluation opponent: a deep copy of the agent as loaded from the checkpoint.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=100,
                                                       filter_blunder=False),
                             num_games=500,
                             temp = 0.25)

model.train(num_episodes = 5000,
            evaluate_agents = eval_agents,
            freq=1000)

# Two checkpoint files: "core" and replay "memory" (as named by the helpers).
save_core(model, filename="model_eps_25000_episodes_core.pth")
save_memory(model, filename="model_eps_25000_episodes_memory.pth")

In [None]:
# Training stage: resume from the 25000-episode checkpoint and train 5000
# more episodes; checkpoints are tagged 30000.
model = load_checkpoint(
    "model_eps_25000_episodes_core.pth",
    "model_eps_25000_episodes_memory.pth",
    model,
)

# Evaluation opponent: a deep copy of the agent as loaded from the checkpoint.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_env = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_env,
    num_games=500,
    temp=0.25,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Two checkpoint files: "core" and replay "memory" (as named by the helpers).
save_core(model, filename="model_eps_30000_episodes_core.pth")
save_memory(model, filename="model_eps_30000_episodes_memory.pth")

In [None]:
# Training stage: another 5000 episodes; checkpoints are tagged 35000.
# The evaluation opponent is a deep copy of the agent as it is right now.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_env = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_env,
    num_games=500,
    temp=0.25,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Two checkpoint files: "core" and replay "memory" (as named by the helpers).
save_core(model, filename="model_eps_35000_episodes_core.pth")
save_memory(model, filename="model_eps_35000_episodes_memory.pth")

In [None]:
# Training stage: another 5000 episodes; checkpoints are tagged 40000.
# The evaluation opponent is a deep copy of the agent as it is right now.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_env = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_env,
    num_games=500,
    temp=0.25,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Two checkpoint files: "core" and replay "memory" (as named by the helpers).
save_core(model, filename="model_eps_40000_episodes_core.pth")
save_memory(model, filename="model_eps_40000_episodes_memory.pth")

In [3]:
# Training stage: resume from the 40000-episode checkpoint, lower the
# temperature scaler's floor, and train 5000 more episodes (tagged 45000).
model = load_checkpoint(
    "model_eps_40000_episodes_core.pth",
    "model_eps_40000_episodes_memory.pth",
    model,
)

# Floor was 5e-2 when the TemperatureScaler was constructed.
model.temp_scaler.temp_min = 0.0001

# Evaluation opponent: a deep copy of the agent as loaded from the checkpoint.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_env = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_env,
    num_games=500,
    temp=0.25,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Two checkpoint files: "core" and replay "memory" (as named by the helpers).
save_core(model, filename="model_eps_45000_episodes_core.pth")
save_memory(model, filename="model_eps_45000_episodes_memory.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

{1: 176, -1: 167, 0: 157} tensor(0.0007, device='cuda:0') 0.06926783991142844


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 195, -1: 151, 0: 154} tensor(0.0007, device='cuda:0') 0.06784952987691713


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 202, -1: 151, 0: 147} tensor(0.0005, device='cuda:0') 0.06423700579040337


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 203, -1: 147, 0: 150} tensor(0.0004, device='cuda:0') 0.07187870628355691


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 188, -1: 173, 0: 139} tensor(0.0005, device='cuda:0') 0.069119074476118


In [None]:
# Training stage: another 5000 episodes; checkpoints are tagged 50000.
# The evaluation opponent is a deep copy of the agent as it is right now.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_env = Environment(max_num_moves=100, filter_blunder=False)
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=eval_env,
    num_games=500,
    temp=0.25,
)

model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Two checkpoint files: "core" and replay "memory" (as named by the helpers).
save_core(model, filename="model_eps_50000_episodes_core.pth")
save_memory(model, filename="model_eps_50000_episodes_memory.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

In [None]:
# Unbind the name `model` (presumably to release the replay memory it holds).
# NOTE(review): the first cell of the "Play" section passes `model` to
# load_checkpoint, so on a top-to-bottom run that cell raises NameError after
# this delete -- re-create the Model before loading, or drop this cell.
del model

## Play

In [None]:
# Load the 40000-episode core checkpoint for interactive play. The memory
# filename is None -- presumably this skips restoring the replay memory; verify
# against load_checkpoint.
# NOTE(review): `model` is deleted by the preceding cell, so this call needs a
# Model instance to be re-created first when the notebook is run in order.
model = load_checkpoint("model_eps_40000_episodes_core.pth", None, model)

In [None]:
# Set up an interactive game.
# `temp` is the temperature passed to the agent by the play-step cell; that
# cell multiplies it by 0.95 on every run.
temp = 0.1

# Start position with the d1 square empty (White has no queen); White to move.
start_fen = 'rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNB1KBNR w KQkq - 0 1'

# Optional seeding for a repeatable game (left disabled):
#   random.seed(42); np.random.seed(42); torch.manual_seed(42)
environment = Environment(max_num_moves=100, filter_blunder=False)
environment.reset()
# The board is replaced directly after reset().
# NOTE(review): any other per-game state kept by Environment is not updated by
# this assignment -- confirm that is safe.
environment.board = chess.Board(start_fen)

In [None]:
# Display the current board of the play environment.
environment.board

In [None]:
# One interactive play step: the agent picks a move, the environment applies
# it, and the two online Q-networks are inspected on the resulting position.
# Re-run this cell to advance the game; `temp` shrinks by 5% on every run.

# Temperature of the printed softmax distribution.
# NOTE(review): this is a fixed 0.1 and does not follow the decaying `temp`
# -- confirm that is intended.
DISPLAY_SOFTMAX_TEMP = 0.1
# Fill value that pushes illegal actions out of argmax / softmax.
ILLEGAL_Q = -1e9

action = agent.select_action(environment, temp=temp, greedy=True)
move = agent.action_to_move(action)

board, (reward, done) = environment.step(move)

state = agent.board_logic.board_to_state(board).to(config.device)

# Inference only: no autograd graph is needed for these diagnostics.
with torch.no_grad():
    Q1 = agent.online_net1(state)
    Q2 = agent.online_net2(state)

legal_moves = environment.get_legal_moves()
mask_legal = agent.get_mask_legal(legal_moves)
# With no legal move, the Q statistics below would be computed over an empty
# selection (nan mean, argmax over an all-masked row), so they are skipped.
has_legal_moves = bool(mask_legal.any())

if has_legal_moves:
    # Mean absolute disagreement between the two networks on legal actions.
    diff = torch.abs(Q1[mask_legal] - Q2[mask_legal])
    print(f"{diff.mean().item():.4f}")

    # Best legal action according to net 1, scored by net 2.
    Q_legal = Q1.masked_fill(~mask_legal, ILLEGAL_Q)
    action_star = torch.argmax(Q_legal, dim=1)
    score = Q2[0, action_star[0]]

# When `environment.mirror` is set, the board is mirrored back before printing
# and the output is labelled as Black's.
side_label = "Black:" if environment.mirror else "White:"
shown_board = board.mirror() if environment.mirror else board
print(side_label)
if has_legal_moves:
    print(f"score: {score.item():.4f}")
print(shown_board)

if board.is_checkmate():
    print("checkmate!")

if has_legal_moves:
    # Q-values of the legal actions (masked entries sit far below -100).
    print(Q_legal[Q_legal > -100])

    # Softmax over legal actions, shifted by the row maximum for stability;
    # only actions within 2 Q-units of the best one are shown.
    q_max = Q_legal.max(dim=1, keepdim=True).values
    logits = Q_legal - q_max
    probs = torch.softmax(logits / DISPLAY_SOFTMAX_TEMP, dim=1)
    print(probs[logits > -2])
else:
    print("no legal moves; Q-value diagnostics skipped")

temp = temp*0.95
print(f"temp: {temp:.4f}")

In [None]:
# Show the `done` flag returned by the most recent environment.step call.
done

In [None]:
# Inspect entry [0, 16] of the state tensor built in the play-step cell
# (presumably one feature plane of the first batch element -- TODO confirm).
state[0,16]

In [None]:
# Re-encode the environment's current board and inspect entry [0, 12].
# NOTE(review): the play-step cell encodes boards via
# `agent.board_logic.board_to_state`; confirm that `agent.board_to_state`
# exists and is equivalent.
agent.board_to_state(environment.get_board())[0,12]

In [None]:
# Load a different checkpoint pair ("model_conv_55000_episodes_*").
# NOTE(review): no cell in this notebook writes these files, and `model` must
# still be bound (see the earlier `del model`) -- confirm where they come from.
model = load_checkpoint("model_conv_55000_episodes_core.pth", "model_conv_55000_episodes_memory.pth", model)

In [None]:
# Display the model's `memory` attribute (presumably the replay memory -- TODO confirm).
model.memory

In [None]:
# Rebind `board` to a new python-chess board in the standard starting position.
# This replaces the board returned by the last play step; `environment.board`
# is not touched.
board = chess.Board()