# Self-Play

In [1]:
import sys
sys.path.insert(0, '../../src/')

import numpy as np
import matplotlib.pyplot as plt
import pickle
import random  # used by random.seed(...) in the setup cell; previously only reachable via a star import
import config
import torch
from tqdm.notebook import tqdm
from copy import copy, deepcopy
import cmath
import chess
from utils import *
from evals import *

from agents import *
from environments import *
from models import *
%matplotlib inline

# Show numpy arrays with three decimal places.
np.set_printoptions(precision=3)

# Report the torch / CUDA / cuDNN setup this kernel is running with.
for _label, _value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA toolkit version PyTorch was built with:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
    ("cuda available:", torch.cuda.is_available()),
):
    print(_label, _value)

# Request torch's 'high' float32 matmul precision setting.
torch.set_float32_matmul_precision('high')

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701
cuda available: True


In [2]:
# --- Reproducibility: seed the torch, stdlib `random`, and numpy RNGs with one value ---
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# --- Settings that must agree across several calls below ---
MAX_NUM_MOVES = 100                      # passed to both BoardLogic and Environment
LEARNING_RATE = 1e-4                     # AdamW learning rate for both online networks
WEIGHT_DECAY = 1e-5                      # weight decay handed to group_decay_parameters
NO_DECAY = ("bias", "GroupNorm.weight")  # name patterns passed as `no_decay`

agent = Agent(board_logic = BoardLogic(max_num_moves=MAX_NUM_MOVES),
              in_ch=20,
              ch=128,
              n_blocks=10,
              sample_policy=eps_greedy_policy,)

# Number of trainable parameters in the first online network.
print(sum(p.numel() for p in agent.online_net1.parameters() if p.requires_grad))

environment = Environment(max_num_moves=MAX_NUM_MOVES,
                          # Left off: the original note says blunder filtering causes a lot of
                          # draws in early self-play (the note was truncated; presumably
                          # "... if on" -- TODO confirm).
                          filter_blunder=False,
                          )

# Placeholder list handed to Model before the optimizers exist. The slots are filled
# in place further down, so the list object itself must not be rebound.
# NOTE(review): this relies on Model keeping a reference to this same list -- confirm.
opt_list = [None, None]

model = Model(agent = agent,
               environment = environment,
               mem_capacity = 1000000,
               init_mem = False,
               batch_size = 512,
               num_warmup = 100000,
               policy_update = 2,
               tau = 0.01,
               temp_scaler = TemperatureScaler(temp_start=0.5,
                                               temp_end=0.25,
                                               temp_min=1e-4,
                                               episode_decay=5000,
                                               transition_decay=0.9),
               opt_list=opt_list,
               scaler=torch.amp.GradScaler("cuda")
             )

# Build the parameter groups for each online network with identical settings.
# A fresh `no_decay` list is passed per call, exactly as the two separate calls did.
optimizer_grouped_parameters1, optimizer_grouped_parameters2 = [
    group_decay_parameters(
        net,
        weight_decay=WEIGHT_DECAY,
        no_decay=list(NO_DECAY),
    )
    for net in (agent.online_net1, agent.online_net2)
]

# Fill the placeholder slots in place (index assignment, not rebinding).
opt_list[0] = torch.optim.AdamW(optimizer_grouped_parameters1, lr=LEARNING_RATE)
opt_list[1] = torch.optim.AdamW(optimizer_grouped_parameters2, lr=LEARNING_RATE)


2987212


In [3]:
# Continue training from the 70000-episode checkpoint and save the result.
# The episode count in the saved filenames is derived from the numbers below, so
# changing NUM_EPISODES can no longer leave a mislabelled checkpoint on disk.
MODELS_DIR = "../models/"
START_EPISODES = 70000                          # episode count in the files being loaded
NUM_EPISODES = 5000                             # episodes trained by this cell
END_EPISODES = START_EPISODES + NUM_EPISODES    # 75000 -> used in the saved filenames
DEPTH, BREADTH = 2, 4                           # same lookahead values for evaluation and training

# Load the "core" and "memory" files into the model built in the setup cell.
# NOTE(review): presumably these hold network/optimizer state and the replay memory -- confirm.
model = load_checkpoint(f"{MODELS_DIR}model_eps_{START_EPISODES}_episodes_core.pth",
                        f"{MODELS_DIR}model_eps_{START_EPISODES}_episodes_memory.pth",
                        model)

# agent2 is an independent deep copy of the loaded agent, taken before training starts.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=100,
                                                       filter_blunder=False),
                             num_games=250,
                             temp = 0.25,
                             transition_decay=0.95,
                             depth=DEPTH,
                             breadth=BREADTH)

model.train(num_episodes = NUM_EPISODES,
            depth=DEPTH,
            breadth=BREADTH,
            evaluate_agents = eval_agents,
            freq=1000)

save_core(model, filename=f"{MODELS_DIR}model_lookahead_{END_EPISODES}_episodes_core.pth")
save_memory(model, filename=f"{MODELS_DIR}model_lookahead_{END_EPISODES}_episodes_memory.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

{1: 101, -1: 81, 0: 68} tensor(0.0004, device='cuda:0') 0.0644335327559326


  0%|          | 0/250 [00:00<?, ?it/s]

{1: 100, -1: 90, 0: 60} tensor(0.0004, device='cuda:0') 0.06629150565787706


  0%|          | 0/250 [00:00<?, ?it/s]

{1: 101, -1: 94, 0: 55} tensor(0.0005, device='cuda:0') 0.07035837776835009


  0%|          | 0/250 [00:00<?, ?it/s]

{1: 100, -1: 103, 0: 47} tensor(0.0004, device='cuda:0') 0.07073482635566657


  0%|          | 0/250 [00:00<?, ?it/s]

{1: 110, -1: 98, 0: 42} tensor(0.0003, device='cuda:0') 0.07188392393070565


In [3]:
# Continue training from the 75000-episode lookahead checkpoint and save the result.
# The episode count in the saved filenames is derived from the numbers below, so
# changing NUM_EPISODES can no longer leave a mislabelled checkpoint on disk.
MODELS_DIR = "../models/"
START_EPISODES = 75000                          # episode count in the files being loaded
NUM_EPISODES = 5000                             # episodes trained by this cell
END_EPISODES = START_EPISODES + NUM_EPISODES    # 80000 -> used in the saved filenames
DEPTH, BREADTH = 2, 4                           # same lookahead values for evaluation and training

# Load the "core" and "memory" files into the existing model object.
# NOTE(review): presumably these hold network/optimizer state and the replay memory -- confirm.
model = load_checkpoint(f"{MODELS_DIR}model_lookahead_{START_EPISODES}_episodes_core.pth",
                        f"{MODELS_DIR}model_lookahead_{START_EPISODES}_episodes_memory.pth",
                        model)


# agent2 is an independent deep copy of the loaded agent, taken before training starts.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=100,
                                                       filter_blunder=False),
                             num_games=250,
                             temp = 0.25,
                             transition_decay=0.95,
                             depth=DEPTH,
                             breadth=BREADTH)

model.train(num_episodes = NUM_EPISODES,
            depth=DEPTH,
            breadth=BREADTH,
            evaluate_agents = eval_agents,
            freq=1000)

save_core(model, filename=f"{MODELS_DIR}model_lookahead_{END_EPISODES}_episodes_core.pth")
save_memory(model, filename=f"{MODELS_DIR}model_lookahead_{END_EPISODES}_episodes_memory.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

{1: 108, -1: 106, 0: 36} tensor(0.0004, device='cuda:0') 0.07344761124079133


  0%|          | 0/250 [00:00<?, ?it/s]

{1: 88, -1: 137, 0: 25} tensor(0.0005, device='cuda:0') 0.07605241192048233


  0%|          | 0/250 [00:00<?, ?it/s]

{1: 86, -1: 126, 0: 38} tensor(0.0006, device='cuda:0') 0.07562150319116008


  0%|          | 0/250 [00:00<?, ?it/s]

{1: 80, -1: 128, 0: 42} tensor(0.0005, device='cuda:0') 0.07511992738252198


  0%|          | 0/250 [00:00<?, ?it/s]

{1: 98, -1: 106, 0: 46} tensor(0.0003, device='cuda:0') 0.07214045022544634
