# Curriculum Learning

In [1]:
import sys
sys.path.insert(0, '../../src/')

import numpy as np
import matplotlib.pyplot as plt
import pickle
import config
import torch
from tqdm.notebook import tqdm
from copy import copy, deepcopy
import cmath
import chess
from utils import saver, loader

from agents import *
from environments import *
from models import *
%matplotlib inline

# Print NumPy arrays with three digits of precision.
np.set_printoptions(precision=3)

# Report the PyTorch / CUDA / cuDNN stack this kernel is running on.
_build_info = (
    ("PyTorch version:", torch.__version__),
    ("CUDA toolkit version PyTorch was built with:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
    ("cuda available:", torch.cuda.is_available()),
)
for _label, _value in _build_info:
    print(_label, _value)

# 'high' lets float32 matmuls use faster reduced-precision internals
# (see the PyTorch docs for torch.set_float32_matmul_precision).
torch.set_float32_matmul_precision('high')

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701
cuda available: True


In [2]:
# `random` is imported explicitly here: the import cell never imports it, so
# `random.seed` previously only resolved if one of the `from ... import *`
# lines happened to leak a `random` name into the namespace.
import random

# Seed every RNG used in this notebook (PyTorch, stdlib random, NumPy).
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

agent = Agent(board_logic = BoardLogic(), in_ch=14, ch=128, n_blocks=8)
# Number of trainable parameters in the first online network.
print(sum(p.numel() for p in agent.online_net1.parameters() if p.requires_grad))

# Placeholder list handed to Model; it is filled in place further down, after
# the model has been constructed. It must be mutated (not rebound) so that the
# object Model received is the one that ends up holding the optimizers.
opt_list = [None, None]

# NOTE(review): the meaning of the individual hyperparameters (in particular
# the four `temp_constants`) is defined in the project's Model class, which is
# not visible from this notebook.
model = Model(agent = agent,
               environment = Environment(max_num_moves=200,),
               mem_capacity = 250000,
               batch_size = 512,
               num_warmup = 25000,
               policy_update = 2,
               target_update = 5000,
               temp_constants = (0.6, 0.3, 1e-3, 10000),
               opt_list=opt_list,
               scaler=torch.amp.GradScaler("cuda")
             )

# Same parameter grouping for both online networks: weight decay 1e-5, with
# names matching "bias" / "GroupNorm.weight" passed as the no-decay list.
optimizer_grouped_parameters1, optimizer_grouped_parameters2 = (
    group_decay_parameters(
        net,
        weight_decay=1e-5,
        no_decay=["bias", "GroupNorm.weight"],
    )
    for net in (agent.online_net1, agent.online_net2)
)

# One AdamW optimizer per online network, written into the existing list object.
opt_list[:] = [
    torch.optim.AdamW(optimizer_grouped_parameters1, lr=1e-4),
    torch.optim.AdamW(optimizer_grouped_parameters2, lr=1e-4),
]


2389452


In [3]:
# Resume from the checkpoint saved after 25000 episodes.
model = load_checkpoint("model_conv_25000_episodes_large.pth", model)

# Evaluate the agent held by the model against a deep copy of itself taken
# before this training run starts.
agent1 = model.agent
agent2 = deepcopy(agent1)
eval_agents = EvaluateAgents(agent1 = agent1,
                             agent2 = agent2,
                             environment = Environment(max_num_moves=200,),
                             num_games=500,
                             temp = 0.25)

checkpoint_out = "model_conv_30000_episodes_large.pth"

try:
    model.train(num_episodes = 5000,
                evaluate_agents = eval_agents,
                freq=1000)
except KeyboardInterrupt:
    # Training is long-running and gets stopped by hand; keep the progress made
    # so far instead of losing it, then let the interrupt propagate as before.
    # NOTE: a checkpoint written here may contain fewer than 5000 new episodes.
    save_checkpoint(model, filename=checkpoint_out)
    raise

save_checkpoint(model, filename=checkpoint_out)

  0%|          | 0/5000 [00:00<?, ?it/s]

If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
  torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 196, -1: 165, 0: 139} tensor(0.0052, device='cuda:0') 0.5038553457431083


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 177, -1: 184, 0: 139} tensor(0.0052, device='cuda:0') 0.4751294122907916


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 192, -1: 180, 0: 128} tensor(0.0032, device='cuda:0') 0.5079688577293404


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 169, -1: 188, 0: 143} tensor(0.0073, device='cuda:0') 0.37787426075263414


  0%|          | 0/500 [00:00<?, ?it/s]

{1: 171, -1: 182, 0: 147} tensor(0.0066, device='cuda:0') 0.5027590940070181


KeyboardInterrupt: 

In [4]:
# Standalone save of the current model state, so the weights can be written
# without re-running training (the recorded training run above ended in a
# KeyboardInterrupt).
# NOTE(review): the file name says 30000 episodes; after an interrupted run the
# model may have trained for fewer.
save_checkpoint(model, filename="model_conv_30000_episodes_large.pth")

In [28]:
# Starting temperature for the move-by-move inspection cell below, which
# multiplies it by 0.95 every time it is run.
temp = 0.1

# Fresh game for manual stepping. This cell does not reseed any RNG.
environment = Environment(max_num_moves=200)
environment.reset()

In [None]:
# Play one sampled move, then compare the two online Q-networks on the
# resulting position. Re-run this cell to step through a game move by move;
# `temp` is multiplied by 0.95 at the end of every run.
action = agent.select_action(environment, temp=temp, greedy=False)
move = agent.action_to_move(action)

board, (reward, done) = environment.step(move)

state = agent.board_logic.board_to_state(board).to(config.device)

# Inference only: run the forward passes without autograd bookkeeping rather
# than building a graph and detaching the result afterwards.
with torch.no_grad():
    Q1 = agent.online_net1(state)
    Q2 = agent.online_net2(state)

legal_moves = environment.get_legal_moves()
mask_legal = agent.get_mask_legal(legal_moves)

# In a finished game (e.g. checkmate) there are no legal moves; the statistics
# below would then be NaN (mean of an empty tensor, softmax over an all -inf
# row), so they are skipped.
has_legal_moves = bool(mask_legal.any())

if has_legal_moves:
    Q1_legal = Q1[mask_legal]
    Q2_legal = Q2[mask_legal]

    # Relative disagreement between the two networks on the legal moves. The
    # denominator is clamped so two exactly-zero Q-values give 0 instead of NaN.
    q_scale = torch.max(torch.abs(Q1_legal), torch.abs(Q2_legal))
    diff = torch.abs(Q1_legal - Q2_legal) / q_scale.clamp_min(torch.finfo(q_scale.dtype).tiny)

    # Mask illegal moves once with -inf; the same tensor feeds argmax and softmax.
    Q_legal = Q1.masked_fill(~mask_legal, -float('inf'))

    # Network 1 picks the best legal action, network 2 supplies its value.
    action_star = torch.argmax(Q_legal, dim=1)
    score = Q2[0, action_star[0]]

    # Max-shifted, temperature-scaled softmax over the legal moves.
    q_max = Q_legal.max(dim=1, keepdim=True).values
    logits = (Q_legal - q_max)/temp
    probs = torch.softmax(logits, dim=1)

    print(f"{diff.mean().item():.4f}")

# `environment.mirror` selects the "Black:" label and a mirrored board printout.
print("Black:" if environment.mirror else "White:")
if has_legal_moves:
    print(f"score: {score.item():.4f}")
print(board.mirror() if environment.mirror else board)

if board.is_checkmate():
    print("checkmate!")

if has_legal_moves:
    # Only the moves whose scaled logit is within 1 of the best move ...
    print(logits[logits > -1])
    # ... and the moves with softmax probability above 0.05.
    print(probs[probs > 0.05])

# Anneal the temperature for the next run of this cell.
temp = temp*0.95
print(f"temp: {temp:.4f}")

1.0256
Black:
score: -0.1093
r n b q k b n .
. p p . p p p .
p . . . . . . r
. . . p . . . P
. . . . . . . .
. . P P . P . .
P P . . P . . P
R N B Q K B N R
tensor([[-0.1268]], device='cuda:0')
tensor([-0.1537,  0.0000, -0.5983, -0.6836, -0.1277], device='cuda:0')
tensor([0.1212, 0.1413, 0.0777, 0.0713, 0.1244], device='cuda:0')
temp: 0.0630
