# Curriculum Learning

In [1]:
import sys
sys.path.insert(0, '../../src/')  # must run before the project-local imports below

import cmath
import pickle
import random
from copy import copy, deepcopy

import chess
import matplotlib.pyplot as plt
import numpy as np
import torch
from tqdm.notebook import tqdm

import config
from utils import saver, loader

# Wildcard imports are kept in their original order, since later ones may shadow earlier names.
from agents import *
from environments import *
from models import *
%matplotlib inline

# Print NumPy arrays with three digits of precision.
np.set_printoptions(precision=3)

# Report the PyTorch / CUDA / cuDNN stack this notebook is running on.
for _label, _value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA toolkit version PyTorch was built with:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
    ("cuda available:", torch.cuda.is_available()),
):
    print(_label, _value)

# Select the 'high' float32 matmul precision setting for the whole session.
torch.set_float32_matmul_precision("high")

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701
cuda available: True


In [2]:
# ---------------------------------------------------------------------------
# Experiment configuration. Values that were previously repeated as literals
# are named once here so the copies cannot drift apart.
# ---------------------------------------------------------------------------
SEED = 42
MAX_NUM_MOVES = 200                        # passed as `max_num_moves` to every Environment below
WEIGHT_DECAY = 1e-5                        # applied to both online networks
LEARNING_RATE = 1e-4                       # AdamW learning rate for both optimizers
NO_DECAY = ("bias", "GroupNorm.weight")    # name patterns handed to group_decay_parameters

# Seed every RNG used here (PyTorch, stdlib `random`, NumPy) from the same constant.
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

agent = Agent(board_logic=BoardLogic(), in_ch=14, ch=128, n_blocks=10)
# Number of trainable parameters in the first online network.
print(sum(p.numel() for p in agent.online_net1.parameters() if p.requires_grad))
environment = Environment(max_num_moves=MAX_NUM_MOVES)

# Placeholder list handed to Model; the optimizers are written into it by index
# further down. NOTE(review): this only works if Model keeps a reference to this
# same list object rather than copying it -- confirm in Model.__init__.
opt_list = [None, None]

model = Model(
    agent=agent,
    environment=environment,
    mem_capacity=1000000,
    batch_size=512,
    num_warmup=100000,
    policy_update=2,
    tau=0.01,
    temp_scaler=TemperatureScaler(
        temp_start=0.6,
        temp_end=0.3,
        temp_min=1e-5,
        episode_decay=5000,
        transition_decay=0.95,
    ),
    opt_list=opt_list,
    scaler=torch.amp.GradScaler("cuda"),
)

# Parameter groups for each online network. A fresh list of the no-decay
# patterns is passed per call, as in the original two list literals.
optimizer_grouped_parameters1 = group_decay_parameters(
    agent.online_net1,
    weight_decay=WEIGHT_DECAY,
    no_decay=list(NO_DECAY),
)

optimizer_grouped_parameters2 = group_decay_parameters(
    agent.online_net2,
    weight_decay=WEIGHT_DECAY,
    no_decay=list(NO_DECAY),
)

# Fill the placeholders in place so `opt_list` stays the same list object.
opt_list[0] = torch.optim.AdamW(optimizer_grouped_parameters1, lr=LEARNING_RATE)
opt_list[1] = torch.optim.AdamW(optimizer_grouped_parameters2, lr=LEARNING_RATE)

# Initial evaluator: the agent against a deep copy of itself. The training
# cells further down rebuild `eval_agents` before using it.
eval_agents = EvaluateAgents(
    agent1=agent,
    agent2=deepcopy(agent),
    environment=Environment(max_num_moves=MAX_NUM_MOVES),
    num_games=200,
    temp=0.25,
)


2980300


In [5]:
# Evaluation pairing: the model's agent (agent1) against a deep-copied
# snapshot of it taken before this round of training (agent2).
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=100,
    temp=0.25,
)

# Train for 5000 episodes; the evaluator and `freq=1000` are forwarded to
# Model.train (presumably the evaluation interval -- confirm in Model.train).
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Persist the model after the first 5000 episodes.
save_checkpoint(model, filename="model_conv_5000_episodes_large.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

{1: 27, -1: 37, 0: 36} None 1.493878228491088


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 28, -1: 29, 0: 43} tensor(0.0011, device='cuda:0') 1.081297152038855


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 44, -1: 22, 0: 34} tensor(0.0005, device='cuda:0') 1.132863427680351


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 39, -1: 25, 0: 36} tensor(0.0003, device='cuda:0') 1.1178996105969303


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 51, -1: 21, 0: 28} tensor(0.0010, device='cuda:0') 0.9628731062551613


In [6]:
# Re-snapshot the opponent: agent2 is a deep copy of the model's agent as it
# stands at the start of this round; agent1 is the model's agent itself.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=100,
    temp=0.25,
)

# Second round of 5000 episodes, with the same evaluator/freq arguments as before.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Checkpoint name reflects the cumulative episode count (10000).
save_checkpoint(model, filename="model_conv_10000_episodes_large.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

{1: 49, -1: 32, 0: 19} tensor(0.0010, device='cuda:0') 0.855189287377631


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 52, -1: 35, 0: 13} tensor(0.0013, device='cuda:0') 0.7794779815248164


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 60, -1: 23, 0: 17} tensor(0.0020, device='cuda:0') 0.6365601338034669


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 71, -1: 22, 0: 7} tensor(0.0014, device='cuda:0') 0.5928371940208308


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 74, -1: 20, 0: 6} tensor(0.0015, device='cuda:0') 0.6694762774045103


In [7]:
# agent1 is the model's agent; agent2 is a deep copy of it made before
# this round starts, used as the second player in evaluation.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=100,
    temp=0.25,
)

# Third round of 5000 episodes.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Checkpoint name reflects the cumulative episode count (15000).
save_checkpoint(model, filename="model_conv_15000_episodes_large.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  temp = np.exp(random.uniform(np.log(self.temp_min), np.log(temp_max)))
  temp = np.exp(random.uniform(np.log(self.temp_min), np.log(temp_max)))


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 63, -1: 19, 0: 18} tensor(0.0021, device='cuda:0') 1.2037272134206618


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 76, -1: 8, 0: 16} tensor(0.0477, device='cuda:0') 0.7979454168050837


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 77, -1: 4, 0: 19} tensor(0.1228, device='cuda:0') 0.7085647642204206


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 79, -1: 6, 0: 15} tensor(0.1814, device='cuda:0') 0.935481687289628


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 81, -1: 6, 0: 13} tensor(0.1514, device='cuda:0') 0.9827070138618058


In [3]:
# Resume from the 15000-episode checkpoint (restored via the existing `model`).
model = load_checkpoint("model_conv_15000_episodes_large.pth", model)

# agent1 is the restored agent; agent2 is a deep copy of it taken before training.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=100,
    temp=0.25,
)

# Another 5000 episodes on top of the restored state.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Checkpoint name reflects the cumulative episode count (20000).
save_checkpoint(model, filename="model_conv_20000_episodes_large.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

{1: 36, -1: 38, 0: 26} tensor(0.0162, device='cuda:0') 0.9601022767052492


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 37, -1: 32, 0: 31} tensor(0.0765, device='cuda:0') 0.8221336855697072


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 43, -1: 24, 0: 33} tensor(0.0192, device='cuda:0') 0.7061614217511094


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 49, -1: 32, 0: 19} tensor(0.0067, device='cuda:0') 0.5519565954337848


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 42, -1: 32, 0: 26} tensor(0.0045, device='cuda:0') 0.4910029642335249


In [None]:
# Resume from the 20000-episode checkpoint (restored via the existing `model`).
model = load_checkpoint("model_conv_20000_episodes_large.pth", model)

agent1 = model.agent
# The snapshot copy is made with gradient tracking disabled, as in the original
# cell. NOTE(review): the other training cells copy without this context.
with torch.no_grad():
    agent2 = deepcopy(agent1)

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=100,
    temp=0.25,
)

# Another 5000 episodes on top of the restored state.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Checkpoint name reflects the cumulative episode count (25000).
save_checkpoint(model, filename="model_conv_25000_episodes_large.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

## Eval

In [6]:
# Compare the 20000-episode agent (agent1) with the 15000-episode agent (agent2).
#
# Both checkpoints are loaded through the same `model` object. If
# `load_checkpoint` restores weights in place (it is handed the existing model),
# a plain `agent1 = model.agent` would be overwritten by the second load and
# the match would pit the 15000-episode agent against itself. Taking a deep
# copy right after the first load keeps agent1 independent either way.
model = load_checkpoint("model_conv_20000_episodes_large.pth", model)
agent1 = deepcopy(model.agent)

model = load_checkpoint("model_conv_15000_episodes_large.pth", model)
agent2 = model.agent
# NOTE: `model` was last loaded from the 15000-episode checkpoint; reload
# the desired checkpoint before resuming training after this cell.

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=100,
    temp=0.25,
)

results = eval_agents.evaluate()
print(results)

  0%|          | 0/100 [00:00<?, ?it/s]

{1: 71, -1: 0, 0: 29}


In [3]:
# Compare the 20000-episode agent (agent1) with the 10000-episode agent (agent2),
# using temp=0.1 for this match.
#
# Both checkpoints are loaded through the same `model` object. If
# `load_checkpoint` restores weights in place (it is handed the existing model),
# a plain `agent1 = model.agent` would be overwritten by the second load and
# the match would pit the 10000-episode agent against itself. Taking a deep
# copy right after the first load keeps agent1 independent either way.
model = load_checkpoint("model_conv_20000_episodes_large.pth", model)
agent1 = deepcopy(model.agent)

model = load_checkpoint("model_conv_10000_episodes_large.pth", model)
agent2 = model.agent
# NOTE: `model` was last loaded from the 10000-episode checkpoint; reload
# the desired checkpoint before resuming training after this cell.

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=100,
    temp=0.1,
)

results = eval_agents.evaluate()
print(results)

  0%|          | 0/100 [00:00<?, ?it/s]

{1: 75, -1: 0, 0: 25}


## Play

In [136]:
# Fresh environment for the interactive play-through in the next cell.
environment = Environment(max_num_moves=200)
environment.reset()

# Optional: uncomment to seed the RNGs before playing.
# random.seed(42)
# np.random.seed(42)
# torch.manual_seed(42)

# Starting temperature; the play cell multiplies it by 0.95 on every execution.
temp = 0.01

In [179]:
# One interactive step: let the agent pick and play a move, then inspect the
# two online networks' Q-values for the resulting position. Re-run this cell to
# advance the game by one move; `temp` is multiplied by 0.95 on every execution.

action = agent.select_action(environment, temp=temp, greedy=False)
move = agent.action_to_move(action)

board, (reward, done) = environment.step(move)

state = agent.board_logic.board_to_state(board).to(config.device)

# Inference only: skip building autograd graphs instead of detaching afterwards.
with torch.no_grad():
    Q1 = agent.online_net1(state)
    Q2 = agent.online_net2(state)

legal_moves = environment.get_legal_moves()
mask_legal = agent.get_mask_legal(legal_moves)

# With no legal move the mask is empty: the mean below would be NaN, the argmax
# meaningless and the softmax NaN. Detect that case and skip the Q-value
# analysis instead.
has_legal_moves = bool(mask_legal.any())

# `environment.mirror` selects the "Black:" label and a mirrored board for display.
side_label = "Black:" if environment.mirror else "White:"
board_to_show = board.mirror() if environment.mirror else board

if has_legal_moves:
    Q1_legal = Q1[mask_legal]
    Q2_legal = Q2[mask_legal]

    # Relative disagreement between the two networks on the legal actions.
    # The denominator is clamped to the smallest positive normal value of the
    # dtype so that a pair of exactly-zero Q-values yields 0 instead of NaN.
    denom = torch.max(torch.abs(Q1_legal), torch.abs(Q2_legal))
    denom = denom.clamp_min(torch.finfo(denom.dtype).tiny)
    diff = torch.abs(Q1_legal - Q2_legal) / denom
    print(f"{np.mean(diff.cpu().numpy()):.4f}")

    # Illegal actions are masked out once; the same tensor serves the argmax
    # here and the softmax further down.
    Q_legal = Q1.masked_fill(~mask_legal, -float('inf'))

    # Best action according to net 1, scored by net 2.
    action_star = torch.argmax(Q_legal, dim=1)
    score = Q2[0, action_star[0]]

    print(side_label)
    print(f"score: {score.item():.4f}")
    print(board_to_show)
else:
    print(side_label)
    print(board_to_show)

if board.is_checkmate():
    print("checkmate!")

if has_legal_moves:
    #print(Q_legal[Q_legal>-1])
    q_max = Q_legal.max(dim=1, keepdim=True).values
    print(q_max)

    # Softmax over the masked Q-values at the current temperature; subtracting
    # the row maximum first leaves the result unchanged and avoids overflow.
    logits = (Q_legal - q_max) / temp
    probs = torch.softmax(logits, dim=1)
    print(probs[probs > 0.05])
else:
    print(f"no legal moves (done={done}); call environment.reset() to start a new game")

# Decay the temperature for the next execution of this cell; the printed value
# is the one the next move will use.
temp = temp * 0.95
print(f"temp: {temp:.4f}")

0.8416
Black:
score: 0.0404
r . . . Q . . .
p p . b . p p .
. . . . . k . .
q . . p . B p .
. b . N n . . .
. . . . P . . .
P . P . K P . .
R N . . . . . R
tensor([[0.2265]], device='cuda:0')
tensor([0.0521, 0.9479], device='cuda:0')
temp: 0.0011
