# Curriculum Learning

In [1]:
import sys
sys.path.insert(0, '../../src/')

import cmath
import pickle
import random
from copy import copy, deepcopy

import chess
import matplotlib.pyplot as plt
import numpy as np
import torch
from tqdm.notebook import tqdm

import config
from utils import saver, loader

from agents import *
from environments import *
from models import *
%matplotlib inline

# Show three decimals when NumPy arrays are printed.
np.set_printoptions(precision=3)

# Record the PyTorch / CUDA / cuDNN build next to the results produced below.
print("\n".join(
    f"{label} {value}"
    for label, value in (
        ("PyTorch version:", torch.__version__),
        ("CUDA toolkit version PyTorch was built with:", torch.version.cuda),
        ("cuDNN version:", torch.backends.cudnn.version()),
        ("cuda available:", torch.cuda.is_available()),
    )
))

# 'high' lets float32 matmuls use faster reduced-precision internals (e.g. TF32)
# where the backend supports them.
torch.set_float32_matmul_precision('high')

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701
cuda available: True


In [2]:
# --- Reproducibility: seed every RNG source this notebook draws from. ---
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# --- Agent and training environment. ---
agent = Agent(board_logic=BoardLogic(), in_ch=14, ch=128, n_blocks=10)
# Number of trainable parameters in the first online network.
print(sum(param.numel() for param in agent.online_net1.parameters() if param.requires_grad))
environment = Environment(max_num_moves=200)

# Placeholder that is handed to Model now and filled with the two optimizers
# further down; this relies on Model keeping a reference to this very list
# (NOTE(review): confirm Model does not copy `opt_list`).
opt_list = [None, None]

temperature_schedule = TemperatureScaler(
    temp_start=0.6,
    temp_end=0.3,
    temp_min=1e-5,
    episode_decay=5000,
    transition_decay=0.95,
)

model = Model(
    agent=agent,
    environment=environment,
    mem_capacity=1_000_000,
    batch_size=512,
    num_warmup=100_000,
    policy_update=2,
    tau=0.01,
    temp_scaler=temperature_schedule,
    opt_list=opt_list,
    scaler=torch.amp.GradScaler("cuda"),
)

# --- Optimizers: one AdamW per online network; parameters whose names match
# --- the `no_decay` entries are grouped separately by group_decay_parameters.
optimizer_grouped_parameters1 = group_decay_parameters(
    agent.online_net1, weight_decay=1e-5, no_decay=["bias", "GroupNorm.weight"]
)
optimizer_grouped_parameters2 = group_decay_parameters(
    agent.online_net2, weight_decay=1e-5, no_decay=["bias", "GroupNorm.weight"]
)

opt_list[0] = torch.optim.AdamW(optimizer_grouped_parameters1, lr=1e-4)
opt_list[1] = torch.optim.AdamW(optimizer_grouped_parameters2, lr=1e-4)

# --- Default evaluator: the agent against a deep copy of itself. ---
# The training cells below build their own evaluator before calling train().
eval_agents = EvaluateAgents(
    agent1=agent,
    agent2=deepcopy(agent),
    environment=Environment(max_num_moves=200),
    num_games=200,
    temp=0.25,
)


2980300


In [None]:
# Stage ending at 5,000 cumulative episodes.
# agent2 is a deep-copied snapshot of the agent as it stands before this stage;
# agent1 is the model's own agent object.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=100,
    temp=0.25,
)

# `freq` is presumably the evaluation interval in episodes -- confirm in Model.train.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# The filename records the cumulative episode count.
save_checkpoint(model, filename="model_conv_5000_episodes_large.pth")

In [None]:
# Stage ending at 10,000 cumulative episodes; continues from the in-memory model.
# agent2 is a deep-copied snapshot of the agent taken before this stage starts.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=100,
    temp=0.25,
)

# `freq` is presumably the evaluation interval in episodes -- confirm in Model.train.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# The filename records the cumulative episode count.
save_checkpoint(model, filename="model_conv_10000_episodes_large.pth")

In [None]:
# Stage ending at 15,000 cumulative episodes; continues from the in-memory model.
# agent2 is a deep-copied snapshot of the agent taken before this stage starts.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=100,
    temp=0.25,
)

# `freq` is presumably the evaluation interval in episodes -- confirm in Model.train.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# The filename records the cumulative episode count.
save_checkpoint(model, filename="model_conv_15000_episodes_large.pth")

In [None]:
# Stage ending at 20,000 cumulative episodes; resumes from the previous stage's checkpoint.
model = load_checkpoint("model_conv_15000_episodes_large.pth", model)

# agent2 is a deep-copied snapshot of the freshly loaded agent.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=100,
    temp=0.25,
)

# `freq` is presumably the evaluation interval in episodes -- confirm in Model.train.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# The filename records the cumulative episode count.
save_checkpoint(model, filename="model_conv_20000_episodes_large.pth")

In [None]:
# Stage ending at 25,000 cumulative episodes; resumes from the previous stage's checkpoint.
model = load_checkpoint("model_conv_20000_episodes_large.pth", model)

# agent2 is a deep-copied snapshot of the freshly loaded agent.
agent1 = model.agent
agent2 = deepcopy(agent1)

# From this stage on each evaluation plays 500 games instead of 100.
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=500,
    temp=0.25,
)

# `freq` is presumably the evaluation interval in episodes -- confirm in Model.train.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# The filename records the cumulative episode count.
save_checkpoint(model, filename="model_conv_25000_episodes_large.pth")

In [None]:
# Stage ending at 30,000 cumulative episodes; resumes from the previous stage's checkpoint.
model = load_checkpoint("model_conv_25000_episodes_large.pth", model)

# agent2 is a deep-copied snapshot of the freshly loaded agent.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=500,
    temp=0.25,
)

# `freq` is presumably the evaluation interval in episodes -- confirm in Model.train.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# The filename records the cumulative episode count.
save_checkpoint(model, filename="model_conv_30000_episodes_large.pth")

In [None]:
# Stage ending at 35,000 cumulative episodes; resumes from the previous stage's checkpoint.
model = load_checkpoint("model_conv_30000_episodes_large.pth", model)

# agent2 is a deep-copied snapshot of the freshly loaded agent.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=500,
    temp=0.25,
)

# `freq` is presumably the evaluation interval in episodes -- confirm in Model.train.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# The filename records the cumulative episode count.
save_checkpoint(model, filename="model_conv_35000_episodes_large.pth")

In [7]:
# Stage ending at 40,000 cumulative episodes; resumes from the 35,000-episode checkpoint.
model = load_checkpoint("model_conv_35000_episodes_large.pth", model)

# agent2 is a deep-copied snapshot of the freshly loaded agent.
agent1 = model.agent
agent2 = deepcopy(agent1)

# New in this stage: blunder filtering is switched on, both in the training
# environment and in the evaluation environment built below.
model.environment.filter_blunder = True

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200, filter_blunder=True),
    num_games=200,
    temp=0.25,
)

# `freq` is presumably the evaluation interval in episodes -- confirm in Model.train.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# Only reached when train() runs to completion.
save_checkpoint(model, filename="model_conv_40000_episodes_large.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

{1: 38, -1: 34, 0: 128} tensor(0.0013, device='cuda:0') 0.11869323918796784


  0%|          | 0/200 [00:00<?, ?it/s]

{1: 33, -1: 24, 0: 143} tensor(0.0009, device='cuda:0') 0.11177589415627368


KeyboardInterrupt: 

In [8]:
# Run one standalone evaluation with the most recently built `eval_agents` and
# print the returned tally (keys 1 / -1 / 0 -- presumably wins / losses / draws
# from agent1's point of view; confirm in EvaluateAgents).
print(eval_agents.evaluate())

  0%|          | 0/200 [00:00<?, ?it/s]

{1: 41, -1: 32, 0: 127}


## Eval

In [3]:
# First contestant: a private deep copy of the agent from the 35,000-episode checkpoint.
model = load_checkpoint("model_conv_35000_episodes_large.pth", model)
agent1 = deepcopy(model.agent)

# Second contestant: the model's own agent after loading a checkpoint again.
# Both loads name the same file here, so this is the checkpoint against itself;
# point either load at a different file to compare two training stages.
model = load_checkpoint("model_conv_35000_episodes_large.pth", model)
agent2 = model.agent

# Evaluation environment without blunder filtering.
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=100,
    temp=0.25,
)

results = eval_agents.evaluate()
print(results)

  0%|          | 0/100 [00:00<?, ?it/s]

{1: 33, -1: 36, 0: 31}


In [3]:
# First contestant: a private deep copy of the agent from the 35,000-episode checkpoint.
model = load_checkpoint("model_conv_35000_episodes_large.pth", model)
agent1 = deepcopy(model.agent)

# Second contestant: the model's own agent after loading a checkpoint again.
# Both loads name the same file here, so this is the checkpoint against itself;
# point either load at a different file to compare two training stages.
model = load_checkpoint("model_conv_35000_episodes_large.pth", model)
agent2 = model.agent

# Same match-up as the plain evaluation, but with blunder filtering enabled.
eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200, filter_blunder=True),
    num_games=100,
    temp=0.25,
)

results = eval_agents.evaluate()
print(results)

  0%|          | 0/100 [00:00<?, ?it/s]

{1: 18, -1: 19, 0: 63}


## Play

In [294]:
# Restore the 35,000-episode checkpoint before stepping through a game by hand.
# NOTE(review): the play cell below drives the notebook-level `agent`, not
# `model.agent` -- confirm load_checkpoint updates that same object.
model = load_checkpoint("model_conv_35000_episodes_large.pth", model)

In [319]:
# Starting sampling temperature for the manual game; the play cell multiplies
# it by 0.95 every time it is executed.
temp = 0.1

# Fresh board with blunder filtering on. This rebinds the notebook-level name
# `environment`; the object passed to Model earlier is a different instance.
# RNG seeding is switched off here (previously: random / np.random / torch
# seeded with 42), so successive playthroughs can differ.
environment = Environment(max_num_moves=200, filter_blunder=True)
environment.reset()

In [334]:
# Play ONE ply and inspect the position that results. Re-run this cell to
# advance the game: each execution steps `environment` and decays `temp`,
# so the cell is intentionally not idempotent.
# NOTE(review): uses the notebook-level `agent`; other cells read `model.agent`
# -- confirm both names refer to the same object after load_checkpoint.
action = agent.select_action(environment, temp=temp, greedy=False)
move = agent.action_to_move(action)

board, (reward, done) = environment.step(move)

state = agent.board_logic.board_to_state(board).to(config.device)

# Inference only: skip autograd bookkeeping instead of building a graph and
# detaching the result afterwards.
with torch.no_grad():
    Q1 = agent.online_net1(state)
    Q2 = agent.online_net2(state)

legal_moves = environment.get_legal_moves()
mask_legal = agent.get_mask_legal(legal_moves)

# With no legal move left, every statistic below would be an empty mean, an
# argmax over an all-masked row, or a softmax over all -inf (NaN) -- skip them.
has_legal_moves = bool(mask_legal.any())

if has_legal_moves:
    Q1_legal = Q1[mask_legal]
    Q2_legal = Q2[mask_legal]

    # Mean relative disagreement between the two online networks on legal moves.
    # The denominator is clamped to machine epsilon so a pair of exact zeros
    # yields 0 rather than NaN.
    magnitude = torch.max(torch.abs(Q1_legal), torch.abs(Q2_legal))
    diff = torch.abs(Q1_legal - Q2_legal) / magnitude.clamp_min(torch.finfo(magnitude.dtype).eps)
    print(f"{np.mean(diff.cpu().numpy()):.4f}")

    # Best legal action according to network 1, scored by network 2.
    Q_legal = Q1.masked_fill(~mask_legal, -1e9)
    action_star = torch.argmax(Q_legal, dim=1).to(config.device)
    score = Q2[0, action_star[0]]

# When `environment.mirror` is set the board is printed mirrored and labelled
# "Black:" (presumably the position is stored from the mover's perspective --
# confirm in Environment).
print("Black:" if environment.mirror else "White:")
if has_legal_moves:
    print(f"score: {score.item():.4f}")
print(board.mirror() if environment.mirror else board)

if board.is_checkmate():
    print("checkmate!")

if has_legal_moves:
    # Same softmax-over-legal-moves computation, shown step by step:
    # illegal entries are -inf so they receive probability 0.
    Q_legal = Q1.masked_fill(~mask_legal, -float('inf'))

    q_max = Q_legal.max(dim=1, keepdim=True).values
    print(q_max)

    logits = (Q_legal - q_max)/temp
    # Ten largest logits (0 is the best move by construction).
    print(torch.sort(logits, descending=True).values[0,:10])
    probs = torch.softmax(logits, dim=1)
    # Probabilities of the moves whose temperature-scaled gap to the best is below 1.
    print(probs[logits>-1])
else:
    print(f"no legal moves - game over (reward: {reward}, done: {done})")

# Anneal the sampling temperature for the next execution of this cell.
temp = temp*0.95
print(f"temp: {temp:.4f}")

0.2543
Black:
score: -0.2364
r n b q . b . r
p p p p . k . .
. . . . . n . p
. . . . P p . P
. . . . . P . .
. . . . P . . .
P P P . . . . .
R N B Q K B N R
tensor([[-0.1655]], device='cuda:0')
tensor([ 0.0000, -0.2062, -0.3657, -0.4584, -0.5254, -0.5477, -0.6247, -0.6523,
        -0.6536, -0.6556], device='cuda:0')
tensor([0.0881, 0.0557, 0.0457, 0.0336, 0.0456, 0.0459, 0.0427, 0.0435, 0.0458,
        0.0509, 0.0361, 0.0717, 0.0472, 0.0521, 0.0611], device='cuda:0')
temp: 0.0463


In [281]:
# Show whether the current board reports insufficient material
# (`environment.board` is presumably a python-chess Board -- confirm in Environment).
environment.board.is_insufficient_material()

True

In [16]:
# Show the environment's move counter (presumably moves played since the last
# reset -- confirm in Environment).
environment.move_count

10