# Curriculum Learning

In [1]:
import cmath
import pickle
import random
import sys
from copy import copy, deepcopy

# Project sources live outside the notebook directory; extend the path before
# any third-party or project-local import below so resolution order is unchanged.
sys.path.insert(0, '../../src/')

import chess
import matplotlib.pyplot as plt
import numpy as np
import torch
from tqdm.notebook import tqdm

import config
from utils import saver, loader

from agents import *
from environments import *
from models import *
%matplotlib inline

# Print NumPy arrays with three digits of precision.
np.set_printoptions(precision=3)

# Report the PyTorch / CUDA / cuDNN setup this kernel is running on.
for label, value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA toolkit version PyTorch was built with:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
    ("cuda available:", torch.cuda.is_available()),
):
    print(label, value)

# Select the 'high' float32 matmul precision setting for the whole session.
torch.set_float32_matmul_precision("high")

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701
cuda available: True


In [2]:
# Seed every RNG used by the notebook from a single constant.
# `random` must be imported explicitly in the imports cell for this to be
# independent of what the wildcard imports happen to export.
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

agent = Agent(board_logic=BoardLogic(), in_ch=14, ch=64, n_blocks=6)
# Number of trainable parameters in the first online network.
print(sum(p.numel() for p in agent.online_net1.parameters() if p.requires_grad))
environment = Environment(max_num_moves=200)

# Two-slot placeholder handed to Model now and filled in place further down.
# NOTE(review): this only works if Model keeps a reference to this exact list
# rather than copying it — confirm in Model.__init__.
opt_list = [None, None]

model = Model(
    agent=agent,
    environment=environment,
    mem_capacity=100000,
    batch_size=256,
    num_warmup=10000,
    policy_update=2,
    tau=0.01,
    temp_scaler=TemperatureScaler(
        temp_start=0.6,
        temp_end=0.3,
        temp_min=1e-5,
        episode_decay=5000,
        transition_decay=0.95,
    ),
    opt_list=opt_list,
    scaler=torch.amp.GradScaler("cuda"),
)

# One AdamW optimizer per online network, built with identical settings.
# The parameter groups come from group_decay_parameters, which is given the
# "bias" and "GroupNorm.weight" patterns as its no_decay list.
for slot, net in enumerate((agent.online_net1, agent.online_net2)):
    grouped_parameters = group_decay_parameters(
        net,
        weight_decay=1e-5,
        no_decay=["bias", "GroupNorm.weight"],
    )
    opt_list[slot] = torch.optim.AdamW(grouped_parameters, lr=1e-4)

# Evaluator pitting the agent against a deep copy of itself.
# NOTE(review): the training cells below build their own EvaluateAgents before
# calling model.train, so this instance looks redundant — confirm before removing.
eval_agents = EvaluateAgents(
    agent1=agent,
    agent2=deepcopy(agent),
    environment=Environment(max_num_moves=200),
    num_games=200,
    temp=0.25,
)


456972


In [3]:
# Evaluation pairing: the agent being trained versus an independent deep copy
# of it taken right before this training run.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=100,
    temp=0.25,
)

# 5000 training episodes with the evaluator attached.
# `freq` is presumably the evaluation interval in episodes — TODO confirm in Model.train.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

# NOTE(review): the stored output of this cell ends with
# "AttributeError: 'Model' object has no attribute 'memory_pos'" after five
# evaluation reports; which call raised it is not visible in the notebook.
save_checkpoint(model, filename="model_conv_5000_episodes_small.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

{1: 41, -1: 18, 0: 41} tensor(0.0006, device='cuda:0') 1.0055442894185622


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 36, -1: 22, 0: 42} tensor(0.0001, device='cuda:0') 0.9958092864821939


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 39, -1: 22, 0: 39} tensor(0.0003, device='cuda:0') 1.0777850253909242


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 51, -1: 14, 0: 35} tensor(0.0001, device='cuda:0') 1.3529276678131685


  0%|          | 0/100 [00:00<?, ?it/s]

{1: 62, -1: 13, 0: 25} tensor(0.0007, device='cuda:0') 0.860756922191849


AttributeError: 'Model' object has no attribute 'memory_pos'

In [None]:
# Take a new deep copy of the current agent as the evaluation opponent for
# this further training run.
agent1 = model.agent
agent2 = deepcopy(agent1)

eval_agents = EvaluateAgents(
    agent1=agent1,
    agent2=agent2,
    environment=Environment(max_num_moves=200),
    num_games=100,
    temp=0.25,
)

# Another 5000 episodes on the same `model` object, then a checkpoint under
# the "10000 episodes" file name.
model.train(num_episodes=5000, evaluate_agents=eval_agents, freq=1000)

save_checkpoint(model, filename="model_conv_10000_episodes_small.pth")

  0%|          | 0/5000 [00:00<?, ?it/s]

In [136]:
# Start a new game for the move-by-move inspection cell that follows.
# Seeding is left disabled here; call random.seed / np.random.seed /
# torch.manual_seed with a fixed value first if a repeatable game is needed.
environment = Environment(max_num_moves=200)
environment.reset()

# Starting sampling temperature; the inspection cell multiplies it by 0.95 on
# every run.
temp = 0.01

In [179]:
# Play one move in `environment` and compare the agent's two online Q-networks
# on the resulting position. Re-running the cell plays the next move; `temp`
# is multiplied by 0.95 at the end of every run, so the cell is deliberately
# stateful and depends on `agent`, `environment` and `temp` from earlier cells.

# Sample an action at the current temperature and apply it.
action = agent.select_action(environment, temp=temp, greedy=False)
move = agent.action_to_move(action)
board, (reward, done) = environment.step(move)

state = agent.board_logic.board_to_state(board).to(config.device)

# Inference only: turn autograd off instead of building a graph and detaching.
with torch.no_grad():
    Q1 = agent.online_net1(state)
    Q2 = agent.online_net2(state)

legal_moves = environment.get_legal_moves()
mask_legal = agent.get_mask_legal(legal_moves)

# If the mask selects nothing, the statistics below would be taken over an
# empty set (NaN mean, NaN softmax), so they are skipped for such positions.
has_legal = bool(mask_legal.any())

if has_legal:
    Q1_legal = Q1[mask_legal]
    Q2_legal = Q2[mask_legal]

    # Per-action relative disagreement between the two networks. The clamp
    # keeps the denominator strictly positive so two exact zeros give 0, not NaN.
    scale = torch.max(Q1_legal.abs(), Q2_legal.abs())
    scale = scale.clamp_min(torch.finfo(scale.dtype).tiny)
    diff = (Q1_legal - Q2_legal).abs() / scale

    # Illegal actions are set to -inf once: they can never win the argmax and
    # receive exactly zero softmax probability.
    Q_legal = Q1.masked_fill(~mask_legal, -float('inf'))

    # Best legal action according to network 1, valued by network 2.
    action_star = torch.argmax(Q_legal, dim=1)
    score = Q2[0, action_star[0]]

    # Subtract the row maximum before dividing by the temperature so the
    # largest logit is 0; the softmax result is mathematically unchanged.
    q_max = Q_legal.max(dim=1, keepdim=True).values
    logits = (Q_legal - q_max) / temp
    probs = torch.softmax(logits, dim=1)

    print(f"{diff.mean().item():.4f}")

# The board is printed mirrored, under the "Black:" label, whenever
# environment.mirror is set.
if environment.mirror:
    print("Black:")
else:
    print("White:")
if has_legal:
    print(f"score: {score.item():.4f}")
print(board.mirror() if environment.mirror else board)

if board.is_checkmate():
    print("checkmate!")
# Surface the termination flag returned by environment.step.
if done:
    print(f"done (reward: {reward})")

if has_legal:
    print(q_max)
    # Only actions that keep more than 5% of the probability mass are shown.
    print(probs[probs > 0.05])
else:
    print("no legal moves: Q-value statistics skipped")

# Anneal the sampling temperature for the next run of this cell.
temp = temp * 0.95
print(f"temp: {temp:.4f}")

0.8416
Black:
score: 0.0404
r . . . Q . . .
p p . b . p p .
. . . . . k . .
q . . p . B p .
. b . N n . . .
. . . . P . . .
P . P . K P . .
R N . . . . . R
tensor([[0.2265]], device='cuda:0')
tensor([0.0521, 0.9479], device='cuda:0')
temp: 0.0011
