In [None]:
%load_ext autoreload
%autoreload 2
from Agent import Agent, MoEAgent
from SudokuEnvironment import SudokuEnvironment
from torch.utils.tensorboard import SummaryWriter
import torch

In [None]:
device = 'cpu'
is_9x9_sudoku = False
if is_9x9_sudoku:
  n_actions = 729
  n_values = 10
  input_dim = 81
  with_legal_actions = True
  env = SudokuEnvironment(size=9, with_legal_actions=with_legal_actions)
else:
  # 6x6 sudoku
  n_actions = 216
  n_values = 7
  input_dim = 36
  with_legal_actions = True
  env = SudokuEnvironment(size=6, box_height=2, box_width=3, with_legal_actions=with_legal_actions)



# Sparse Reward and Dense Reward

In [None]:
sparse_reward = True

In [None]:
agent = Agent(
  n_actions=n_actions,
  n_values=n_values,
  input_dim=input_dim,
  train_mode=True,
  device=device
)

In [None]:
n_episodes = 50
n_empty_cells = 20
n_trials = 81 if is_9x9_sudoku else 36
sudoku_variants = 81 if is_9x9_sudoku else 81

run_name = input("Run name: ")
writer = SummaryWriter(comment=run_name)

agent.enable_train_mode()
if sparse_reward:
  env.set_sparse_reward()
else:
  env.set_dense_reward()

for _ in range(20):
  env.create_new()
  agent.reset_epsilon()
  for episode in range(n_episodes):
    # create 81 sudoku variants starting from the same solution
    mean_reward = 0
    wins = 0
    easy_wins = 0
    mean_loss = 0
    for sudoku_variants_id in range(sudoku_variants):
      env.add_remove_cells(n_empty_cells)
      n_trial_loss_sum = 0
      trial_loss = 0
      for trial in range(n_trials):
        state = env.grid.unsqueeze(0).clone()
        legal_actions = env.get_legal_actions()
        action = agent.take_action(state, legal_actions)
        next_state, reward, completed, win, next_legal_actions = env.step(action)
        agent.save_experience(state.squeeze(0), legal_actions, action, reward, next_state, next_legal_actions, completed)
        loss = agent.learn()
        if loss != None:
          trial_loss += loss
          n_trial_loss_sum += 1
        legal_actions = legal_actions.reshape(-1, 9)
        one_valid_action_per_cell = (legal_actions.sum(dim=1) == 1).sum().item()
        empty_cells = (state == 0).sum().item()  
        if trial == 0 and one_valid_action_per_cell == empty_cells:
          easy_wins += 1
        wins += win
        mean_reward += reward
        if completed:
          break
      mean_loss += trial_loss/(n_trial_loss_sum + 1e-15)
      env.reset()
    mean_loss /= sudoku_variants
    mean_reward /= sudoku_variants
    writer.add_scalar('loss', mean_loss)
    writer.add_scalar('wins', wins)
    writer.add_scalar('easy_wins', easy_wins)
    writer.add_scalar('reward', mean_reward)
    writer.add_scalar('n_empty_cells', n_empty_cells)
    # print(f"[Episode {episode + 1}/{n_episodes}][{n_empty_cells}] Loss: {mean_loss:.15f} \t Reward: {mean_reward:.1f} \t Wins: {wins}/81 \t Epsilon: {agent.epsilon} \t Easy wins: {easy_wins}")


In [None]:
import torch
torch.save(agent.behaviour_net.state_dict(), './weights/sparse_normal_learning_6x6_20.pt')

# Curriculum Learning

In [None]:
agent = Agent(
  n_actions=n_actions,
  n_values=n_values,
  input_dim=input_dim,
  train_mode=True,
  device=device
)

In [None]:
n_episodes = 50
n_trials = 81 if is_9x9_sudoku else 36
sudoku_variants = 81 if is_9x9_sudoku else 81
empty_cells_range = (2, 21)

run_name = input("Run name: ")
writer = SummaryWriter(comment=run_name)

loss_log = {}
for k in range(empty_cells_range[0], empty_cells_range[1]):
  loss_log[k] = []

agent.enable_train_mode()

for n_empty_cells in range(empty_cells_range[0], empty_cells_range[1]):
  env.create_new()
  agent.reset_epsilon()
  for episode in range(n_episodes):
    # create 81 sudoku variants starting from the same solution
    mean_reward = 0
    wins = 0
    easy_wins = 0
    mean_loss = 0
    for sudoku_variants_id in range(sudoku_variants):
      env.add_remove_cells(n_empty_cells)
      n_trial_loss_sum = 0
      trial_loss = 0
      for trial in range(n_trials):
        state = env.grid.unsqueeze(0).clone()
        legal_actions = env.get_legal_actions()
        action = agent.take_action(state, legal_actions)
        next_state, reward, completed, win, next_legal_actions = env.step(action)
        agent.save_experience(state.squeeze(0), legal_actions, action, reward, next_state, next_legal_actions, completed)
        loss = agent.learn()
        if loss != None:
          trial_loss += loss
          n_trial_loss_sum += 1
        legal_actions = legal_actions.reshape(-1, 9)
        one_valid_action_per_cell = (legal_actions.sum(dim=1) == 1).sum().item()
        empty_cells = (state == 0).sum().item()  
        if trial == 0 and one_valid_action_per_cell == empty_cells:
          easy_wins += 1
        wins += win
        mean_reward += reward
        if completed:
          break
      mean_loss += trial_loss/(n_trial_loss_sum + 1e-15)
      env.reset()
    mean_loss /= sudoku_variants
    mean_reward /= sudoku_variants
    loss_log[n_empty_cells].append(mean_loss)
    writer.add_scalar('loss', mean_loss)
    writer.add_scalar('wins', wins)
    writer.add_scalar('easy_wins', easy_wins)
    writer.add_scalar('reward', mean_reward)
    writer.add_scalar('n_empty_cells', n_empty_cells)
    # print(f"[Episode {episode + 1}/{n_episodes}][{n_empty_cells}] Loss: {mean_loss:.15f} \t Reward: {mean_reward:.1f} \t Wins: {wins}/81 \t Epsilon: {agent.epsilon} \t Easy wins: {easy_wins}")


In [None]:
writer.flush()
writer.close()

In [None]:
import torch
torch.save(agent.behaviour_net.state_dict(), 'sparse_curriculum_learning_6x6_20.pt')

# MoE Agent

In [None]:
agent = MoEAgent(
  n_actions=n_actions,
  n_values=n_values,
  input_dim=input_dim,
  train_mode=True,
  device=device
)

In [None]:
n_episodes = 25
n_trials = 81 if is_9x9_sudoku else 36
sudoku_variants = 81 if is_9x9_sudoku else 81
empty_cells_range = (2, 31)

run_name = input("Run name: ")
writer = SummaryWriter(comment=run_name)

loss_log = {}
for k in range(empty_cells_range[0], empty_cells_range[1]):
  loss_log[k] = []

agent.enable_train_mode()

for n_empty_cells in range(empty_cells_range[0], empty_cells_range[1]):
  env.create_new()
  agent.reset_epsilon()
  if n_empty_cells % agent.empty_cells_group == 0: # if you start from 1 change it
    agent.clone_expert((n_empty_cells // agent.empty_cells_group) - 1)
    agent.align_target_behaviour_nets()
    agent.empty_buffer()
  for _ in range(3):
    for episode in range(n_episodes):
      # create 81 sudoku variants starting from the same solution
      mean_reward = 0
      wins = 0
      mean_loss = 0
      for sudoku_variants_id in range(sudoku_variants):
        env.add_remove_cells(n_empty_cells)
        n_trial_loss_sum = 0
        trial_loss = 0
        # for trial in range(n_trials):
        while n_empty_cells - (n_empty_cells % agent.empty_cells_group) <= (env.grid == 0).sum() < n_empty_cells - (n_empty_cells % agent.empty_cells_group) + agent.empty_cells_group:
          state = env.grid.unsqueeze(0).clone()
          legal_actions = env.get_legal_actions()
          action = agent.take_action(state, legal_actions)
          next_state, reward, completed, win, next_legal_actions = env.step(action)
          agent.save_experience(state.squeeze(0), legal_actions, action, reward, next_state, next_legal_actions, completed)
          loss = agent.learn()
          if loss != None:
            trial_loss += loss
            n_trial_loss_sum += 1
          legal_actions = legal_actions.reshape(-1, 9)
          one_valid_action_per_cell = (legal_actions.sum(dim=1) == 1).sum().item()
          empty_cells = (state == 0).sum().item()  
          wins += win
          mean_reward += reward
          if completed:
            break
        mean_loss += trial_loss/(n_trial_loss_sum + 1e-15)
        env.reset()
      mean_loss /= sudoku_variants
      mean_reward /= sudoku_variants
      loss_log[n_empty_cells].append(mean_loss)
      writer.add_scalar('loss', mean_loss)
      writer.add_scalar('wins', wins)
      writer.add_scalar('easy_wins', easy_wins)
      writer.add_scalar('reward', mean_reward)
      writer.add_scalar('n_empty_cells', n_empty_cells)
      # print(f"[Episode {episode + 1}/{n_episodes}][{n_empty_cells}] Loss: {mean_loss:.15f} \t Reward: {mean_reward:.1f} \t Wins: {wins}/81 \t Epsilon: {agent.epsilon} \t Easy wins: {easy_wins}")


In [None]:
writer.flush()
writer.close()

In [None]:
import torch
torch.save(agent.behaviour_net.state_dict(), 'moe-sparse_curriculum_learning_9x9_30.pt')

# Testing

In [None]:
agent = MoEAgent(
  n_actions=n_actions,
  n_values=n_values,
  input_dim=input_dim,
  train_mode=True,
  device=device
)

In [None]:
import torch
state_dict = torch.load("moe-sparse_curriculum_learning_9x9_30.pt")
agent.behaviour_net.load_state_dict(state_dict)
agent.target_net.load_state_dict(state_dict)

In [165]:
agent.disable_train_mode()

In [171]:
from IPython.display import clear_output

wins = 0
easy_wins = 0
for i in range(100):
  env.create_new()
  env.add_remove_cells(15)
  done = False
  i = 0
  while not done:
    state = env.grid.unsqueeze(0).clone()
    legal_actions = env.get_legal_actions()
    action = agent.take_action(state, legal_actions)
    next_state, reward, done, win, next_legal_actions = env.step(action)
    legal_actions = legal_actions.reshape(-1, 9)
    one_valid_action_per_cell = (legal_actions.sum(dim=1) == 1).sum().item()
    empty_cells = (state == 0).sum().item()
    if i == 0 and one_valid_action_per_cell == empty_cells:
      easy_wins += 1
      wins += 1
      break
    i += 1
  wins += win
  
print(f"Rateo: {wins}/100")
print(f"Easy wins: {easy_wins}/100")

Rateo: 18/100
Easy wins: 0/100


In [None]:
agent.train_mode