# SODOKU ReinforceLearning :3

In [1]:
import numpy as np

In [2]:
import random

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Se está utilizando el dispositivo:', device)

Se está utilizando el dispositivo: cuda


#### Procesamiento:

In [5]:
# def encode_board(board):
#     encoded_board = np.zeros((9, 9, 10))
#     # encoded_board = np.zeros((1, 9, 9))
#     for i in range(9):
#         for j in range(9):
#             if board[i, j] != 0:
#                 encoded_board[i, j, board[i, j] - 1] = 1
#             else:
#                 encoded_board[i, j, 9] = 1
#     return encoded_board

In [6]:
def encode_board(board):
    encoded_board = np.zeros((1, 9, 9, 10))  # Agregamos una dimensión adicional al principio
    for i in range(9):
        for j in range(9):
            if board[i, j] != 0:
                encoded_board[0, i, j, board[i, j] - 1] = 1  # Indexamos con la dimensión adicional
            else:
                encoded_board[0, i, j, 9] = 1
    # return encoded_board
    return encoded_board[np.newaxis, :]

#### RED NEURONAL

In [7]:
# Pueden ser otras estructuras, peor por primer ejemplo tomamos esta :3
class SudokuDQN(nn.Module):
    def __init__(self):
        super(SudokuDQN, self).__init__()
        self.conv1 = nn.Conv3d(1, 64, kernel_size=(9, 3, 3), padding=(0, 1, 1))
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv3d(64, 64, kernel_size=(1, 3, 3), padding=(0, 1, 1))
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv3d(64, 64, kernel_size=(1, 3, 3), padding=(0, 1, 1))
        self.relu3 = nn.ReLU(inplace=True)
        self.fc = nn.Linear(64 * 9 * 10, 81)

    def forward(self, x):
        x = self.relu1(self.conv1(x))
        x = self.relu2(self.conv2(x))
        x = self.relu3(self.conv3(x))
        print('Shape conv x :', x.shape)
        
        x = x.view(x.size(0), -1)
        print('Shape view x :', x.shape)
        
        x = self.fc(x)
        print('Shape fc x :', x.shape)
        return x

#### Aprendizaje

In [8]:
def select_action(state, model, epsilon):
    if random.random() < epsilon:
        return random.randint(0, 8)  # acción aleatoria
    else:
        with torch.no_grad():
            q_values = model(state)
            return q_values.argmax().item()  # mejor acción según los valores Q

#### Entrenamiento

In [9]:
model = SudokuDQN().to(device)

In [10]:
from torchsummary import summary

encoded_board = encode_board(np.zeros((9, 9)))
input_dim = encoded_board.shape

summary(model, input_size=input_dim[1:], batch_size=-1)

Shape conv x : torch.Size([2, 64, 1, 9, 10])
Shape view x : torch.Size([2, 5760])
Shape fc x : torch.Size([2, 81])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv3d-1         [-1, 64, 1, 9, 10]           5,248
              ReLU-2         [-1, 64, 1, 9, 10]               0
            Conv3d-3         [-1, 64, 1, 9, 10]          36,928
              ReLU-4         [-1, 64, 1, 9, 10]               0
            Conv3d-5         [-1, 64, 1, 9, 10]          36,928
              ReLU-6         [-1, 64, 1, 9, 10]               0
            Linear-7                   [-1, 81]         466,641
Total params: 545,745
Trainable params: 545,745
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.26
Params size (MB): 2.08
Estimated Total Size (MB): 2.35
--------------------------------------------------------

In [11]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

#### Función de muestreo de acción basada en probabilidad

In [12]:
def sample_action(prob_dist):
    action = torch.multinomial(prob_dist, 1).item()
    return action

#### Entrenamiento con RL

In [13]:
from sudoku import Sudoku

def generate_sudoku(difficulty='easy'):
    if difficulty == 'easy':
        difficulty_value = 0.2
    elif difficulty == 'medium':
        difficulty_value = 0.5
    elif difficulty == 'hard':
        difficulty_value = 0.7
    else:
        raise ValueError("Invalid difficulty level")
    
    puzzle = Sudoku(difficulty=difficulty_value)
    board = puzzle.board
    return board

In [14]:
generate_sudoku()

[[None, None, None, None, None, None, None, None, 9],
 [None, None, None, None, None, None, 7, None, None],
 [None, 2, None, None, None, None, None, None, None],
 [None, None, None, None, None, None, None, 8, None],
 [None, None, None, 4, None, None, None, None, None],
 [None, None, None, None, None, 6, None, None, None],
 [None, None, None, None, 5, None, None, None, None],
 [1, None, None, None, None, None, None, None, None],
 [None, None, 3, None, None, None, None, None, None]]

In [15]:
num_episodes = 1000
gamma = 0.99  # factor de descuento
max_steps = 200

for episode in range(num_episodes):
    board = np.array(generate_sudoku()).astype('float32')
    board[np.isnan(board)] = 0
    board = board.astype('int')
    encoded_board = encode_board(board)
    encoded_board = torch.tensor(encoded_board, dtype=torch.float32).unsqueeze(0).to(device)  # Agrega la dimensión de batch y mueve el tensor al dispositivo
    print(encoded_board.shape)


    encoded_board = torch.tensor(encoded_board, dtype=torch.float32).unsqueeze(0).to(device)  # Agrega la dimensión de batch y mueve el tensor al dispositivo
    log_probs = []
    rewards = []

    # Jugar un episodio completo de Sudoku
    for t in range(max_steps):
        prob_dist = model(encoded_board).cpu().detach().squeeze(0)  # Obtiene la distribución de probabilidad de las acciones
        action = sample_action(prob_dist)  # Muestrea una acción de la distribución de probabilidad

        next_board, reward, done = step(board, action)  # Asume que la función `step()` aplica la acción en el tablero y devuelve el nuevo tablero, la recompensa y si el juego terminó
        rewards.append(reward)
        log_prob = torch.log(prob_dist[action])
        log_probs.append(log_prob)

        if done:
            break

        board = next_board
        encoded_board = encode_board(board)
        encoded_board = torch.tensor(encoded_board, dtype=torch.float32).unsqueeze(0).to(device)

    # Calcula la pérdida y actualiza los pesos del modelo
    G = 0
    loss = 0
    for t in reversed(range(len(rewards))):
        G = gamma * G + rewards[t]
        loss -= log_probs[t] * G

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (episode + 1) % 100 == 0:
        print(f'Episodio {episode + 1}, pérdida: {loss.item()}')

torch.Size([1, 1, 1, 9, 9, 10])


  encoded_board = torch.tensor(encoded_board, dtype=torch.float32).unsqueeze(0).to(device)  # Agrega la dimensión de batch y mueve el tensor al dispositivo


RuntimeError: Expected 4D (unbatched) or 5D (batched) input to conv3d, but got input of size: [1, 1, 1, 1, 9, 9, 10]