In [1]:
import random
from collections import deque

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gym import spaces

In [2]:
class LudoEnv(gym.Env):
    """Simplified two-player Ludo race exposed through the Gym ``Env`` interface.

    Each player owns ``num_pieces`` tokens that all start at position 0 and
    advance along a track of ``track_length`` squares. On every step the
    current player selects one token, a six-sided die is rolled, and that
    token moves forward by the roll (clamped at the end of the track). The
    turn passes to the other player after every step. Tokens of different
    players never interact.

    Observation: int32 vector of length ``2 * num_pieces`` holding the
    positions of the player to move followed by the opponent's positions.
    Action: index (``0 .. num_pieces - 1``) of the token to move.

    NOTE(review): ``reset`` returns a bare observation, while ``step``
    returns a five-element tuple ``(obs, reward, done, False, info)``.
    Confirm which Gym API version the training loop expects.
    """

    # Reward for moving a token without reaching the end of the track.
    MOVE_REWARD = 1
    # Reward for bringing a single token to the end of the track.
    PIECE_HOME_REWARD = 50
    # Reward for bringing the last remaining token home (ends the episode).
    WIN_REWARD = 100

    def __init__(self, num_pieces=4, track_length=57):
        """Create the environment and put it in its initial state.

        Args:
            num_pieces: Number of tokens per player (default 4).
            track_length: Position at which a token counts as finished
                (default 57).
        """
        super(LudoEnv, self).__init__()
        self.num_pieces = num_pieces
        self.track_length = track_length
        self.action_space = spaces.Discrete(self.num_pieces)
        # Shape is derived from num_pieces so that it always matches what
        # _get_obs() produces (own tokens + opponent tokens).
        self.observation_space = spaces.Box(
            low=0,
            high=self.track_length,
            shape=(2 * self.num_pieces,),
            dtype=np.int32,
        )
        self.reset()

    def reset(self):
        """Move every token back to position 0 and give the turn to player 1.

        Returns:
            The initial observation (see ``_get_obs``).
        """
        self.positions = {
            1: np.zeros(self.num_pieces, dtype=np.int32),
            2: np.zeros(self.num_pieces, dtype=np.int32)
        }
        self.done = False
        self.current_player = 1
        return self._get_obs()

    def _get_obs(self):
        """Return positions of the player to move followed by the opponent's."""
        current = self.positions[self.current_player]
        opponent = self.positions[3 - self.current_player]  # 1 <-> 2
        return np.concatenate([current, opponent])

    def step(self, action):
        """Roll the die and advance the selected token of the current player.

        Args:
            action: Index of the token to move, ``0 <= action < num_pieces``.

        Returns:
            Tuple ``(obs, reward, done, False, info)``. ``obs`` is taken
            *after* the turn has passed to the other player. ``reward`` is
            0 if the chosen token was already finished (the turn is still
            consumed), ``MOVE_REWARD`` for an ordinary move,
            ``PIECE_HOME_REWARD`` when the token reaches the end of the
            track, and ``WIN_REWARD`` when all of the player's tokens are
            finished. ``info`` is always an empty dict.

        Raises:
            IndexError: If ``action`` is not a valid token index.
        """
        if self.done:
            return self._get_obs(), 0, True, False, {}

        # Reject negative indices explicitly: numpy would silently wrap them
        # around and move a different token than the caller asked for.
        if not 0 <= action < self.num_pieces:
            raise IndexError(
                f"action must be an index in [0, {self.num_pieces}), got {action!r}"
            )

        dice = np.random.randint(1, 7)  # upper bound is exclusive: 1..6
        reward = 0
        player = self.current_player

        if self.positions[player][action] < self.track_length:
            self.positions[player][action] += dice
            if self.positions[player][action] >= self.track_length:
                # Overshooting is allowed; clamp to the final square.
                self.positions[player][action] = self.track_length
                reward = self.PIECE_HOME_REWARD
            else:
                reward = self.MOVE_REWARD

        if np.all(self.positions[player] >= self.track_length):
            self.done = True
            reward = self.WIN_REWARD  # overrides the per-token reward

        self.current_player = 3 - self.current_player
        return self._get_obs(), reward, self.done, False, {}


  and should_run_async(code)


In [3]:
class DQN(nn.Module):
    """Three-layer fully connected Q-network.

    Maps a state vector of ``state_size`` features through two hidden
    layers of 256 units (ReLU-activated) to ``action_size`` outputs, one
    Q-value per action.
    """

    def __init__(self, state_size, action_size):
        """Build the input, hidden and output linear layers."""
        super().__init__()
        hidden_units = 256
        # Input -> hidden
        self.fc1 = nn.Linear(state_size, hidden_units)
        # Hidden -> hidden
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        # Hidden -> one Q-value per action
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, state):
        """Compute Q-values for ``state``; no activation on the output layer."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            # Each hidden layer is followed by a ReLU non-linearity.
            hidden = torch.relu(layer(hidden))
        q_values = self.fc3(hidden)
        return q_values

In [4]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions for experience replay.

    Transitions are kept in a ``collections.deque`` with ``maxlen=capacity``,
    so once the buffer is full each new transition evicts the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer that holds at most ``capacity`` transitions."""
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Args:
            batch_size: Number of transitions to draw.

        Returns:
            A tuple ``(states, actions, rewards, next_states, dones)`` in
            which each element is a tuple of length ``batch_size``. The
            result is fully materialized, so it can be unpacked, indexed
            or iterated more than once.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (or is negative).
        """
        if batch_size > len(self.buffer):
            raise ValueError(
                f"cannot sample {batch_size} transitions from a buffer "
                f"holding {len(self.buffer)}"
            )
        transitions = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into per-field tuples.
        return tuple(zip(*transitions))

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)
