In [63]:
!pip install numpy



In [64]:
!pip install tqdm



In [65]:
import numpy as np
from tqdm import tqdm


class DPAgent:
    """
    Agent that uses Dynamic Programming for policy computation.
    """

    def __init__(self, environment):
        self.environment = environment
        self.all_states = self._enumerate_all_states()
        self.state_map = {state: idx for idx, state in enumerate(self.all_states)}
        self.num_states = len(self.all_states)
        self.num_actions = environment.action_space.n
        self.value_function = np.zeros(self.num_states)
        self.policy = np.zeros(self.num_states, dtype=int)

    def _enumerate_all_states(self):
        """
        Generate all possible states for the Sokoban puzzle.
        """
        states = []
        positions = [
            (i, j) for i in range(self.environment.grid_height) for j in range(self.environment.grid_width)
        ]
        for player_pos in positions:
            for box_pos in positions:
                for target_pos in positions:
                    if (
                        player_pos != box_pos
                        and box_pos != target_pos
                        and player_pos != target_pos
                    ):
                        states.append((player_pos, box_pos, target_pos))
        return states

    def value_iteration(self, gamma=0.99, epsilon=1e-6, max_iterations=1000):
        """
        Perform value iteration to compute the optimal policy.
        """
        for _ in tqdm(range(max_iterations), desc="Value Iteration Progress"):
            delta = 0
            for idx, state in enumerate(self.all_states):
                v = self.value_function[idx]
                action_values = []
                for action in range(self.num_actions):
                    new_state, reward, terminal = self._simulate_step(state, action)
                    next_idx = self.state_map.get(new_state, -1)
                    if next_idx >= 0:
                        action_value = reward + gamma * self.value_function[next_idx] * (not terminal)
                    else:
                        action_value = reward
                    action_values.append(action_value)
                self.value_function[idx] = max(action_values)
                delta = max(delta, abs(v - self.value_function[idx]))
            if delta < epsilon:
                break
        self._derive_policy(gamma)

    def _simulate_step(self, state, action):
        """
        Simulate an environment step for the given state and action.
        """
        self.environment.reset()
        self.environment.set_state(state)
        _, reward, terminal, _ = self.environment.step(action)
        return self.environment.get_state(), reward, terminal

    def _derive_policy(self, gamma):
        """
        Extract the optimal policy from the computed value function.
        """
        for idx, state in enumerate(self.all_states):
            action_values = []
            for action in range(self.num_actions):
                new_state, reward, terminal = self._simulate_step(state, action)
                next_idx = self.state_map.get(new_state, -1)
                if next_idx >= 0:
                    action_value = reward + gamma * self.value_function[next_idx] * (not terminal)
                else:
                    action_value = reward
                action_values.append(action_value)
            self.policy[idx] = np.argmax(action_values)

    def select_action(self, state):
        """
        Select an action based on the computed policy.
        """
        idx = self.state_map.get(state, -1)
        if idx >= 0:
            return self.policy[idx]
        return self.environment.action_space.sample()


In [66]:
class MCAgent:
    """
    Agent that uses Monte Carlo methods for policy computation.
    """

    def __init__(self, environment):
        self.environment = environment
        self.q_values = defaultdict(lambda: np.zeros(environment.action_space.n))
        self.returns = defaultdict(lambda: defaultdict(list))
        self.action_policy = {}
        self.initial_epsilon = 1.0
        self.minimum_epsilon = 0.01
        self.epsilon_decay_rate = 0.995
        self.gamma = 0.99

    def _epsilon_greedy(self, state, epsilon):
        """
        Epsilon-greedy action selection.
        """
        if np.random.rand() < epsilon:
            return self.environment.action_space.sample()
        else:
            return np.argmax(self.q_values[state])

    def generate_episode(self, epsilon):
        """
        Generate an episode using the current policy.
        """
        episode = []
        self.environment.reset()
        terminal = False
        steps = 0
        while not terminal and steps < 100:
            state = self.environment.get_state()
            action = self._epsilon_greedy(state, epsilon)
            _, reward, terminal, _ = self.environment.step(action)
            episode.append((state, action, reward))
            steps += 1
        return episode

    def learn_policy(self, episode_count=1000):
        """
        Train the agent using Monte Carlo methods.
        """
        epsilon = self.initial_epsilon
        for _ in tqdm(range(episode_count), desc="Monte Carlo Training Progress"):
            episode = self.generate_episode(epsilon)
            G = 0
            visited_state_actions = set()
            for state, action, reward in reversed(episode):
                G = self.gamma * G + reward
                if (state, action) not in visited_state_actions:
                    visited_state_actions.add((state, action))
                    self.returns[state][action].append(G)
                    self.q_values[state][action] = np.mean(self.returns[state][action])
                    self.action_policy[state] = np.argmax(self.q_values[state])
            epsilon = max(self.minimum_epsilon, epsilon * self.epsilon_decay_rate)

    def select_action(self, state):
        """
        Select action based on the learned policy.
        """
        return self.action_policy.get(state, self.environment.action_space.sample())


def simulate_episode(environment, agent, max_steps=100, render=False):
    """
    Simulate a single episode with the given agent.

    Args:
        environment: The Sokoban environment instance.
        agent: The agent to use for the simulation.
        max_steps (int): Maximum steps to simulate.
        render (bool): Whether to render the environment.

    Returns:
        tuple: (total reward, total steps)
    """
    state, _ = environment.reset()
    total_return_val = 0
    terminal = False
    steps = 0
    while not terminal and steps < max_steps:
        current_state = environment.get_state()
        action = agent.select_action(current_state)
        state, reward, terminal, _ = environment.step(action)
        total_return_val += reward
        steps += 1
        if render:
            environment.render()
    return total_return_val, steps


def agent_assess(environment, agent, episode_count=1000):
    """
    Assess the agent's performance over multiple episodes.

    Args:
        environment: The Sokoban environment instance.
        agent: The agent to be evaluated.
        episode_count (int): Number of episodes to evaluate.

    Returns:
        tuple: (average reward, average steps)
    """
    return_vals = []
    step_count_list = []
    for _ in range(episode_count):
        total_return_val, steps = simulate_episode(environment, agent)
        return_vals.append(total_return_val)
        step_count_list.append(steps)
    return np.mean(return_vals), np.mean(step_count_list)

In [67]:
!pip install gym



In [68]:
import numpy as np
import gym
from gym import spaces


class SokobanEnv(gym.Env):
    """
    Custom Sokoban environment for reinforcement learning.
    """

    def __init__(self):
        super(SokobanEnv, self).__init__()

        # Define the grid size
        self.grid_height = 6
        self.grid_width = 7

        # Define action and observation space
        # Actions: Up, Down, Left, Right
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Box(
            low=0, high=5, shape=(self.grid_height, self.grid_width), dtype=np.uint8
        )

        # Define grid elements
        self.EMPTY = 0
        self.WALL = 1
        self.BOX = 2
        self.TARGET = 3
        self.PLAYER = 4
        self.BOX_ON_TARGET = 5

        # Map actions to movements
        self.action_map = {
            0: (-1, 0),  # Up
            1: (1, 0),   # Down
            2: (0, -1),  # Left
            3: (0, 1),   # Right
        }

        self.reset()

    def reset(self):
        """
        Reset the environment to its initial state.
        """
        # Initialize the grid
        self.grid = np.zeros((self.grid_height, self.grid_width), dtype=int)
        self.grid[1:-1, 1:-1] = self.EMPTY  # Set inner area to empty

        # Add walls around the grid
        self.grid[0, :] = self.WALL
        self.grid[-1, :] = self.WALL
        self.grid[:, 0] = self.WALL
        self.grid[:, -1] = self.WALL

        # Place the player at a random empty position
        empty_positions = list(zip(*np.where(self.grid == self.EMPTY)))
        self.player_position = np.array(empty_positions[np.random.choice(len(empty_positions))])
        self.grid[self.player_position[0], self.player_position[1]] = self.PLAYER

        # Place the box at a random empty position
        empty_positions = list(zip(*np.where(self.grid == self.EMPTY)))
        self.box_position = np.array(empty_positions[np.random.choice(len(empty_positions))])
        self.grid[self.box_position[0], self.box_position[1]] = self.BOX

        # Place the target at a random empty position
        empty_positions = list(zip(*np.where(self.grid == self.EMPTY)))
        self.target_position = np.array(empty_positions[np.random.choice(len(empty_positions))])
        self.grid[self.target_position[0], self.target_position[1]] = self.TARGET

        return self.grid.copy(), {}

    def step(self, action):
        """
        Perform an action in the environment.
        Args:
            action (int): The action to take (0: Up, 1: Down, 2: Left, 3: Right).
        Returns:
            tuple: (observation, reward, done, info)
        """
        move = self.action_map[action]
        new_player_pos = self.player_position + move

        # **Bounds Check for new_player_pos**
        if (
            new_player_pos[0] < 0 or new_player_pos[0] >= self.grid_height or
            new_player_pos[1] < 0 or new_player_pos[1] >= self.grid_width
        ):
            return self.grid.copy(), -1, False, {}  # Invalid move: out of bounds

        # Check for wall collision
        if self.grid[new_player_pos[0], new_player_pos[1]] == self.WALL:
            return self.grid.copy(), -1, False, {}  # Invalid move: collision with wall

        # Check if the player is pushing the box
        if np.array_equal(new_player_pos, self.box_position):
            new_box_pos = self.box_position + move

            # Bounds Check for new_box_pos
            if (
                new_box_pos[0] < 0 or new_box_pos[0] >= self.grid_height or
                new_box_pos[1] < 0 or new_box_pos[1] >= self.grid_width or
                self.grid[new_box_pos[0], new_box_pos[1]] in [self.WALL, self.BOX]
            ):
                return self.grid.copy(), -1, False, {}  # Invalid move: collision or out of bounds

            # Move the box
            self.grid[self.box_position[0], self.box_position[1]] = self.EMPTY
            self.box_position = new_box_pos

            if np.array_equal(self.box_position, self.target_position):
                self.grid[self.box_position[0], self.box_position[1]] = self.BOX_ON_TARGET
            else:
                self.grid[self.box_position[0], self.box_position[1]] = self.BOX

        # Move the player
        self.grid[self.player_position[0], self.player_position[1]] = self.EMPTY
        self.player_position = new_player_pos
        self.grid[self.player_position[0], self.player_position[1]] = self.PLAYER

        # Check if the box is on the target
        terminal = np.array_equal(self.box_position, self.target_position)
        reward = 10 if terminal else -1  # Positive reward if solved, negative otherwise

        return self.grid.copy(), reward, terminal, {}





    def render(self, mode="human"):
        """
        Render the environment grid.
        """
        if mode == "human":
            symbols = {
                self.EMPTY: ' ',
                self.WALL: '#',
                self.BOX: '$',
                self.TARGET: '.',
                self.PLAYER: '@',
                self.BOX_ON_TARGET: '*'
            }
            print("\n".join("".join(symbols[cell] for cell in row) for row in self.grid))
        elif mode == "rgb_array":
            # Optional implementation for visualizing the environment
            pass

    def get_state(self):
        """
        Returns a tuple representing the current state.
        """
        return (tuple(self.player_position), tuple(self.box_position), tuple(self.target_position))

    def set_state(self, state):
        """
        Set the environment to a specific state.
        Args:
            state (tuple): A tuple of (player_position, box_position, target_position).
        """
        self.grid = np.zeros((self.grid_height, self.grid_width), dtype=int)
        self.grid[1:-1, 1:-1] = self.EMPTY  # Reset inner area to empty

        # Add walls around the grid
        self.grid[0, :] = self.WALL
        self.grid[-1, :] = self.WALL
        self.grid[:, 0] = self.WALL
        self.grid[:, -1] = self.WALL

        self.player_position = np.array(state[0])
        self.box_position = np.array(state[1])
        self.target_position = np.array(state[2])

        self.grid[self.player_position[0], self.player_position[1]] = self.PLAYER
        self.grid[self.box_position[0], self.box_position[1]] = self.BOX
        self.grid[self.target_position[0], self.target_position[1]] = self.TARGET



In [69]:
import time

def main():
    environment = SokobanEnv()

    print("Training Dynamic Programming Agent...")
    init_time = time.time()
    dp_agent = DPAgent(environment)
    dp_agent.value_iteration()
    dp_training_time = time.time() - init_time
    print(f"Dynamic Programming training completed in {dp_training_time:.2f} seconds")

    print("\nTraining Monte Carlo Agent...")
    init_time = time.time()
    mc_agent = MCAgent(environment)
    mc_agent.learn_policy(episode_count=1000)
    mc_training_time = time.time() - init_time
    print(f"Monte Carlo training completed in {mc_training_time:.2f} seconds")

    print("\nEvaluating Dynamic Programming Agent...")
    dp_avg_reward, dp_avg_steps = agent_assess(environment, dp_agent)
    print(f"Dynamic Programming - Average Reward: {dp_avg_reward:.2f}, Average Steps: {dp_avg_steps:.2f}")

    print("\nEvaluating Monte Carlo Agent...")
    mc_avg_reward, mc_avg_steps = agent_assess(environment, mc_agent)
    print(f"Monte Carlo - Average Reward: {mc_avg_reward:.2f}, Average Steps: {mc_avg_steps:.2f}")

    print("\nComparison:")
    print(f"Training Time - DP: {dp_training_time:.2f}s, MC: {mc_training_time:.2f}s")
    print(f"Average Reward - DP: {dp_avg_reward:.2f}, MC: {mc_avg_reward:.2f}")
    print(f"Average Steps - DP: {dp_avg_steps:.2f}, MC: {mc_avg_steps:.2f}")


if __name__ == "__main__":
    main()

Training Dynamic Programming Agent...


Value Iteration Progress: 100%|██████████| 1000/1000 [3:32:05<00:00, 12.73s/it] 


Dynamic Programming training completed in 12738.24 seconds

Training Monte Carlo Agent...


Monte Carlo Training Progress: 100%|██████████| 1000/1000 [00:00<00:00, 1139.71it/s]


Monte Carlo training completed in 0.88 seconds

Evaluating Dynamic Programming Agent...
Dynamic Programming - Average Reward: -59.34, Average Steps: 63.59

Evaluating Monte Carlo Agent...
Monte Carlo - Average Reward: -96.17, Average Steps: 96.59

Comparison:
Training Time - DP: 12738.24s, MC: 0.88s
Average Reward - DP: -59.34, MC: -96.17
Average Steps - DP: 63.59, MC: 96.59
