In [2]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0


In [3]:
import gymnasium as gym
import numpy as np
from gymnasium import spaces
from collections import defaultdict
from tqdm import tqdm
import time

class SokobanGame(gym.Env):
    def __init__(self):
        super(SokobanGame, self).__init__()

        self.height = 6
        self.width = 7

        self.action_space = spaces.Discrete(4)  # UP, DOWN, LEFT, RIGHT
        self.observation_space = spaces.Box(
            low=0, high=5, shape=(self.height, self.width), dtype=np.uint8
        )

        self.WALL = 0
        self.FLOOR = 1
        self.BOX = 2
        self.STORAGE = 3
        self.PLAYER = 4
        self.BOX_ON_STORAGE = 5

        self.actions = {
            0: (-1, 0),  # UP
            1: (1, 0),  # DOWN
            2: (0, -1),  # LEFT
            3: (0, 1),  # RIGHT
        }

        self.reset()

    def reset(self):
        self.grid = np.zeros((self.height, self.width), dtype=int)
        self.grid[1:-1, 1:-1] = self.FLOOR

        # Ensure there's at least one clear path
        self.grid[1:-1, 2] = self.FLOOR
        self.grid[2, 1:-1] = self.FLOOR

        # Randomly place player
        player_positions = list(zip(*np.where(self.grid == self.FLOOR)))
        self.player_pos = np.array(
            player_positions[np.random.choice(len(player_positions))]
        )
        self.grid[self.player_pos[0], self.player_pos[1]] = self.PLAYER

        # Randomly place box
        available_positions = list(zip(*np.where(self.grid == self.FLOOR)))
        self.box_pos = np.array(
            available_positions[np.random.choice(len(available_positions))]
        )
        self.grid[self.box_pos[0], self.box_pos[1]] = self.BOX

        # Randomly place storage
        available_positions = list(zip(*np.where(self.grid == self.FLOOR)))
        self.storage_pos = np.array(
            available_positions[np.random.choice(len(available_positions))]
        )
        self.grid[self.storage_pos[0], self.storage_pos[1]] = self.STORAGE

        return self.grid.copy(), {}

    def step(self, action):
        move = self.actions[action]
        new_pos = self.player_pos + move

        if self.grid[new_pos[0], new_pos[1]] == self.WALL:
            return self.grid.copy(), -1, False, False, {}

        if np.array_equal(new_pos, self.box_pos):
            new_box_pos = self.box_pos + move
            if self.grid[new_box_pos[0], new_box_pos[1]] in [self.WALL, self.BOX]:
                return self.grid.copy(), -1, False, False, {}
            self.grid[self.box_pos[0], self.box_pos[1]] = self.FLOOR
            self.box_pos = new_box_pos
            self.grid[new_box_pos[0], new_box_pos[1]] = (
                self.BOX
                if not np.array_equal(new_box_pos, self.storage_pos)
                else self.BOX_ON_STORAGE
            )

        self.grid[self.player_pos[0], self.player_pos[1]] = self.FLOOR
        self.player_pos = new_pos
        self.grid[new_pos[0], new_pos[1]] = self.PLAYER

        done = np.array_equal(self.box_pos, self.storage_pos)
        reward = 10 if done else -1

        return self.grid.copy(), reward, done, False, {}

    def get_current_state(self):
        return (tuple(self.player_pos), tuple(self.box_pos), tuple(self.storage_pos))

class DynamicProgramming:
    def __init__(self, environment):
        self.env = environment
        self.all_states = self._generate_state_space()
        self.state_index_map = {state: idx for idx, state in enumerate(self.all_states)}
        self.total_states = len(self.all_states)
        self.total_actions = environment.action_space.n
        self.value_function = np.zeros(self.total_states)
        self.policy = np.zeros(self.total_states, dtype=int)

    def _generate_state_space(self):
        state_space = []
        for player_pos in [(i, j) for i in range(self.env.height) for j in range(self.env.width)]:
            for box_pos in [(i, j) for i in range(self.env.height) for j in range(self.env.width)]:
                for storage_pos in [(i, j) for i in range(self.env.height) for j in range(self.env.width)]:
                    if (player_pos != box_pos and box_pos != storage_pos and player_pos != storage_pos):
                        state_space.append((player_pos, box_pos, storage_pos))
        return state_space

    def perform_value_iteration(self, gamma=0.9, convergence_threshold=1e-6, max_iter=1000):
        for _ in tqdm(range(max_iter), desc="Value Iteration"):
            max_diff = 0
            for idx, state in enumerate(self.all_states):
                old_value = self.value_function[idx]
                action_values = []
                for action in range(self.total_actions):
                    next_state, reward, done = self._calculate_next_state(state, action)
                    if next_state in self.state_index_map:
                        next_state_idx = self.state_index_map[next_state]
                        action_values.append(
                            reward + gamma * self.value_function[next_state_idx] * (not done)
                        )
                    else:
                        action_values.append(reward)
                self.value_function[idx] = max(action_values)
                max_diff = max(max_diff, abs(old_value - self.value_function[idx]))
            if max_diff < convergence_threshold:
                break

        self._update_policy(gamma)
        return self.policy

    def _calculate_next_state(self, state, action):
        self.env.reset()
        self.env.player_pos, self.env.box_pos, self.env.storage_pos = state
        self.env.grid[self.env.player_pos[0], self.env.player_pos[1]] = self.env.PLAYER
        self.env.grid[self.env.box_pos[0], self.env.box_pos[1]] = self.env.BOX
        self.env.grid[self.env.storage_pos[0], self.env.storage_pos[1]] = self.env.STORAGE

        next_state_obs, reward, done, _, _ = self.env.step(action)
        next_state = self.env.get_current_state()

        return next_state, reward, done

    def _update_policy(self, gamma):
        for idx, state in enumerate(self.all_states):
            action_values = []
            for action in range(self.total_actions):
                next_state, reward, done = self._calculate_next_state(state, action)
                if next_state in self.state_index_map:
                    next_state_idx = self.state_index_map[next_state]
                    action_values.append(
                        reward + gamma * self.value_function[next_state_idx] * (not done)
                    )
                else:
                    action_values.append(reward)
            self.policy[idx] = np.argmax(action_values)

    def select_action(self, state):
        if state in self.state_index_map:
            return self.policy[self.state_index_map[state]]
        return np.random.randint(self.total_actions)

class MonteCarlo:
    def __init__(self, environment):
        self.env = environment
        self.q_values = defaultdict(lambda: np.zeros(environment.action_space.n))
        self.state_action_returns = defaultdict(lambda: defaultdict(list))
        self.policy = {}
        self.initial_epsilon = 1.0
        self.min_epsilon = 0.01
        self.epsilon_decay_rate = 0.95
        self.discount_factor = 0.9  # Gamma (discount factor)

    def epsilon_greedy(self, state, epsilon):
        if np.random.random() < epsilon:
            return self.env.action_space.sample()
        else:
            return np.argmax(self.q_values[state]) if state in self.q_values else self.env.action_space.sample()

    def create_episode(self, epsilon):
        episode = []
        state, _ = self.env.reset()
        done = False
        step_count = 0
        while not done and step_count < 100:  # Limiting episode length
            state = self.env.get_current_state()
            action = self.epsilon_greedy(state, epsilon)
            next_state, reward, done, _, _ = self.env.step(action)
            episode.append((state, action, reward))
            step_count += 1
        return episode

    def learn(self, num_episodes=10000):
        epsilon = self.initial_epsilon
        progress_bar = tqdm(range(num_episodes), desc="Monte Carlo Training")
        for _ in progress_bar:
            episode = self.create_episode(epsilon)
            G = 0
            for t in range(len(episode) - 1, -1, -1):
                state, action, reward = episode[t]
                G = self.discount_factor * G + reward
                if (state, action) not in [(x[0], x[1]) for x in episode[:t]]:
                    self.state_action_returns[state][action].append(G)
                    self.q_values[state][action] = np.mean(self.state_action_returns[state][action])
                    self.policy[state] = np.argmax(self.q_values[state])

            # Decay epsilon
            epsilon = max(self.min_epsilon, epsilon * self.epsilon_decay_rate)

            # Update progress bar
            progress_bar.set_postfix({"epsilon": f"{epsilon:.4f}"})

        return self.policy

    def select_action(self, state):
        return self.policy.get(state, self.env.action_space.sample())

def run_episode(env, policy, save_frames=False):
    state, _ = env.reset()
    total_reward = 0
    done = False
    steps = 0
    frames = []
    while not done and steps < 100:
        action = policy.select_action(env.get_current_state())
        state, reward, done, _, _ = env.step(action)
        total_reward += reward
        steps += 1
        if save_frames:
            frames.append(state)
    return total_reward, steps, frames

def evaluate_policy(env, policy, num_episodes=100):
    total_rewards = []
    total_steps = []
    for _ in range(num_episodes):
        reward, steps, _ = run_episode(env, policy)
        total_rewards.append(reward)
        total_steps.append(steps)
    return np.mean(total_rewards), np.mean(total_steps)

def main():
    env = SokobanGame()

    print("Initializing and training DP solver... \n progress bar might not move, let it run!")
    start_time = time.time()
    dp_solver = DynamicProgramming(env)
    dp_solver.perform_value_iteration()
    dp_training_time = time.time() - start_time
    print(f"DP training completed in {dp_training_time:.2f} seconds")

    print("\nEvaluating DP policy...")
    dp_avg_reward, dp_avg_steps = evaluate_policy(env, dp_solver)
    print(f"DP Average Reward: {dp_avg_reward:.2f}")
    print(f"DP Average Steps: {dp_avg_steps:.2f}")

    print("\nInitializing and training MC solver...")
    start_time = time.time()
    mc_solver = MonteCarlo(env)
    mc_solver.learn(num_episodes=10000)
    mc_training_time = time.time() - start_time
    print(f"MC training completed in {mc_training_time:.2f} seconds")

    print("\nEvaluating MC policy...")
    mc_avg_reward, mc_avg_steps = evaluate_policy(env, mc_solver)
    print(f"MC Average Reward: {mc_avg_reward:.2f}")
    print(f"MC Average Steps: {mc_avg_steps:.2f}")

    print("\nComparing:")
    print(f"Training Time - DP: {dp_training_time:.2f}s, MC: {mc_training_time:.2f}s")
    print(f"Average Reward - DP: {dp_avg_reward:.2f}, MC: {mc_avg_reward:.2f}")
    print(f"Average Steps - DP: {dp_avg_steps:.2f}, MC: {mc_avg_steps:.2f}")

if __name__ == "__main__":
    main()

Initializing and training DP solver... 
 progress bar might not move, let it run!


Value Iteration:   0%|          | 1/1000 [01:15<20:57:39, 75.53s/it]


DP training completed in 106.92 seconds

Evaluating DP policy...
DP Average Reward: -100.00
DP Average Steps: 100.00

Initializing and training MC solver...


Monte Carlo Training: 100%|██████████| 10000/10000 [01:22<00:00, 121.59it/s, epsilon=0.0100]


MC training completed in 82.26 seconds

Evaluating MC policy...
MC Average Reward: -84.22
MC Average Steps: 85.87

Comparing:
Training Time - DP: 106.92s, MC: 82.26s
Average Reward - DP: -100.00, MC: -84.22
Average Steps - DP: 100.00, MC: 85.87
