In [21]:
import numpy as np
from collections import defaultdict
import random

In [25]:
# Environment Setup
# Cell codes -- 0: empty, 1: wall, 2: box, 3: storage, 4: agent
grid = np.array([
    [1, 1, 1, 1, 1, 1],
    [1, 0, 4, 1, 1, 1],
    [1, 0, 0, 1, 1, 1],
    [1, 3, 0, 0, 0, 1],
    [1, 0, 0, 2, 0, 1],
    [1, 0, 0, 1, 1, 1],
    [1, 1, 1, 1, 1, 1],
])

# State is a tuple: (agent_position, box_positions)
# Positions are (row, col) indices into `grid`.
initial_agent_pos = (1, 2)
initial_box_pos = [(4, 3)]
storage_pos = [(3, 1)]

# Fail fast if the hand-written coordinates drift away from the grid layout.
assert grid[initial_agent_pos] == 4, "initial_agent_pos does not point at the agent cell"
assert all(grid[box] == 2 for box in initial_box_pos), "initial_box_pos does not match the grid"
assert all(grid[cell] == 3 for cell in storage_pos), "storage_pos does not match the grid"

# Constants
GRID_ROWS, GRID_COLS = grid.shape  # derived from the layout so they cannot disagree with it
ACTIONS = ['UP', 'DOWN', 'LEFT', 'RIGHT']
DISCOUNT_FACTOR = 0.9
THRESHOLD = 1e-4
EPISODES = 1000
SEED = 42

# Episodes are sampled with Python's `random` module; seed it so that a
# Restart & Run All reproduces the same estimates.
random.seed(SEED)

# Movement directions for each action, as (row_delta, col_delta)
action_to_delta = {
    'UP': (-1, 0),
    'DOWN': (1, 0),
    'LEFT': (0, -1),
    'RIGHT': (0, 1)
}

In [27]:
# Tabular estimates keyed by state. Looking up an unseen state creates one
# entry per action: 0.0 in Q, and a fresh empty sample list in returns.
Q = defaultdict(lambda: dict.fromkeys(ACTIONS, 0.0))
returns = defaultdict(lambda: dict((name, []) for name in ACTIONS))

def generate_episode(grid, agent_pos, box_pos, storage_pos, max_steps=200):
    """Roll out one episode under a uniformly random policy.

    Starting from ``(agent_pos, tuple(box_pos))``, an action is drawn with
    ``random.choice(ACTIONS)`` and applied with ``step`` until ``is_terminal``
    reports success or ``max_steps`` transitions have been recorded.

    The cap matters because ``step`` leaves the state unchanged whenever a
    push is blocked: a box shoved into a corner can never reach storage, so
    an uncapped random walk would loop (and grow ``episode``) forever.
    With the notebook's discount of 0.9, anything more than 200 steps away
    is scaled by less than 1e-9.

    Args:
        grid: Layout array passed through to ``step``.
        agent_pos: Starting (row, col) of the agent.
        box_pos: Iterable of starting (row, col) box positions.
        storage_pos: Collection of storage (row, col) positions.
        max_steps: Maximum number of transitions to record. Pass ``None``
            to disable the cap and run until a terminal state is reached.

    Returns:
        A list of ``(state, action, reward)`` tuples in the order they
        occurred; empty if the start state is already terminal.
    """
    episode = []
    state = (agent_pos, tuple(box_pos))
    while not is_terminal(state, storage_pos):
        if max_steps is not None and len(episode) >= max_steps:
            break  # truncated: the goal was not reached within the step budget
        action = random.choice(ACTIONS)
        next_state, reward = step(state, action, grid, storage_pos)
        episode.append((state, action, reward))
        state = next_state
    return episode

# Monte Carlo First-Visit

In [None]:
def first_visit_mc(grid, agent_pos, box_pos, storage_pos):
    """Estimate action values with first-visit Monte Carlo.

    Runs ``EPISODES`` episodes through ``generate_episode``. For each
    (state, action) pair that occurs in an episode, the discounted return
    following its earliest occurrence is appended to the module-level
    ``returns`` table, and ``Q[state][action]`` is updated to the mean of
    the samples collected so far.

    Args:
        grid, agent_pos, box_pos, storage_pos: Forwarded unchanged to
            ``generate_episode``.

    Returns:
        None. ``Q`` and ``returns`` are updated in place.
    """
    for _ in range(EPISODES):
        episode = generate_episode(grid, agent_pos, box_pos, storage_pos)

        # Earliest time step at which each (state, action) pair occurs.
        # Returns are accumulated back-to-front, so "already seen during the
        # backward scan" would pick the LAST occurrence instead.
        first_visit_index = {}
        for t, (state, action, _reward) in enumerate(episode):
            first_visit_index.setdefault((state, action), t)

        G = 0.0
        for t, (state, action, reward) in reversed(list(enumerate(episode))):
            G = reward + DISCOUNT_FACTOR * G
            if first_visit_index[(state, action)] != t:
                continue  # a later repeat of this pair; only the first visit counts
            samples = returns[state][action]
            samples.append(G)
            # Running mean: equals the average of `samples` as long as Q and
            # returns are only ever updated together, without re-summing the list.
            Q[state][action] += (G - Q[state][action]) / len(samples)

def step(state, action, grid, storage_pos):
    """Apply one action to a state and return ``(next_state, reward)``.

    Note: a later cell of this notebook defines ``step`` again; once the
    cells have run in order, that later definition is the one in effect.

    Args:
        state: ``(agent_pos, box_positions)`` where ``agent_pos`` is a
            (row, col) tuple and ``box_positions`` a tuple of such tuples.
        action: Key into ``action_to_delta``.
        grid: 2-D array indexed as ``grid[row, col]``; cells equal to 1 are walls.
        storage_pos: Collection of storage (row, col) positions.

    Returns:
        ``(state, -1)`` unchanged when the agent would enter a wall or leave
        the grid, or when the box it pushes would enter a wall, leave the
        grid, or hit another box. Otherwise ``(new_state, 1)`` if every box
        is on a storage position after the move, else ``(new_state, 0)``.
    """
    agent_pos, box_positions = state
    d_row, d_col = action_to_delta[action]
    n_rows, n_cols = grid.shape

    def blocked(cell):
        """Return True when `cell` is outside the grid or holds a wall."""
        row, col = cell
        # Explicit bounds test: a negative index would otherwise wrap round
        # to the opposite edge of the array, and a too-large one would raise.
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            return True
        return grid[row, col] == 1

    # Calculate new position for the agent
    new_agent_pos = (agent_pos[0] + d_row, agent_pos[1] + d_col)
    if blocked(new_agent_pos):
        return state, -1  # Invalid move (into a wall or off the grid)

    new_box_positions = list(box_positions)
    if new_agent_pos in box_positions:
        # The agent walks into a box: the box is pushed one cell further on.
        new_box_pos = (new_agent_pos[0] + d_row, new_agent_pos[1] + d_col)
        if blocked(new_box_pos) or new_box_pos in box_positions:
            return state, -1  # Invalid move (box cannot move)
        new_box_positions[box_positions.index(new_agent_pos)] = new_box_pos

    next_state = (new_agent_pos, tuple(new_box_positions))
    # Reward 1 only when all boxes sit on storage after the move.
    solved = all(box in storage_pos for box in new_box_positions)
    return next_state, (1 if solved else 0)

def is_terminal(state, storage_pos):
    """Return True when every box in `state` lies on a storage position.

    `state` is unpacked as ``(agent_pos, box_positions)``; the agent
    position is ignored. A state with no boxes counts as terminal.
    """
    _agent, boxes = state
    for box in boxes:
        if box not in storage_pos:
            return False
    return True

# Run first-visit Monte Carlo from the initial layout; results land in Q / returns
first_visit_mc(
    grid=grid,
    agent_pos=initial_agent_pos,
    box_pos=initial_box_pos,
    storage_pos=storage_pos,
)

# Monte Carlo Every-Visit

In [None]:
def every_visit_mc(grid, agent_pos, box_pos, storage_pos):
    """Estimate action values with every-visit Monte Carlo.

    Runs ``EPISODES`` episodes through ``generate_episode``. The discounted
    return following every occurrence of a (state, action) pair is appended
    to the module-level ``returns`` table, and ``Q[state][action]`` is
    updated to the mean of the samples collected so far.

    Args:
        grid, agent_pos, box_pos, storage_pos: Forwarded unchanged to
            ``generate_episode``.

    Returns:
        None. ``Q`` and ``returns`` are updated in place.
    """
    for _ in range(EPISODES):
        episode = generate_episode(grid, agent_pos, box_pos, storage_pos)
        G = 0.0

        # Walk the episode backwards so each return is built from the one after it.
        for state, action, reward in reversed(episode):
            G = reward + DISCOUNT_FACTOR * G
            samples = returns[state][action]
            samples.append(G)
            # Running mean: equals the average of `samples` as long as Q and
            # returns are only ever updated together, without re-summing the list.
            Q[state][action] += (G - Q[state][action]) / len(samples)

def step(state, action, grid, storage_pos):
    """Apply one action to a state and return ``(next_state, reward)``.

    Note: ``step`` is also defined in the first-visit cell earlier in this
    notebook; when the cells run in order, this definition replaces it.

    Args:
        state: ``(agent_pos, box_positions)`` where ``agent_pos`` is a
            (row, col) tuple and ``box_positions`` a tuple of such tuples.
        action: Key into ``action_to_delta``.
        grid: 2-D array indexed as ``grid[row, col]``; cells equal to 1 are walls.
        storage_pos: Collection of storage (row, col) positions.

    Returns:
        ``(state, -1)`` unchanged when the agent would enter a wall or leave
        the grid, or when the box it pushes would enter a wall, leave the
        grid, or hit another box. Otherwise ``(new_state, 1)`` if every box
        is on a storage position after the move, else ``(new_state, 0)``.
    """
    agent_pos, box_positions = state
    d_row, d_col = action_to_delta[action]
    n_rows, n_cols = grid.shape

    def blocked(cell):
        """Return True when `cell` is outside the grid or holds a wall."""
        row, col = cell
        # Explicit bounds test: a negative index would otherwise wrap round
        # to the opposite edge of the array, and a too-large one would raise.
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            return True
        return grid[row, col] == 1

    # Calculate new position for the agent
    new_agent_pos = (agent_pos[0] + d_row, agent_pos[1] + d_col)
    if blocked(new_agent_pos):
        return state, -1  # Invalid move (into a wall or off the grid)

    new_box_positions = list(box_positions)
    if new_agent_pos in box_positions:
        # The agent walks into a box: the box is pushed one cell further on.
        new_box_pos = (new_agent_pos[0] + d_row, new_agent_pos[1] + d_col)
        if blocked(new_box_pos) or new_box_pos in box_positions:
            return state, -1  # Invalid move (box cannot move)
        new_box_positions[box_positions.index(new_agent_pos)] = new_box_pos

    next_state = (new_agent_pos, tuple(new_box_positions))
    # Reward 1 only when all boxes sit on storage after the move.
    solved = all(box in storage_pos for box in new_box_positions)
    return next_state, (1 if solved else 0)

def is_terminal(state, storage_pos):
    """Tell whether the puzzle is solved in `state`.

    Only the second element of `state` (the box positions) is inspected:
    the result is True exactly when no box is missing from `storage_pos`.
    """
    _agent, boxes = state
    misplaced = (box not in storage_pos for box in boxes)
    return not any(misplaced)

# Call Every-Visit Monte Carlo Control
# Q and returns are module-level tables that first_visit_mc also writes to.
# Empty them first so the every-visit averages are not blended with samples
# left over from an earlier run; copy Q beforehand if those values are still needed.
Q.clear()
returns.clear()
every_visit_mc(grid, initial_agent_pos, initial_box_pos, storage_pos)