# Value Iteration

In [15]:
import numpy as np

# Grid dimensions: GRID_HEIGHT is the number of rows, GRID_WIDTH the number of columns.
GRID_WIDTH = 6
GRID_HEIGHT = 7
# Action names understood by move().
ACTIONS = ['UP', 'DOWN', 'LEFT', 'RIGHT']
# Reward for an ordinary agent move or box push.
REWARD_STEP = -1
# Reward for pushing the box onto storage_position.
REWARD_GOAL = 100
# Reward returned by transition() when a move or push is blocked; the state is left unchanged.
REWARD_TERMINAL = -100
# Discount factor applied to the successor state's value.
GAMMA = 0.9

# Map layout, indexed as grid[row][col].
#   1 = wall (the only code tested by is_valid()),
#   0 = free cell,
#   2 = sits at box_start, 3 = sits at storage_position (markers only; the code
#       below reads those positions from the tuples, not from the grid).
grid = np.array([
    [1, 1, 1, 1, 1, 1],
    [1, 0, 0, 1, 1, 1],
    [1, 0, 0, 1, 1, 1],
    [1, 3, 0, 0, 0, 1],
    [1, 0, 0, 2, 0, 1],
    [1, 0, 0, 1, 1, 1],
    [1, 1, 1, 1, 1, 1]
])

# All positions are (row, col) tuples.
agent_start = (1, 2)
box_start = (4, 3)
storage_position = (3, 1)

# State-value table, indexed as V[agent_row, agent_col, box_row, box_col].
V = np.zeros((GRID_HEIGHT, GRID_WIDTH, GRID_HEIGHT, GRID_WIDTH))

def is_valid(pos):
    """Return whether ``pos`` (row, col) lies inside the grid and is not a wall.

    A cell is a wall when its grid code equals 1.
    """
    row, col = pos
    if not (0 <= row < GRID_HEIGHT):
        return False
    if not (0 <= col < GRID_WIDTH):
        return False
    return grid[row][col] != 1

def move(pos, action):
    """Return the (row, col) reached by taking one step from ``pos``.

    Rows grow downward, so 'UP' decreases the row and 'DOWN' increases it.
    No bounds or wall checks are performed here.

    Args:
        pos: Current position as a (row, col) pair.
        action: One of 'UP', 'DOWN', 'LEFT', 'RIGHT'.

    Returns:
        The neighbouring position as a (row, col) tuple.

    Raises:
        ValueError: If ``action`` is not one of the four known actions.
    """
    offsets = {
        'UP': (-1, 0),
        'DOWN': (1, 0),
        'LEFT': (0, -1),
        'RIGHT': (0, 1),
    }
    try:
        d_row, d_col = offsets[action]
    except (KeyError, TypeError):
        # Fail loudly instead of silently returning None, which would only
        # surface later as an unrelated unpacking error in the caller.
        raise ValueError(f"Unknown action: {action!r}") from None
    x, y = pos
    return (x + d_row, y + d_col)

def is_terminal(box_pos):
    """Return whether the box sits on the storage position (episode finished)."""
    box_is_stored = box_pos == storage_position
    return box_is_stored

def transition(state, action):
    """Apply ``action`` to ``state`` and return ``(next_state, reward)``.

    ``state`` is ``(agent_pos, box_pos)``. Stepping into the box pushes it one
    cell in the same direction. A blocked step or push leaves the state
    unchanged and yields REWARD_TERMINAL; pushing the box onto
    storage_position yields REWARD_GOAL; any other move yields REWARD_STEP.
    """
    agent_pos, box_pos = state
    stepped_to = move(agent_pos, action)

    if stepped_to != box_pos:
        # Plain move: the agent walks into a cell that does not hold the box.
        if is_valid(stepped_to):
            return (stepped_to, box_pos), REWARD_STEP
        return state, REWARD_TERMINAL

    # The agent walks into the box, so the box is pushed along.
    pushed_to = move(box_pos, action)
    if not is_valid(pushed_to):
        return state, REWARD_TERMINAL
    if pushed_to == storage_position:
        return (stepped_to, pushed_to), REWARD_GOAL
    return (stepped_to, pushed_to), REWARD_STEP

def value_iteration(max_iterations=1000, threshold=1e-3):
    """Run in-place value iteration over every (agent, box) state.

    Each sweep visits the states in row-major order (agent cell outermost,
    box cell innermost), skipping states that involve a wall cell and states
    in which the box is already on the storage position. Updates are written
    straight into the module-level table ``V``, so later states in a sweep
    see values refreshed earlier in that same sweep.

    Args:
        max_iterations: Upper bound on the number of sweeps.
        threshold: Sweeping stops once the largest change in a sweep is
            below this value.

    Returns:
        The module-level array ``V`` (same object, updated in place).
    """
    # Non-wall cells in row-major order; reused for both agent and box positions.
    open_cells = [
        (row, col)
        for row in range(GRID_HEIGHT)
        for col in range(GRID_WIDTH)
        if grid[row][col] != 1
    ]

    def backup(state):
        """Best one-step return (reward plus discounted successor value)."""
        best = float('-inf')
        for action in ACTIONS:
            ((a_row, a_col), (b_row, b_col)), reward = transition(state, action)
            candidate = reward + GAMMA * V[a_row, a_col, b_row, b_col]
            if candidate > best:
                best = candidate
        return best

    for _ in range(max_iterations):
        largest_change = 0
        for agent_cell in open_cells:
            for box_cell in open_cells:
                if is_terminal(box_cell):
                    continue
                index = agent_cell + box_cell
                previous = V[index]
                V[index] = backup((agent_cell, box_cell))
                largest_change = max(largest_change, abs(previous - V[index]))
        if largest_change < threshold:
            break
    return V

def get_best_action_and_value(state):
    """Return the greedy action for ``state`` and its one-step lookahead value.

    Every action in ACTIONS is scored as ``reward + GAMMA * V[successor]``.
    Ties go to the action listed first in ACTIONS.

    Returns:
        ``(best_action, best_value)``.
    """
    chosen_action, chosen_value = None, float('-inf')
    for candidate in ACTIONS:
        ((a_row, a_col), (b_row, b_col)), reward = transition(state, candidate)
        score = reward + GAMMA * V[a_row, a_col, b_row, b_col]
        if score > chosen_value:
            chosen_action, chosen_value = candidate, score
    return chosen_action, chosen_value

def simulate(max_steps=100):
    """Print the greedy rollout from the start state until the box is stored.

    At every step the greedy action with respect to ``V`` is chosen, reported
    and applied.

    Args:
        max_steps: Safety cap on the rollout length. Without it, a greedy
            policy that cycles (for example when ``V`` has not converged or
            the box can no longer reach the storage position) would loop
            forever.
    """
    state = (agent_start, box_start)

    step_count = 0
    while not is_terminal(state[1]):
        if step_count >= max_steps:
            print(f"Stopped after {max_steps} steps without reaching the storage position.")
            break
        step_count += 1
        best_action, best_value = get_best_action_and_value(state)
        print(f"Step {step_count}: State: Agent at {state[0]}, Box at {state[1]}, Value: {best_value}, Best Action: {best_action}")
        state, _ = transition(state, best_action)

# Fill the value table (value_iteration returns the module-level V itself),
# then print the greedy rollout from the start state.
optimal_value_function = value_iteration()
simulate()


Step 1: State: Agent at (1, 2), Box at (4, 3), Value: 32.61625379000001, Best Action: DOWN
Step 2: State: Agent at (2, 2), Box at (4, 3), Value: 37.35139310000001, Best Action: DOWN
Step 3: State: Agent at (3, 2), Box at (4, 3), Value: 42.61265900000001, Best Action: RIGHT
Step 4: State: Agent at (3, 3), Box at (4, 3), Value: 48.45851000000001, Best Action: RIGHT
Step 5: State: Agent at (3, 4), Box at (4, 3), Value: 54.95390000000001, Best Action: DOWN
Step 6: State: Agent at (4, 4), Box at (4, 3), Value: 62.171000000000014, Best Action: LEFT
Step 7: State: Agent at (4, 3), Box at (4, 2), Value: 70.19000000000001, Best Action: LEFT
Step 8: State: Agent at (4, 2), Box at (4, 1), Value: 79.10000000000001, Best Action: DOWN
Step 9: State: Agent at (5, 2), Box at (4, 1), Value: 89.0, Best Action: LEFT
Step 10: State: Agent at (5, 1), Box at (4, 1), Value: 100.0, Best Action: UP


# Monte-Carlo Simulation


In [16]:
import numpy as np
import random

# Grid dimensions: GRID_HEIGHT is the number of rows, GRID_WIDTH the number of columns.
GRID_WIDTH = 6
GRID_HEIGHT = 7
# Action names understood by move().
ACTIONS = ['UP', 'DOWN', 'LEFT', 'RIGHT']
# Reward for an ordinary agent move or box push.
REWARD_STEP = -1
# Reward for pushing the box onto storage_position.
REWARD_GOAL = 100
# Reward for a blocked move or push; the state is left unchanged.
REWARD_TERMINAL = -100
# Discount factor used when accumulating returns.
GAMMA = 0.9
# Default exploration probability of the epsilon-greedy policy.
EPSILON = 0.2
# Hard cap on the length of one episode / rollout.
MAX_STEPS_PER_EPISODE = 100
# Seed for Python's `random` module so a fresh run reproduces the same episodes.
SEED = 42

# Episode generation draws from `random`; seed it once, before any sampling.
random.seed(SEED)

# Map layout, indexed as grid[row][col].
#   1 = wall (the only code tested by is_valid()),
#   0 = free cell,
#   2 = sits at box_start, 3 = sits at storage_position (markers only).
grid = np.array([
    [1, 1, 1, 1, 1, 1],
    [1, 0, 0, 1, 1, 1],
    [1, 0, 0, 1, 1, 1],
    [1, 3, 0, 0, 0, 1],
    [1, 0, 0, 2, 0, 1],
    [1, 0, 0, 1, 1, 1],
    [1, 1, 1, 1, 1, 1]
])

# All positions are (row, col) tuples.
agent_start = (1, 2)
box_start = (4, 3)
storage_position = (3, 1)

# Per-state tables keyed by ((agent_row, agent_col), (box_row, box_col)); each
# value maps every action to a number.
Q = {}              # current action-value estimates
returns_sum = {}    # accumulated returns per (state, action)
returns_count = {}  # number of returns accumulated per (state, action)

# Non-wall cells in row-major order; used for both agent and box positions.
_open_cells = [
    (_row, _col)
    for _row in range(GRID_HEIGHT)
    for _col in range(GRID_WIDTH)
    if grid[_row][_col] != 1
]

for _agent_cell in _open_cells:
    for _box_cell in _open_cells:
        _key = (_agent_cell, _box_cell)
        # Each table gets its own zero-filled dict so updates never alias.
        for _table in (Q, returns_sum, returns_count):
            _table[_key] = dict.fromkeys(ACTIONS, 0)

def is_valid(pos):
    """Return whether ``pos`` (row, col) is inside the grid and not a wall (code 1)."""
    row, col = pos
    inside = 0 <= row < GRID_HEIGHT and 0 <= col < GRID_WIDTH
    # Only index the grid once the coordinates are known to be in range.
    return inside and grid[row][col] != 1

def move(pos, action):
    """Return the (row, col) reached by taking one step from ``pos``.

    Rows grow downward, so 'UP' decreases the row and 'DOWN' increases it.
    No bounds or wall checks are performed here.

    Args:
        pos: Current position as a (row, col) pair.
        action: One of 'UP', 'DOWN', 'LEFT', 'RIGHT'.

    Returns:
        The neighbouring position as a (row, col) tuple.

    Raises:
        ValueError: If ``action`` is not one of the four known actions.
    """
    offsets = {
        'UP': (-1, 0),
        'DOWN': (1, 0),
        'LEFT': (0, -1),
        'RIGHT': (0, 1),
    }
    try:
        d_row, d_col = offsets[action]
    except (KeyError, TypeError):
        # Fail loudly instead of silently returning None, which would only
        # surface later as an unrelated unpacking error in the caller.
        raise ValueError(f"Unknown action: {action!r}") from None
    x, y = pos
    return (x + d_row, y + d_col)

def is_terminal(box_pos):
    """Return whether ``box_pos`` equals the storage position, which ends an episode."""
    on_storage = box_pos == storage_position
    return on_storage

# States already left at least once during the current episode; callers clear
# this set when a new episode or rollout starts.
visited_states = set()

def transition_with_penalty(state, action, revisit_penalty=5):
    """Apply ``action`` to ``state`` and discourage returning to old states.

    The move itself follows the usual rules: stepping into the box pushes it,
    a blocked step or push leaves the state unchanged with REWARD_TERMINAL,
    pushing the box onto storage_position yields REWARD_GOAL, and anything
    else yields REWARD_STEP. If ``state`` was already acted from earlier in
    the episode, ``revisit_penalty`` is subtracted from that reward.

    The action is always applied. Returning the unchanged state on a revisit
    would freeze the agent for the rest of the episode, because that state
    stays in ``visited_states`` and every later action would be ignored too.

    Args:
        state: ``(agent_pos, box_pos)`` with (row, col) positions.
        action: One of ACTIONS.
        revisit_penalty: Amount subtracted from the reward when ``state`` has
            been seen before in this episode.

    Returns:
        ``(next_state, reward)``.
    """
    agent_pos, box_pos = state

    # Record the visit before moving so a bump that leaves the state unchanged
    # counts as a revisit on the following step.
    revisited = state in visited_states
    visited_states.add(state)

    new_agent_pos = move(agent_pos, action)
    if new_agent_pos == box_pos:
        new_box_pos = move(box_pos, action)
        if is_valid(new_box_pos):
            next_state = (new_agent_pos, new_box_pos)
            reward = REWARD_GOAL if new_box_pos == storage_position else REWARD_STEP
        else:
            next_state, reward = state, REWARD_TERMINAL
    elif is_valid(new_agent_pos):
        next_state, reward = (new_agent_pos, box_pos), REWARD_STEP
    else:
        next_state, reward = state, REWARD_TERMINAL

    if revisited:
        reward -= revisit_penalty
    return next_state, reward

def epsilon_greedy_policy(state, epsilon=EPSILON):
    """Pick an action for ``state``: random with probability ``epsilon``, else greedy.

    The greedy choice is the action with the highest entry in ``Q`` for this
    state; ties go to the action stored first.
    """
    explore = random.uniform(0, 1) < epsilon
    if explore:
        return random.choice(ACTIONS)

    agent_pos, box_pos = state
    q_row = Q[(agent_pos, box_pos)]
    greedy_action, _ = max(q_row.items(), key=lambda item: item[1])
    return greedy_action

def detect_and_break_loop(episode):
    """Return True when the episode stays in one state for too long.

    ``episode`` is a sequence of ``(state, action, reward)`` steps. The
    function counts how many consecutive steps started in the same state as
    the step before; once that count reaches 10 (eleven identical states in a
    row) it reports a loop. Cycles through several different states are not
    detected.
    """
    unset = object()  # sentinel that compares unequal to every real state
    previous_state = unset
    repeats = 0
    for state, _action, _reward in episode:
        repeats = repeats + 1 if state == previous_state else 0
        previous_state = state
        if repeats >= 10:
            return True
    return False

def generate_episode_with_step_limit():
    """Sample one episode from the start state using the epsilon-greedy policy.

    The episode ends when the box reaches the storage position or after
    MAX_STEPS_PER_EPISODE steps, whichever comes first. ``visited_states`` is
    reset at the beginning so revisit tracking starts fresh.

    Returns:
        A list of ``(state, action, reward)`` tuples in the order taken.
    """
    visited_states.clear()
    trajectory = []
    current = (agent_start, box_start)

    for _ in range(MAX_STEPS_PER_EPISODE):
        if is_terminal(current[1]):
            break
        chosen = epsilon_greedy_policy(current)
        successor, reward = transition_with_penalty(current, chosen)
        trajectory.append((current, chosen, reward))
        current = successor

    return trajectory

def monte_carlo_first_visit(episodes=1000):
    """Estimate ``Q`` with first-visit Monte-Carlo control.

    For each sampled episode the discounted return ``G`` is accumulated
    backwards, and for every (state, action) pair only the return that follows
    its first occurrence in the episode is averaged into ``Q``. Episodes
    flagged by detect_and_break_loop() are discarded.

    Args:
        episodes: Number of episodes to sample.

    Side effects:
        Updates the module-level ``returns_sum``, ``returns_count`` and ``Q``.
    """
    for _ in range(episodes):
        episode = generate_episode_with_step_limit()
        if detect_and_break_loop(episode):
            print("Loop detected, breaking the loop")
            continue

        # Index of the earliest step at which each (state, action) pair occurs.
        # Walking backwards and recording at the first pair *encountered* would
        # use the last occurrence instead, which is last-visit MC.
        first_visit_step = {}
        for t, (state, action, _reward) in enumerate(episode):
            first_visit_step.setdefault((state, action), t)

        G = 0
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G = reward + GAMMA * G
            if first_visit_step[(state, action)] == t:
                returns_sum[state][action] += G
                returns_count[state][action] += 1
                Q[state][action] = returns_sum[state][action] / returns_count[state][action]

def simulate():
    """Print a greedy rollout of the learned policy from the start state.

    The rollout stops when the box reaches the storage position or after
    MAX_STEPS_PER_EPISODE steps.

    Actions are chosen with ``epsilon=0`` so that every printed "Best Action"
    really is the greedy action under ``Q`` and not an exploratory one.
    ``visited_states`` is cleared first so the rollout does not inherit the
    revisit bookkeeping left behind by the last training episode.
    """
    visited_states.clear()
    state = (agent_start, box_start)

    step_count = 0
    while not is_terminal(state[1]) and step_count < MAX_STEPS_PER_EPISODE:
        step_count += 1
        action = epsilon_greedy_policy(state, epsilon=0)
        next_state, _ = transition_with_penalty(state, action)
        print(f"Step {step_count}: State: Agent at {state[0]}, Box at {state[1]}, Best Action: {action}")
        state = next_state

# Learn Q from 1000 sampled episodes, then print a rollout from the start state.
monte_carlo_first_visit(episodes=1000)
simulate()


Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detected, breaking the loop
Loop detec