IMPLEMENTING DYNAMIC PROGRAMMING - VALUE ITERATION

In [1]:
import numpy as np

WIDTH = 6
HEIGHT = 7
ACTIONS_LIST = ['UP', 'DOWN', 'LEFT', 'RIGHT']
STEP_REWARD = -1
GOAL_REWARD = 100
TERMINAL_REWARD = -100
DISCOUNT = 0.9

layout = np.array([
    [1, 1, 1, 1, 1, 1],
    [1, 0, 0, 1, 1, 1],
    [1, 0, 0, 1, 1, 1],
    [1, 3, 0, 0, 0, 1],
    [1, 0, 0, 2, 0, 1],
    [1, 0, 0, 1, 1, 1],
    [1, 1, 1, 1, 1, 1]
])

start_agent = (1, 2)
start_box = (4, 3)
storage_spot = (3, 1)

values = np.zeros((HEIGHT, WIDTH, HEIGHT, WIDTH))

def valid(pos):
    x, y = pos
    return 0 <= x < HEIGHT and 0 <= y < WIDTH and layout[x][y] != 1

def make_move(pos, action):
    x, y = pos
    if action == 'UP':
        return (x - 1, y)
    elif action == 'DOWN':
        return (x + 1, y)
    elif action == 'LEFT':
        return (x, y - 1)
    elif action == 'RIGHT':
        return (x, y + 1)

def check_terminal(box_pos):
    return box_pos == storage_spot

def perform_transition(state, action):
    agent, box = state
    next_agent = make_move(agent, action)

    if next_agent == box:
        next_box = make_move(box, action)
        if valid(next_box):
            reward = GOAL_REWARD if next_box == storage_spot else STEP_REWARD
            return (next_agent, next_box), reward
        else:
            return state, TERMINAL_REWARD
    elif valid(next_agent):
        return (next_agent, box), STEP_REWARD
    return state, TERMINAL_REWARD

def run_value_iteration(max_iter=1000, tol=1e-3):
    for _ in range(max_iter):
        delta = 0
        for i in range(HEIGHT):
            for j in range(WIDTH):
                for bi in range(HEIGHT):
                    for bj in range(WIDTH):
                        if layout[i][j] == 1 or layout[bi][bj] == 1:
                            continue
                        state = ((i, j), (bi, bj))
                        if check_terminal((bi, bj)):
                            continue
                        old_value = values[i, j, bi, bj]
                        best_val = float('-inf')
                        for action in ACTIONS_LIST:
                            next_state, reward = perform_transition(state, action)
                            next_i, next_j = next_state[0]
                            next_bi, next_bj = next_state[1]
                            current_val = reward + DISCOUNT * values[next_i, next_j, next_bi, next_bj]
                            best_val = max(best_val, current_val)
                        values[i, j, bi, bj] = best_val
                        delta = max(delta, abs(old_value - values[i, j, bi, bj]))
        if delta < tol:
            break
    return values

def find_best_action(state):
    optimal_action = None
    optimal_value = float('-inf')
    i, j = state[0]
    bi, bj = state[1]
    for action in ACTIONS_LIST:
        next_state, reward = perform_transition(state, action)
        next_i, next_j = next_state[0]
        next_bi, next_bj = next_state[1]
        value = reward + DISCOUNT * values[next_i, next_j, next_bi, next_bj]
        if value > optimal_value:
            optimal_value = value
            optimal_action = action
    return optimal_action, optimal_value

def run_simulation():
    agent = start_agent
    box = start_box
    state = (agent, box)
    step = 0
    while not check_terminal(state[1]):
        step += 1
        action, val = find_best_action(state)
        print(f"Step {step}: Agent at {state[0]}, Box at {state[1]}, Value: {val}, Best Action: {action}")
        state, _ = perform_transition(state, action)

final_values = run_value_iteration()
run_simulation()


Step 1: Agent at (1, 2), Box at (4, 3), Value: 32.61625379000001, Best Action: DOWN
Step 2: Agent at (2, 2), Box at (4, 3), Value: 37.35139310000001, Best Action: DOWN
Step 3: Agent at (3, 2), Box at (4, 3), Value: 42.61265900000001, Best Action: RIGHT
Step 4: Agent at (3, 3), Box at (4, 3), Value: 48.45851000000001, Best Action: RIGHT
Step 5: Agent at (3, 4), Box at (4, 3), Value: 54.95390000000001, Best Action: DOWN
Step 6: Agent at (4, 4), Box at (4, 3), Value: 62.171000000000014, Best Action: LEFT
Step 7: Agent at (4, 3), Box at (4, 2), Value: 70.19000000000001, Best Action: LEFT
Step 8: Agent at (4, 2), Box at (4, 1), Value: 79.10000000000001, Best Action: DOWN
Step 9: Agent at (5, 2), Box at (4, 1), Value: 89.0, Best Action: LEFT
Step 10: Agent at (5, 1), Box at (4, 1), Value: 100.0, Best Action: UP


IMPLEMENTING MONTE CARLO

In [3]:
import numpy as np
import random

GRID_WIDTH, GRID_HEIGHT = 6, 7
ACTIONS = ['UP', 'DOWN', 'LEFT', 'RIGHT']
REWARD_STEP, REWARD_GOAL, REWARD_TERMINAL = -1, 100, -100
GAMMA, EPSILON = 0.9, 0.2
MAX_STEPS_PER_EPISODE = 100

grid = np.array([
    [1, 1, 1, 1, 1, 1],
    [1, 0, 0, 1, 1, 1],
    [1, 0, 0, 1, 1, 1],
    [1, 3, 0, 0, 0, 1],
    [1, 0, 0, 2, 0, 1],
    [1, 0, 0, 1, 1, 1],
    [1, 1, 1, 1, 1, 1]
])

agent_start, box_start = (1, 2), (4, 3)
storage_position = (3, 1)

Q, returns_sum, returns_count = {}, {}, {}

for i in range(GRID_HEIGHT):
    for j in range(GRID_WIDTH):
        for bi in range(GRID_HEIGHT):
            for bj in range(GRID_WIDTH):
                if grid[i][j] == 1 or grid[bi][bj] == 1:
                    continue
                state = ((i, j), (bi, bj))
                Q[state] = {action: 0 for action in ACTIONS}
                returns_sum[state] = {action: 0 for action in ACTIONS}
                returns_count[state] = {action: 0 for action in ACTIONS}

def is_valid(pos):
    return 0 <= pos[0] < GRID_HEIGHT and 0 <= pos[1] < GRID_WIDTH and grid[pos[0]][pos[1]] != 1

def move(pos, action):
    if action == 'UP':
        return (pos[0] - 1, pos[1])
    if action == 'DOWN':
        return (pos[0] + 1, pos[1])
    if action == 'LEFT':
        return (pos[0], pos[1] - 1)
    return (pos[0], pos[1] + 1)

def is_terminal(box_pos):
    return box_pos == storage_position

visited_states = set()

def transition_with_penalty(state, action):
    global visited_states
    agent_pos, box_pos = state
    new_agent_pos = move(agent_pos, action)

    if state in visited_states:
        return state, REWARD_STEP - 5

    visited_states.add(state)

    if new_agent_pos == box_pos:
        new_box_pos = move(box_pos, action)
        if is_valid(new_box_pos):
            return (new_agent_pos, new_box_pos), REWARD_GOAL if new_box_pos == storage_position else REWARD_STEP
        return state, REWARD_TERMINAL
    if is_valid(new_agent_pos):
        return (new_agent_pos, box_pos), REWARD_STEP
    return state, REWARD_TERMINAL

def epsilon_greedy_policy(state):
    if random.uniform(0, 1) < EPSILON:
        return random.choice(ACTIONS)
    return max(Q[state], key=Q[state].get)

def detect_and_break_loop(episode):
    recent_states = set()
    loop_counter = 0
    for state, action, reward in episode:
        if state in recent_states:
            loop_counter += 1
        else:
            loop_counter = 0
            recent_states.clear()
        recent_states.add(state)
        if loop_counter >= 10:
            return True
    return False

def generate_episode_with_step_limit():
    episode, state, steps = [], (agent_start, box_start), 0
    visited_states.clear()

    while not is_terminal(state[1]) and steps < MAX_STEPS_PER_EPISODE:
        action = epsilon_greedy_policy(state)
        next_state, reward = transition_with_penalty(state, action)
        episode.append((state, action, reward))
        state, steps = next_state, steps + 1

    return episode

def monte_carlo_first_visit(episodes=1000):
    for _ in range(episodes):
        episode = generate_episode_with_step_limit()
        if detect_and_break_loop(episode):
            continue

        visited, G = set(), 0

        for step in reversed(episode):
            state, action, reward = step
            G = reward + GAMMA * G
            if (state, action) not in visited:
                visited.add((state, action))
                returns_sum[state][action] += G
                returns_count[state][action] += 1
                Q[state][action] = returns_sum[state][action] / returns_count[state][action]

def simulate():
    state = (agent_start, box_start)
    step_count = 0

    while not is_terminal(state[1]) and step_count < MAX_STEPS_PER_EPISODE:
        step_count += 1
        action = epsilon_greedy_policy(state)
        next_state, _ = transition_with_penalty(state, action)
        print(f"Step {step_count}: State: Agent at {state[0]}, Box at {state[1]}, Best Action: {action}")
        state = next_state

monte_carlo_first_visit(episodes=1000)
simulate()


Step 1: State: Agent at (1, 2), Box at (4, 3), Best Action: UP
Step 2: State: Agent at (1, 2), Box at (4, 3), Best Action: UP
Step 3: State: Agent at (1, 2), Box at (4, 3), Best Action: UP
Step 4: State: Agent at (1, 2), Box at (4, 3), Best Action: LEFT
Step 5: State: Agent at (1, 2), Box at (4, 3), Best Action: UP
Step 6: State: Agent at (1, 2), Box at (4, 3), Best Action: UP
Step 7: State: Agent at (1, 2), Box at (4, 3), Best Action: UP
Step 8: State: Agent at (1, 2), Box at (4, 3), Best Action: UP
Step 9: State: Agent at (1, 2), Box at (4, 3), Best Action: UP
Step 10: State: Agent at (1, 2), Box at (4, 3), Best Action: UP
Step 11: State: Agent at (1, 2), Box at (4, 3), Best Action: UP
Step 12: State: Agent at (1, 2), Box at (4, 3), Best Action: UP
Step 13: State: Agent at (1, 2), Box at (4, 3), Best Action: UP
Step 14: State: Agent at (1, 2), Box at (4, 3), Best Action: UP
Step 15: State: Agent at (1, 2), Box at (4, 3), Best Action: RIGHT
Step 16: State: Agent at (1, 2), Box at (4, 