In [1]:
# Parameters
# Grid and solver settings assigned in the first cell. Here `start`, `goal` and
# each entry of `holes` are string-encoded "(row, col)" pairs, and `map_desc`
# is a list of row strings (one character per tile).
# NOTE(review): this looks like a papermill-injected cell — confirm. The next
# cell assigns rows..map_desc again with its own literals, so as the notebook
# is laid out the values written here are replaced before they are used.
rows = 5
cols = 5
start = "(0, 0)"
goal = "(4, 4)"
holes = ["(0, 3)", "(1, 0)", "(2, 2)", "(3, 0)", "(3, 2)"]
slippery = False
slip_prob = 0.2
gamma = 0.99
theta = 1e-08
map_desc = ["SFFHF", "HFFFF", "FFHFF", "HFHFF", "FFFFG"]
# The three names below are not referenced anywhere else in the visible code;
# presumably UI bookkeeping for the agent's current tile — TODO confirm.
current_state = 11
current_row = 2
current_col = 1


In [2]:

# Parameters (papermill overrides these when run from the UI)
# NOTE(review): these assignments execute after cell In [1], which sets the
# same names, so as laid out here they replace those earlier values rather
# than being overridden by them. Confirm this cell carries the `parameters`
# tag so that papermill injects its values after this cell, not before it.
rows = 5  # grid height, used by build_desc
cols = 5  # grid width, used by build_desc and for the start index
start = (0, 0)  # (row, col) of the 'S' tile
goal = (4, 4)  # (row, col) of the 'G' tile
holes = [(0, 3), (1, 0), (2, 2), (3, 0), (3, 2)]  # (row, col) of each 'H' tile
slippery = False  # forwarded to gym.make as is_slippery
slip_prob = 0.2  # not read by any code in this cell
gamma = 0.99  # discount factor used by the policy-iteration functions
theta = 1e-8  # convergence threshold used by policy_evaluation
map_desc = None  # optional: UI may pass a list of strings

# Policy Iteration on FrozenLake-v1 (parameterized)
import numpy as np
import time

# Prefer gymnasium if present
try:
    import gymnasium as gym
except ImportError:
    import gym
import ast

# Normalize parameters if papermill injected strings

def _to_tuple_2(x):
    if isinstance(x, tuple) and len(x) == 2:
        return (int(x[0]), int(x[1]))
    if isinstance(x, str):
        t = ast.literal_eval(x)
        return (int(t[0]), int(t[1]))
    t = tuple(x)
    return (int(t[0]), int(t[1]))


def _to_tuple_list(x):
    if isinstance(x, str):
        v = ast.literal_eval(x)
    else:
        v = x
    return [tuple(map(int, t)) for t in v]

# Rebind the coordinate parameters to integer tuples so the code below can
# index with them regardless of how they were supplied.
start = _to_tuple_2(start)
goal = _to_tuple_2(goal)
holes = _to_tuple_list(holes)

# Build desc only if not provided by UI

def build_desc(rows, cols, start, goal, holes):
    """Build a FrozenLake-style map description from grid parameters.

    Every tile starts as frozen ``'F'``. Holes are marked ``'H'`` first and
    the start ``'S'`` and goal ``'G'`` tiles are written last, so a hole that
    happens to be listed on the start or goal cell can no longer erase it
    (previously holes were written last and won).

    Args:
        rows: Number of grid rows.
        cols: Number of grid columns.
        start: ``(row, col)`` of the start tile.
        goal: ``(row, col)`` of the goal tile.
        holes: Iterable of ``(row, col)`` hole positions.

    Returns:
        A list of ``rows`` strings, each ``cols`` characters long.
    """
    grid = np.full((rows, cols), 'F', dtype='<U1')
    for hole_row, hole_col in holes:
        grid[hole_row, hole_col] = 'H'
    # Written after the holes so the map always keeps its S and G tiles.
    start_row, start_col = start
    goal_row, goal_col = goal
    grid[start_row, start_col] = 'S'
    grid[goal_row, goal_col] = 'G'
    return ["".join(row) for row in grid]

if map_desc is None:
    map_desc = build_desc(rows, cols, start, goal, holes)

# Create env using the provided parameters. Note: slip_prob is not used by gym's FrozenLake.
env = gym.make(
    "FrozenLake-v1",
    is_slippery=bool(slippery),
    desc=map_desc,
    render_mode="human",
)
# Force starting state to provided start coordinates (gym normally starts at 'S').
# States are indexed row-major, hence row * cols + col.
start_index = start[0] * cols + start[1]
try:
    env.reset()
    env.unwrapped.s = start_index  # override internal state
except Exception as _e:
    # Best-effort: the override is optional, so keep going, but say that it
    # failed instead of silently claiming success.
    print(f"Warning: could not set start position {start} (index {start_index}): {_e!r}")
else:
    print(f"Start position set to: {start} (index {start_index})")
# NOTE(review): run_episode() calls env.reset() again later, which presumably
# puts the agent back on the map's 'S' tile and discards this override —
# confirm whether the override is still needed.

# Access MDP transitions and spaces
P = (env.unwrapped.P).copy()  # shallow copy of the transition table
nS = env.observation_space.n
nA = env.action_space.n

# Policy iteration components use the provided gamma/theta

def policy_evaluation(pi, V=None, gamma: float = gamma, theta: float = theta):
    """Iteratively evaluate a deterministic policy on the module-level MDP.

    Sweeps over all ``nS`` states, updating each value in place from the
    transitions ``P[state][pi[state]]``, until the largest change seen in a
    sweep drops below ``theta``.

    Args:
        pi: Indexable giving the action taken in each state.
        V: Optional initial value estimates; copied, never modified. Zeros
            are used when omitted.
        gamma: Discount factor (default captured from the module-level
            ``gamma`` when this function is defined).
        theta: Convergence threshold (default captured the same way).

    Returns:
        A float64 array of length ``nS`` with the evaluated state values.
    """
    values = (
        np.zeros(nS, dtype=np.float64)
        if V is None
        else np.array(V, dtype=np.float64, copy=True)
    )
    converged = False
    while not converged:
        largest_change = 0.0
        for state in range(nS):
            previous = values[state]
            chosen_action = pi[state]
            backup = 0.0
            for prob, next_state, reward, done in P[state][chosen_action]:
                # Terminal transitions contribute no future value.
                bootstrap = 0.0 if done else values[next_state]
                backup += prob * (reward + gamma * bootstrap)
            # In-place update: later states in this sweep see the new value.
            values[state] = backup
            largest_change = max(largest_change, abs(previous - backup))
        converged = largest_change < theta
    return values


def policy_improvement(V, gamma: float = gamma):
    """Derive the greedy policy with respect to the value estimates ``V``.

    For every state the expected one-step return of each action is computed
    from the module-level transition table ``P`` and the action with the
    largest value is selected (``np.argmax`` picks the first on ties).

    Args:
        V: Indexable of state values, one per state.
        gamma: Discount factor (default captured from the module-level
            ``gamma`` when this function is defined).

    Returns:
        An int array of length ``nS`` holding the chosen action per state.
    """
    greedy = np.zeros(nS, dtype=int)
    for state in range(nS):
        action_values = np.zeros(nA, dtype=np.float64)
        for action in range(nA):
            expected = 0.0
            for prob, next_state, reward, done in P[state][action]:
                # Terminal transitions contribute no future value.
                future = 0.0 if done else V[next_state]
                expected += prob * (reward + gamma * future)
            action_values[action] = expected
        greedy[state] = int(np.argmax(action_values))
    return greedy


def policy_iteration(gamma: float = gamma, theta: float = theta):
    """Run policy iteration from a random initial policy until it is stable.

    Alternates ``policy_evaluation`` (warm-started from the previous values)
    and ``policy_improvement`` until the improved policy equals the one that
    was just evaluated.

    Args:
        gamma: Discount factor passed to both sub-steps.
        theta: Convergence threshold passed to policy evaluation.

    Returns:
        A tuple ``(pi, V, iteration)``: the final policy, the values from the
        last evaluation, and the number of evaluate/improve rounds performed.
    """
    policy = np.random.randint(0, nA, size=nS, dtype=int)
    values = np.zeros(nS, dtype=np.float64)
    rounds = 0
    stable = False
    while not stable:
        rounds += 1
        values = policy_evaluation(policy, values, gamma=gamma, theta=theta)
        improved = policy_improvement(values, gamma=gamma)
        # Stop once improvement no longer changes any action.
        stable = np.array_equal(policy, improved)
        policy = improved
    return policy, values, rounds


# Solve the MDP once; pi_opt, V_opt and iters are reused by the episode run
# and the printouts further down.
pi_opt, V_opt, iters = policy_iteration(gamma=gamma, theta=theta)


def run_episode(env, pi):
    """Play a single episode in ``env`` by following policy ``pi``.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        pi: Indexable mapping an observation to the action to take.

    Returns:
        A tuple ``(total_reward, steps, terminated, truncated)`` where the
        two flags are those reported by the final step.
    """
    state, _ = env.reset()
    episode_return = 0.0
    step_count = 0
    ended = cut_short = False
    while not ended and not cut_short:
        action = int(pi[state])
        state, reward, ended, cut_short, _ = env.step(action)
        episode_return += reward
        step_count += 1
    return episode_return, step_count, ended, cut_short


# Roll out one episode with the solved policy and report how it ended.
total_reward, steps, terminated, truncated = run_episode(env, pi_opt)
print(f"Episode -> reward: {total_reward}, steps: {steps}, terminated: {terminated}, truncated: {truncated}")

print(f"Converged in {iters} iterations")
# Take the grid shape from the map the env was built with, so rectangular
# maps (rows != cols) are printed as a grid too; square maps print exactly as
# before. The flat fallback is kept in case the shape does not match nS.
grid_rows, grid_cols = len(map_desc), len(map_desc[0])
is_grid = grid_rows * grid_cols == nS
# Label text kept unchanged so existing consumers of the output still match.
print("Optimal V (reshaped if square):")
if is_grid:
    print(np.round(V_opt.reshape(grid_rows, grid_cols), 3))
else:
    print(np.round(V_opt, 3))

# Arrow glyph printed for each action index 0-3.
arrow_map = {0: "←", 1: "↓", 2: "→", 3: "↑"}
print("\nOptimal Policy:")
if is_grid:
    grid = np.array([arrow_map[a] for a in pi_opt]).reshape(grid_rows, grid_cols)
    for r in range(grid_rows):
        print(" ".join(grid[r]))
else:
    print(pi_opt)

# Keep window visible briefly and pump events to avoid "Not Responding"
try:
    import pygame
    for _ in range(100):  # ~3 seconds
        pygame.event.pump()
        time.sleep(0.03)
except Exception:
    # Best-effort: without a usable pygame just wait the same ~3 seconds.
    time.sleep(3.0)

env.close()

  from pkg_resources import resource_stream, resource_exists


Start position set to: (0, 0) (index 0)


Episode -> reward: 1.0, steps: 8, terminated: True, truncated: False
Converged in 9 iterations
Optimal V (reshaped if square):
[[0.932 0.941 0.951 0.    0.97 ]
 [0.    0.951 0.961 0.97  0.98 ]
 [0.951 0.961 0.    0.98  0.99 ]
 [0.    0.97  0.    0.99  1.   ]
 [0.97  0.98  0.99  1.    0.   ]]

Optimal Policy:
→ ↓ ↓ ← ↓
← ↓ → ↓ ↓
→ ↓ ← ↓ ↓
← ↓ ← ↓ ↓
→ → → → ←
