In [None]:
# --- Grid layout -----------------------------------------------------------
# Coordinates are [row, col] pairs; `holes` lists every hole cell.
rows, cols = 5, 5
start = [0, 0]
goal = [4, 4]
holes = [
    [0, 3],
    [1, 0],
    [2, 2],
    [3, 0],
    [3, 2],
]

# --- Dynamics --------------------------------------------------------------
slippery = False
slip_prob = 0.2  # NOTE(review): not read by any code visible in this notebook

# --- Solver settings -------------------------------------------------------
gamma = 0.99   # discount factor
theta = 1e-8   # stop evaluating once a sweep changes no value by this much

# --- Explicit map (one string per row); None means "build it from the above"
map_desc = [
    'SFFHF',
    'HFFFF',
    'FFHFF',
    'HFHFF',
    'FFFFG',
]

# --- Where the agent is placed; the flat index takes priority over row/col --
current_state = 4
current_row, current_col = 0, 4

In [None]:
import numpy as np, time
try:
    import gymnasium as gym
except ImportError:
    import gym
# Normalise the configured coordinates: (row, col) tuples of built-in ints,
# whatever sequence/number types the parameters arrived as.
start = int(start[0]), int(start[1])
goal = int(goal[0]), int(goal[1])
holes = [(int(hole_row), int(hole_col)) for hole_row, hole_col in holes]
# Build desc if not provided
def build_desc(rows, cols, start, goal, holes):
    """Build a grid map as a list of row strings.

    Every cell starts as 'F'. The start cell is then marked 'S', the goal
    cell 'G', and finally each hole cell 'H'. Holes are written last, so a
    hole that coincides with the start or goal replaces that marker.

    Args:
        rows: Number of grid rows.
        cols: Number of grid columns.
        start: (row, col) pair of the start cell.
        goal: (row, col) pair of the goal cell.
        holes: Iterable of (row, col) pairs, one per hole.

    Returns:
        A list of `rows` strings, each `cols` characters long.
    """
    grid = np.full((rows, cols), 'F', dtype='<U1')

    start_row, start_col = start
    goal_row, goal_col = goal
    grid[start_row, start_col] = 'S'
    grid[goal_row, goal_col] = 'G'

    for hole_row, hole_col in holes:
        grid[hole_row, hole_col] = 'H'

    return list(map(''.join, grid))
# Generate a layout only when no explicit map was supplied.
if map_desc is None:
    map_desc = build_desc(rows, cols, start, goal, holes)

env = gym.make(
    'FrozenLake-v1',
    desc=map_desc,
    is_slippery=bool(slippery),
    render_mode='human',
)
env.reset()

# Best-effort placement of the agent by writing the environment's state
# attribute directly. Priority: explicit flat index, then row/col pair, then
# the start cell. Any failure is ignored and the post-reset state is kept.
if current_state is not None:
    try:
        env.unwrapped.s = int(current_state)
    except Exception:
        pass
elif current_row is None or current_col is None:
    try:
        env.unwrapped.s = start[0] * cols + start[1]
    except Exception:
        pass
else:
    try:
        env.unwrapped.s = int(current_row) * cols + int(current_col)
    except Exception:
        pass

# Transition table and problem sizes used by the planning functions:
# P[s][a] is iterated there as (prob, next_state, reward, done) tuples.
P = env.unwrapped.P
nS = env.observation_space.n
nA = env.action_space.n
def policy_evaluation(pi, V=None, gamma=gamma, theta=theta):
    """Estimate the state values of a fixed policy by repeated in-place sweeps.

    Each sweep visits every state, replaces its value with the expected
    one-step return of the action `pi` selects there, and tracks the largest
    absolute change. Sweeping stops once that change is below `theta`.
    Transitions flagged as done contribute their reward only (no bootstrap).

    Reads the module-level transition table `P` and state count `nS`.

    Args:
        pi: Indexable by state; gives the action taken in that state.
        V: Optional initial value estimate. It is copied, never modified.
            Defaults to all zeros.
        gamma: Discount factor.
        theta: Convergence threshold on the per-sweep maximum change.

    Returns:
        A float array of length `nS` with the estimated state values.
    """
    if V is None:
        values = np.zeros(nS, dtype=float)
    else:
        values = np.array(V, dtype=float, copy=True)

    converged = False
    while not converged:
        largest_change = 0.0
        for state in range(nS):
            previous = values[state]
            action = pi[state]
            backup = 0.0
            for prob, next_state, reward, done in P[state][action]:
                backup += prob * (reward + gamma * (0.0 if done else values[next_state]))
            # In-place update: later states in this sweep already see it.
            values[state] = backup
            largest_change = max(largest_change, abs(previous - backup))
        converged = largest_change < theta
    return values
def policy_improvement(V, gamma=gamma):
    """Return the policy that acts greedily with respect to `V`.

    For every state the expected one-step return of each action is computed
    from the module-level transition table `P`, and the action with the
    highest value is chosen (the lowest index wins ties, as `argmax` does).
    Transitions flagged as done contribute their reward only.

    Args:
        V: Indexable by state; the value estimate to be greedy against.
        gamma: Discount factor.

    Returns:
        An int array of length `nS` holding one action per state.
    """
    greedy = np.zeros(nS, dtype=int)
    for state in range(nS):
        action_values = np.zeros(nA, dtype=float)
        for action in range(nA):
            for prob, next_state, reward, done in P[state][action]:
                bootstrap = 0.0 if done else V[next_state]
                action_values[action] += prob * (reward + gamma * bootstrap)
        greedy[state] = int(action_values.argmax())
    return greedy
def policy_iteration(gamma=gamma, theta=theta):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Starts from a uniformly random policy and an all-zero value estimate.
    Each round evaluates the current policy (warm-started from the previous
    values) and then computes the greedy policy for the result. The loop ends
    on the first round in which the greedy policy equals the evaluated one.

    Args:
        gamma: Discount factor, forwarded to evaluation and improvement.
        theta: Convergence threshold, forwarded to evaluation.

    Returns:
        A tuple `(pi, V, iters)`: the final policy, the values from its last
        evaluation, and the number of evaluate/improve rounds performed.
    """
    policy = np.random.randint(0, nA, size=nS, dtype=int)
    values = np.zeros(nS, dtype=float)
    rounds = 0
    stable = False
    while not stable:
        rounds += 1
        values = policy_evaluation(policy, values, gamma=gamma, theta=theta)
        greedy = policy_improvement(values, gamma=gamma)
        stable = np.array_equal(policy, greedy)
        policy = greedy
    return policy, values, rounds


# Solve the configured map once; the results are reused by the rollout and report.
pi_opt, V_opt, iters = policy_iteration(gamma=gamma, theta=theta)
def run_episode(env, pi):
    """Play one episode in `env`, always taking the action `pi` prescribes.

    After resetting, the agent is moved (best effort) to the configured
    position: the module-level `current_state` if set, otherwise the cell
    given by `current_row`/`current_col` when both are set. If the move
    fails, the episode simply starts from the observation `reset` returned.

    Args:
        env: Environment exposing `reset()`, `step(action)` and `unwrapped.s`.
        pi: Indexable by observation; gives the action to take.

    Returns:
        A tuple `(total, steps, terminated, truncated)`: the summed reward,
        the number of steps taken, and the two end-of-episode flags from the
        final step.
    """
    obs, info = env.reset()

    if current_state is not None:
        try:
            env.unwrapped.s = int(current_state)
            obs = env.unwrapped.s
        except Exception:
            pass
    elif current_row is not None and current_col is not None:
        try:
            env.unwrapped.s = int(current_row) * cols + int(current_col)
            obs = env.unwrapped.s
        except Exception:
            pass

    total = 0.0
    steps = 0
    terminated = False
    truncated = False
    while True:
        action = int(pi[obs])
        obs, reward, terminated, truncated, info = env.step(action)
        total += reward
        steps += 1
        if terminated or truncated:
            break
    return total, steps, terminated, truncated
# Roll out one episode under the computed policy and report how it ended.
total_reward, steps, terminated, truncated = run_episode(env, pi_opt)
print(f'Episode -> reward: {total_reward}, steps: {steps}, terminated: {terminated}, truncated: {truncated}')
print(f'Converged in {iters} iterations')

# Show the value function as a grid; if rows*cols does not match the number
# of states the reshape fails and the flat vector is printed instead.
print('Optimal V (reshaped):')
try:
    gridV = V_opt.reshape(rows, cols)
    print(np.round(gridV, 3))
except Exception:
    print(np.round(V_opt, 3))

# Glyph drawn for each action index.
arrow_map = {0: '←', 1: '↓', 2: '→', 3: '↑'}
# The leading newline must be written as an escape: a literal line break
# inside a single-quoted string is a syntax error.
print('\nOptimal Policy:')
try:
    gridPi = np.array([arrow_map[a] for a in pi_opt]).reshape(rows, cols)
    for policy_row in gridPi:
        print(' '.join(policy_row))
except Exception:
    # Unknown action index or mismatched grid size: print the raw policy.
    print(pi_opt)

# Keep window responsive briefly then close
try:
    import pygame
    for _ in range(80):
        pygame.event.pump()
        time.sleep(0.03)
except Exception:
    # pygame unavailable or not initialised: just wait before closing.
    time.sleep(2.0)
env.close()
