In [9]:
import numpy as np

# 定義冰湖遊戲的狀態空間和行動空間
n_states = 16
n_actions = 4
# S: start, F: frozen surface, H: hole, G: goal
desc = [
    'SFFF',
    'FHFH',
    'FFFH',
    'HFFG'
]

# 定義回報函數
def reward(state):
    if state == n_states - 1:
        return 1
    else:
        return 0

# 初始化狀態值函數和策略
V = np.zeros(n_states)
policy = np.zeros(n_states)

# 定義轉移概率矩陣和回報矩陣
P = np.zeros((n_states, n_actions, n_states))
R = np.zeros((n_states, n_actions, n_states))
for i in range(n_states):
    row, col = i // 4, i % 4
    for j in range(n_actions):
        if desc[row][col] == 'H':
            P[i, j, i] = 1
            R[i, j, i] = reward(i)
        else:
            if j == 0: # up
                if row == 0:
                    next_state = i
                else:
                    next_state = (row-1)*4 + col
            elif j == 1: # down
                if row == 3:
                    next_state = i
                else:
                    next_state = (row+1)*4 + col
            elif j == 2: # left
                if col == 0:
                    next_state = i
                else:
                    next_state = row*4 + col-1
            elif j == 3: # right
                if col == 3:
                    next_state = i
                else:
                    next_state = row*4 + col+1
            P[i, j, next_state] = 1
            R[i, j, next_state] = reward(next_state)

# 定義折扣因子
gamma = 0.9

# 迭代更新狀態值函數和策略
for _ in range(1000):
    V_new = np.zeros(n_states)
    for s in range(n_states):
        Q = np.zeros(n_actions)
        for a in range(n_actions):
            Q[a] = P[s, a, :] @ (R[s, a, :] + gamma * V)
        V_new[s] = np.max(Q)
        policy[s] = np.argmax(Q)
    if np.max(np.abs(V_new - V)) < 1e-6:
        break
    V = V_new

# 輸出狀態值函數和策略
print(V.reshape((4, 4)))
print(policy.reshape((4, 4)))


[[5.90489088 6.56099088 7.28999088 6.56099088]
 [6.56099088 0.         8.09999088 0.        ]
 [7.28999088 8.09999088 8.99999088 0.        ]
 [0.         8.99999088 9.99999088 9.99999088]]
[[1. 3. 1. 2.]
 [1. 0. 1. 0.]
 [3. 1. 1. 0.]
 [0. 3. 3. 1.]]
