## 方法一

In [35]:
import numpy as np

In [36]:
#Ignore warning
import warnings
warnings.filterwarnings('ignore')

In [37]:
import gymnasium as gym

In [45]:
import numpy as np

# 定義冰湖遊戲的狀態空間和行動空間
n_states = 16
n_actions = 4
# S: start, F: frozen surface, H: hole, G: goal
desc = [
    'SFFF',
    'FHFH',
    'FFFH',
    'HFFG'
]

# 定義回報函數
def reward(state):
    if state == n_states - 1:
        return 1
    else:
        return 0

# 初始化狀態值函數
V = np.zeros(n_states)

# 定義轉移概率矩陣
P = np.zeros((n_states, n_actions, n_states))
for i in range(n_states):
    row, col = i // 4, i % 4
    for j in range(n_actions):
        if desc[row][col] == 'H':
            P[i, j, i] = 1
        else:
            if j == 0: # up
                if row == 0:
                    next_state = i
                else:
                    next_state = (row-1)*4 + col
            elif j == 1: # down
                if row == 3:
                    next_state = i
                else:
                    next_state = (row+1)*4 + col
            elif j == 2: # left
                if col == 0:
                    next_state = i
                else:
                    next_state = row*4 + col-1
            elif j == 3: # right
                if col == 3:
                    next_state = i
                else:
                    next_state = row*4 + col+1
            P[i, j, next_state] = 1

# 定義折扣因子
gamma = 0.9

# 迭代更新狀態值函數
for _ in range(1000):
    V_new = np.zeros(n_states)
    for s in range(n_states):
        for a in range(n_actions):
            V_new[s] += P[s, a, :] @ (reward(s) + gamma * V)
    if np.max(np.abs(V_new - V)) < 1e-6:
        break
    V = V_new

# 輸出狀態值函數
print(V.reshape((4, 4)))


[[nan nan nan nan]
 [nan nan nan nan]
 [nan nan nan nan]
 [nan nan nan nan]]


## 方法二

In [39]:
import gymnasium as gym
import numpy as np

# 設置遊戲環境
env = gym.make('FrozenLake-v1')
env = env.unwrapped

# 將 env.P[s] 轉換為列表
# for s in range(env.observation_space.n):
#     env.P[s] = {a: [p] for a, p in env.P[s].items()}

for s in range(env.observation_space.n):
    env.P[s] = [env.P[s][a] for a in range(env.action_space.n)]
    
# 狀態值函數初始化為0
V = np.zeros(env.observation_space.n)

# 狀態值函數更新函數
def update_V(env, V, s, gamma=1.0):
    v = 0
    # 計算從狀態s開始的所有可能的下一個狀態的期望值
    for a, p in enumerate(env.P[s]):
        for prob, next_state, reward, done in p:
            v += prob * (reward + gamma * V[next_state])
    return v

# 迭代更新狀態值函數
for i in range(1000):
    delta = 0
    for s in range(env.observation_space.n):
        v = V[s]
        # 更新狀態值函數
        V[s] = update_V(env, V, s)
        delta = max(delta, abs(v - V[s]))
    # 如果狀態值函數收斂，則停止更新
    if delta < 1e-8:
        break

# 打印最終的狀態值函數
print("Final State-Value Function:")
print(V.reshape((4, 4)))


Final State-Value Function:
[[inf inf inf inf]
 [inf  0. inf  0.]
 [inf inf inf  0.]
 [ 0. inf inf  0.]]


## 方法三 (改自方法一)

In [46]:
env = gym.make('FrozenLake-v1')

n_states = env.observation_space.n
n_actions = env.action_space.n

# 初始化狀態值函數，初始值都設為 0.0
V = np.zeros(n_states) + 0.0

# 迭代更新狀態值函數
for i in range(1000):
    # 計算新的狀態值函數
    new_V = np.zeros(n_states) + 0.0
    for s in range(n_states):
        for a in range(n_actions):
            transitions = env.P[s][a]
            for p, s_next, r, done in transitions:
                new_V[s] += p * (r + 0.99 * V[s_next])
    # 檢查狀態值函數是否收斂
    if np.max(np.abs(V - new_V)) < 1e-6:
        break
    # 更新狀態值函數
    V = new_V

# 輸出狀態值函數
print(V.reshape(4, 4))


[[inf inf inf inf]
 [inf  0. inf  0.]
 [inf inf inf  0.]
 [ 0. inf inf  0.]]
