## 计算序列回报

In [2]:
import numpy as np
np.random.seed(0)  # fix NumPy's global RNG seed so runs are reproducible
gamma = 0.5  # discount factor
rewards = [-1, -2, -2, 10, 1, 0]  # reward function: one reward per state
# State-transition probability matrix of the MRP: row i holds the
# probabilities of moving from the i-th state to each of the six states.
P = np.array(
    [
        [0.9, 0.1, 0.0, 0.0, 0.0, 0.0],
        [0.5, 0.0, 0.5, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.6, 0.0, 0.4],
        [0.0, 0.0, 0.0, 0.0, 0.3, 0.7],
        [0.0, 0.2, 0.3, 0.5, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    ]
)
# Given a state sequence, compute the return obtained from a given index
# (the start state) through the end of the sequence (the terminal state).

def compute_return(start_index, chain, gamma, reward_table=None):
    """Return the discounted return of ``chain`` starting at ``start_index``.

    The return is ``r_0 + gamma * r_1 + gamma**2 * r_2 + ...`` where ``r_k``
    is the reward of the k-th state visited from ``start_index`` onwards.

    Args:
        start_index: Index into ``chain`` of the first state to include.
        chain: Sequence of state numbers. States are numbered from 1, so
            state ``s`` is looked up at position ``s - 1`` of the reward table.
        gamma: Discount factor applied once per step.
        reward_table: Optional sequence of per-state rewards. When omitted,
            the module-level ``rewards`` is used, which keeps existing
            three-argument calls working unchanged.

    Returns:
        The discounted return; ``0`` when ``start_index`` is at or past the
        end of ``chain``.
    """
    if reward_table is None:
        # Backward-compatible default: fall back to the notebook's global table.
        reward_table = rewards
    G = 0
    # Walk backwards so each step is a single multiply-add:
    # G_t = r_t + gamma * G_{t+1}.
    for i in reversed(range(start_index, len(chain))):
        G = G * gamma + reward_table[chain[i] - 1]
    return G

# An example state sequence: 1 -> 2 -> 3 -> 6
chain = [1, 2, 3, 6]
start_index = 0
# Discounted return accumulated from the first state to the end of the chain.
G = compute_return(start_index=start_index, chain=chain, gamma=gamma)
print("回报: %s" % G)

回报: -2.5


## 价值函数计算

In [3]:
def compute_value_Bell(P, rewards, gamma, states_num):
    """
    Compute the analytic solution of the Bellman equation for an MRP.

    Solves the linear system ``(I - gamma * P) V = R`` for the state values
    ``V``, where ``I`` is the ``states_num`` x ``states_num`` identity.

    Args:
        P: State-transition probability matrix (array-like, square with
            ``states_num`` rows).
        rewards: Per-state rewards (array-like of length ``states_num``).
        gamma: Discount factor.
        states_num: Number of states.

    Returns:
        A ``(states_num, 1)`` column vector holding the value of each state.

    Raises:
        numpy.linalg.LinAlgError: If ``I - gamma * P`` is singular.
    """
    P = np.asarray(P)  # accept nested lists as well as ndarrays
    rewards = np.asarray(rewards).reshape((-1, 1))  # rewards as a column vector
    # Solve the system directly instead of forming the explicit inverse and
    # multiplying: same result, but cheaper and numerically more accurate.
    values = np.linalg.solve(np.eye(states_num) - gamma * P, rewards)
    return values

# Evaluate every state of the MRP with the closed-form Bellman solution.
V = compute_value_Bell(P, rewards, gamma, states_num=len(P))
print("MRP 中每个状态的价值：\n", V)

MRP 中每个状态的价值：
 [[-2.01950168]
 [-2.21451846]
 [ 1.16142785]
 [10.53809283]
 [ 3.58728554]
 [ 0.        ]]
