In [None]:
# pip install pymdptoolbox numpy
import numpy as np
import mdptoolbox

# ========== Parameters ==========
max_tries = 7         # number of attempts available
target_atk = 14       # attack total that counts as success
A_max = 30            # cap on the tracked attack value (keeps indices in range; a bit above the target)
gamma = 1.0           # no discounting: the objective is the probability of success
reward_success = 1.0  # reward collected when the target is reached

# Four actions, one tuple per action:
#   (name, success_prob, delta_on_success, fail_destroy_prob)
# A failure adds nothing to the attack value. When fail_destroy_prob > 0 a
# failure leads to GONE with that probability; otherwise the item stays at
# (t + 1, atk).
# NOTE(review): the action names and the earlier notes mention +5 and
# 10% / 60% / 30% / 70%, while the values used are +3 and
# 0.11 / 0.66 / 0.33 / 0.77 — confirm which set is intended.
ACTIONS = [
    ("A5", 0.11, 3, 0.0),  # succeeds 11%: +3; a failure never destroys
    ("A2", 0.66, 2, 0.0),  # succeeds 66%: +2; a failure never destroys
    ("B5", 0.33, 3, 0.5),  # succeeds 33%: +3; a failure destroys half the time
    ("B2", 0.77, 2, 0.5),  # succeeds 77%: +2; a failure destroys half the time
]
nA = len(ACTIONS)

# ========== State encoding ==========
# Regular states are (tries used, attack) pairs, numbered in row-major order.
states = [
    (tries_used, atk_value)
    for tries_used in range(max_tries + 1)
    for atk_value in range(A_max + 1)
]
id_map = {state: index for index, state in enumerate(states)}

# Three terminal states follow the regular ones: GOAL, GONE, DEAD.
S_goal, S_gone, S_dead = (len(states) + offset for offset in range(3))
id_map.update(GOAL=S_goal, GONE=S_gone, DEAD=S_dead)

nS = len(id_map)

def clamp_atk(a):
    """Cap an attack value at A_max; values at or below the cap pass through unchanged."""
    return A_max if A_max < a else a

# ========== Initialise P and R ==========
# One dense nS x nS matrix per action: P holds transition probabilities,
# R holds the reward attached to each (from, to) transition.
P = [np.zeros((nS, nS)) for _ in range(nA)]
R = [np.zeros_like(transition_matrix) for transition_matrix in P]

def add_transition(a_idx, s_from, s_to, prob, reward):
    """Record one branch of a state transition in the global P and R.

    Parameters
    ----------
    a_idx : int
        Action index; selects P[a_idx] and R[a_idx].
    s_from, s_to : int
        Source and destination state indices.
    prob : float
        Probability of this branch. Values <= 1e-15 are treated as
        floating-point noise and ignored.
    reward : float
        Reward earned when this branch is taken.

    Probabilities of branches that share the same (s_from, s_to) cell are
    summed. The reward stored for the cell is the probability-weighted mean
    of the branch rewards, so P * R for the cell equals the expected reward
    contributed through it even when several branches are merged.
    """
    if prob <= 1e-15:  # drop floating-point noise
        return

    p_mat, r_mat = P[a_idx], R[a_idx]
    prior_prob = p_mat[s_from, s_to]

    if prior_prob == 0.0:
        # First branch into this cell: store the reward as given.
        if reward != 0.0:
            r_mat[s_from, s_to] = reward
    else:
        # Another branch already landed here: blend the rewards by probability
        # instead of letting the latest branch overwrite the earlier one.
        r_mat[s_from, s_to] = (
            prior_prob * r_mat[s_from, s_to] + prob * reward
        ) / (prior_prob + prob)

    p_mat[s_from, s_to] = prior_prob + prob

# Fill in the outgoing transitions of every regular (tries used, attack) state.
for t in range(max_tries + 1):
    for atk in range(A_max + 1):
        s = id_map[(t, atk)]

        # Target already met: every action moves to GOAL. The success reward
        # is paid on this transition too, so the value of such a state is 1
        # rather than 0. These states are never entered from a below-target
        # state (a success that reaches the target goes straight to GOAL), so
        # values of the other states are unaffected.
        if atk >= target_atk:
            for a_idx in range(nA):
                add_transition(a_idx, s, S_goal, 1.0, reward_success)
            continue

        # Out of tries without reaching the target: every action moves to DEAD.
        if t >= max_tries:
            for a_idx in range(nA):
                add_transition(a_idx, s, S_dead, 1.0, 0.0)
            continue

        t_next = t + 1

        # Build the branches of each action.
        for a_idx, (name, p_succ, delta, fail_destroy_prob) in enumerate(ACTIONS):
            p_fail = 1.0 - p_succ

            # Success: attack grows by delta. Reaching the target goes to GOAL
            # with the success reward, otherwise to (t + 1, atk + delta).
            atk_succ = clamp_atk(atk + delta)
            if atk_succ >= target_atk:
                add_transition(a_idx, s, S_goal, p_succ, reward_success)
            else:
                s_next = id_map[(t_next, atk_succ)]
                add_transition(a_idx, s, s_next, p_succ, 0.0)

            # Failure: attack is unchanged.
            if p_fail > 0:
                # Part of the failure mass goes straight to GONE.
                if fail_destroy_prob > 0:
                    add_transition(a_idx, s, S_gone, p_fail * fail_destroy_prob, 0.0)
                    p_fail_keep = p_fail * (1.0 - fail_destroy_prob)
                else:
                    p_fail_keep = p_fail

                # Failed but not destroyed: same attack, one more try used.
                # atk comes from range(A_max + 1), so it needs no clamping.
                if p_fail_keep > 0:
                    s_fail_keep = id_map[(t_next, atk)]
                    add_transition(a_idx, s, s_fail_keep, p_fail_keep, 0.0)

# Terminal states are absorbing: under every action GOAL, GONE and DEAD loop
# back onto themselves with probability 1. Their rewards stay 0 — the success
# reward is paid on the transition into GOAL, not while remaining there.
for a_idx in range(nA):
    for terminal in (S_goal, S_gone, S_dead):
        P[a_idx][terminal, terminal] = 1.0

In [4]:
# ========== Solve with value iteration ==========
# With discount 1.0 the solver compares the change between sweeps directly
# against `epsilon`; the default of 0.01 is coarse for values that are
# probabilities and could stop the sweeps before they have propagated through
# all tries. Every episode ends within max_tries + 1 steps, so the values stop
# changing after a handful of sweeps and a tight threshold costs nothing.
vi = mdptoolbox.mdp.ValueIteration(P, R, discount=gamma, epsilon=1e-9)
vi.run()

V = vi.V            # maximum success probability of each state
policy = vi.policy  # optimal action per state: 0=A5, 1=A2, 2=B5, 3=B2

# Result for the initial state (no tries used, attack 0).
s0 = id_map[(0, 0)]
print("Initial best action (0=A5,1=A2,2=B5,3=B2):", policy[s0])
print("Initial success probability:", V[s0])

# Look up the optimal action at a given intermediate state.
def best_action_at(t, atk):
    """Return the solver's optimal action index for the state (t, atk)."""
    state_index = id_map[(t, atk)]
    return policy[state_index]

# A few (tries used, attack) states whose optimal action is printed below.
probes = [(0, 0), (1, 0), (2, 2), (3, 4), (4, 6), (5, 10), (6, 12)]
for probe in probes:
    t_probe, atk_probe = probe
    label = f"(t={t_probe}, atk={atk_probe}) best action:"
    print(label, best_action_at(*probe))

Initial best action (0=A5,1=A2,2=B5,3=B2): 1
Initial success probability: 0.19977434999999996
(t=0, atk=0) best action: 1
(t=1, atk=0) best action: 2
(t=2, atk=2) best action: 2
(t=3, atk=4) best action: 2
(t=4, atk=6) best action: 2
(t=5, atk=10) best action: 1
(t=6, atk=12) best action: 3


In [8]:
# Scratch check: 2 to the 7th power (presumably the number of
# success/failure sequences over 7 tries — confirm).
2**7

128

In [None]:
# Draft of a per-slot state listing: each of the num_slots slots gets its own
# list of the values 3, 2, 0 and -1 (-1 presumably marks a destroyed item —
# confirm).
# NOTE(review): this rebinds `states`, which the first cell uses for the list
# of (t, atk) tuples.
possible_scrolls_results = [3, 2, 0]
num_slots = 3
states = [[3, 2, 0, -1] for _ in range(num_slots)]



## Example

In [13]:
# NOTE(review): this only works on the array-valued P returned by
# mdptoolbox.example.forest(); the P built in the first cell is a plain list
# and has no .shape. The cell depends on the forest cell having run first.
P.shape

(2, 3, 3)

In [11]:
import mdptoolbox, mdptoolbox.example

# Finite-horizon demo on the toolbox's built-in forest example:
# discount 0.9, horizon of 3 stages, verbose output enabled.
# NOTE(review): this rebinds P and R, replacing the matrices built in the
# first cell.
P, R = mdptoolbox.example.forest()
fh = mdptoolbox.mdp.FiniteHorizon(transitions=P, reward=R, discount=0.9, N=3)
fh.setVerbose()
fh.run()

stage: 2, policy: [0, 1, 0]
stage: 1, policy: [0, 0, 0]
stage: 0, policy: [0, 0, 0]


In [6]:
# Optimal actions from the finite-horizon run: one row per state, one column
# per stage (column k matches the "stage: k" line printed by fh.run()).
fh.policy

array([[0, 0, 0],
       [0, 0, 1],
       [0, 0, 0]])

In [7]:
# Transition array of the forest example; the (2, 3, 3) layout is presumably
# [action, state, next_state] — confirm against the toolbox documentation.
P

array([[[0.1, 0.9, 0. ],
        [0.1, 0. , 0.9],
        [0.1, 0. , 0.9]],

       [[1. , 0. , 0. ],
        [1. , 0. , 0. ],
        [1. , 0. , 0. ]]])

In [None]:
# Problem dimensions (draft).
num_action = 4
num_steps = 7

# TODO: the state count was left unfinished (`num_state = ` did not parse).
# Placeholder until the state encoding for this formulation is settled.
num_state = None


In [3]:
# Reward array of the forest example; the displayed 3 x 2 layout is presumably
# [state, action] — confirm against the toolbox documentation.
R

array([[0., 0.],
       [0., 1.],
       [4., 2.]])

In [4]:
# Value function from the finite-horizon run: one row per state and one column
# per stage, plus a final all-zero column (presumably the terminal values —
# confirm).
fh.V

array([[2.6973, 0.81  , 0.    , 0.    ],
       [5.9373, 3.24  , 1.    , 0.    ],
       [9.9373, 7.24  , 4.    , 0.    ]])