In [None]:
from collections import defaultdict
from dataclasses import dataclass

# --- 你的动作数据结构 ---
@dataclass
class Scroll:
    name: str
    success_p: float
    atk_value: int
    destroy_on_fail_p: float  # 失败时爆炸概率

SCROLL_A = Scroll("A_10p_+3", success_p=0.11, atk_value=3, destroy_on_fail_p=0.0)
SCROLL_B = Scroll("B_60p_+2", success_p=0.66, atk_value=2, destroy_on_fail_p=0.0)
SCROLL_C = Scroll("C_30p_+3_boom50", success_p=0.33, atk_value=3, destroy_on_fail_p=0.50)
SCROLL_D = Scroll("D_70p_+2_boom50", success_p=0.77, atk_value=2, destroy_on_fail_p=0.50)
SCROLL_SET = {"A": SCROLL_A, "B": SCROLL_B, "C": SCROLL_C, "D": SCROLL_D}

# --- 奖励设计（示例，可按你的业务修改） ---
# 每次卷轴成本
GROVE_SCROLL_PRICE = {
    "A": 33, 
    "B": 597, 
    "C": 14959, 
    "D": 2325
}
# 攻击力的价值
mxb_to_rmb = 56 # 56W mxb = 1rmb
GROVE_ATTACK_PRICE = {
    "10": 1500,
    "11": 2500,
    "12": 5000,
    "13": 14000,
    "14": 41000,
    "15": 2200*mxb_to_rmb,
    "16": 6000*mxb_to_rmb,
    "17": 12500*mxb_to_rmb
}
# CONSTANT
MAX_ATK = 7 * 3
TERMINAL_BROKEN = -1      # boom state value

# --- 状态空间：你可以枚举合理上限（例如 0..MAX_ATK）加上 -1 终止状态 ---


def is_terminal_state(atk: int, t: int=1) -> bool:
    # t == 0 表示没次数了；atk == -1 表示爆炸
    return atk == TERMINAL_BROKEN or t == 0

def reward_function(s: int, a: str, s_next: int) -> float:
    # s: current attack value
    # a: which scroll to use. "A", "B", "C", "D"
    # s_next: next possible attack value
    # scroll: scroll set
    
    # 基础：动作成本为负
    r = -GROVE_SCROLL_PRICE.get(a)
    # 成功增益：以“状态变化带来的攻击力提升”计价
    if s_next >= 10 and s_next > s: # only larger than 10 attack worth a price
        print(s_next)
        r += GROVE_ATTACK_PRICE.get(str(s_next)) - GROVE_ATTACK_PRICE.get(str(s), 0)
    # 爆炸惩罚
    if s_next == TERMINAL_BROKEN:
        r -= GROVE_ATTACK_PRICE.get(str(s), 0)
    return r

# --- 从动作定义构造 MDP 的转移分布 P(s'|s,a) 与期望奖励 R(s,a) ---
def build_transition_and_reward(max_atk: int=MAX_ATK):
    S = set(range(0, max_atk + 1))
    S.add(TERMINAL_BROKEN)  # 爆炸终止

    # A[s] contains all possible actions on state s
    A = defaultdict(dict)
    # P[s][a] 是 dict: s' -> prob
    P = defaultdict(lambda: defaultdict(dict))
    # R[s][a] 期望一步奖励（按 s' 加权后的期望）
    R = defaultdict(lambda: defaultdict(dict))

    for s in S:

        # handle last state
        if is_terminal_state(s):
            continue

        for a, scroll in SCROLL_SET.items():
            succ = scroll.success_p
            boom_on_fail = scroll.destroy_on_fail_p
            fail = 1.0 - succ

            # three possible s_prime
            s_succ = s + scroll.atk_value
            s_fail = s
            s_boom = TERMINAL_BROKEN
            if s_succ > max_atk: # impossible state
                continue
            trans = {
                s_succ: succ,                    # 成功
                s_fail: fail * (1-boom_on_fail), # 失败-未爆
                s_boom: fail * boom_on_fail      # 失败-爆炸
            }
            P[s][a] = trans

            # expected reward
            exp_r = 0.0
            for s_prime, prob in trans.items():
                exp_r += prob * reward_function(s, a, s_prime)
            R[s][a] = exp_r

        # all possible actions
        A[s] = SCROLL_SET

    return S, A, P, R

In [7]:
S, A, P, R = build_transition_and_reward(MAX_ATK)

TypeError: unsupported operand type(s) for -: 'NoneType' and 'int'