In [None]:
import numpy as np
import pandas as pd

# ---------------------------------------------------------------------------
# Model and learning parameters
# ---------------------------------------------------------------------------
W_0 = 100        # initial wealth
r_f = 0.03       # per-step risk-free rate (3%)
R_u = 0.10       # risky-asset return in the up state (+10%)
R_d = -0.05      # risky-asset return in the down state (-5%)
P_u = 0.6        # probability of the up state
P_d = 0.4        # probability of the down state
T = 10           # number of decision steps per episode
gamma = 0.95     # discount factor
alpha = 0.05     # learning rate
aversion_rate = 0.04           # CARA risk-aversion coefficient
n_actions = 11                 # risky-asset share choices: 0%, 10%, ..., 100%
n_episodes = 1_000_000         # number of training episodes
convergence_threshold = 0.01   # max |Q difference| still reported as converged

# ---------------------------------------------------------------------------
# Wealth discretisation
# ---------------------------------------------------------------------------
# Fully invested, ten straight up-moves give 100 * 1.1**10 ≈ 259.4 and ten
# straight down-moves give 100 * 0.95**10 ≈ 59.9, so a grid spanning
# [40, 280] contains every wealth level reachable within T = 10 steps.
W_max = 280
n_W = 300
W_grid = np.linspace(40, W_max, n_W)

# ---------------------------------------------------------------------------
# Q tables, indexed as [time step, wealth-grid index, action index]
# ---------------------------------------------------------------------------
Q = np.zeros((T + 1, n_W, n_actions))
# Snapshots of Q late in training; they are compared at t=0, W=100 afterwards
# to judge whether the Q function has converged.
Q_990000 = np.zeros_like(Q)
Q_1000000 = np.zeros_like(Q)

# CARA (constant absolute risk aversion) utility function
def cara_utility(W, aversion_rate):
    """Return the CARA utility ``-exp(-aversion_rate * W) / aversion_rate``.

    In this script the value is used as the reward, which is paid only at
    the final step of an episode.

    Args:
        W: wealth level.
        aversion_rate: risk-aversion coefficient; must be non-zero because
            it is used as a divisor.
    """
    exponent = -aversion_rate * W
    return -np.exp(exponent) / aversion_rate
# Map a continuous wealth level onto the wealth grid
def get_W_idx(W):
    """Return the ``W_grid`` index used to represent wealth ``W``.

    ``np.searchsorted`` yields the position of the first grid point that is
    greater than or equal to ``W`` (i.e. wealth is rounded *up* to the grid).
    Values above the largest grid point are clamped to the last index, and
    values at or below the smallest grid point map to index 0.
    """
    last_idx = n_W - 1
    pos = np.searchsorted(W_grid, W)
    return last_idx if last_idx < pos else pos

# Simulate one step of the wealth dynamics
def next_wealth(W, a):
    """Sample next-period wealth for current wealth ``W`` and risky share ``a``.

    The risky asset returns ``R_u`` with probability ``P_u`` and ``R_d``
    otherwise; the remaining ``1 - a`` of wealth earns the risk-free rate
    ``r_f``. The result is floored at ``1e-6``.
    """
    # Exactly one uniform draw decides the market state.
    R = R_u if np.random.rand() < P_u else R_d
    riskless_part = (1 - a) * W * (1 + r_f)
    risky_part = a * W * (1 + R)
    return max(riskless_part + risky_part, 1e-6)

# Q-learning training (tabular, epsilon-greedy behaviour policy).
#
# Every episode starts at wealth W_0 and runs for T steps. The only reward is
# the CARA utility of terminal wealth, paid on the last step; earlier steps
# bootstrap from the discounted best Q value of the next state.
epsilon = 0.1            # probability of picking a uniformly random action
snapshot_gap = 10_000    # episodes between the early and the final snapshot
# 0-based loop indices at which the snapshots are taken. They are derived from
# n_episodes so that changing the episode count cannot silently leave the
# snapshot tables all-zero (for n_episodes = 1_000_000: 989999 and 999999).
early_snapshot_episode = max(n_episodes - snapshot_gap - 1, 0)
final_episode = n_episodes - 1
last_step = T - 1
max_a_idx = n_actions - 1    # action index that corresponds to a 100% risky share

for episode in range(n_episodes):
    W = W_0
    for t in range(T):
        W_idx = get_W_idx(W)  # discretise current wealth onto the grid

        # Epsilon-greedy action selection.
        if np.random.rand() < epsilon:
            a_idx = np.random.randint(0, n_actions)
        else:
            a_idx = np.argmax(Q[t, W_idx, :])
        a = a_idx / max_a_idx  # action index -> fraction of wealth in the risky asset

        W_next = next_wealth(W, a)

        if t == last_step:
            # Terminal step: the target is the realised utility, no bootstrap.
            target = cara_utility(W_next, aversion_rate)
        else:
            # Intermediate step: zero reward, bootstrap from the next state.
            target = gamma * np.max(Q[t + 1, get_W_idx(W_next), :])

        Q[t, W_idx, a_idx] += alpha * (target - Q[t, W_idx, a_idx])
        W = W_next

    # Copy the table contents into the snapshot arrays for the convergence check.
    if episode == early_snapshot_episode:
        Q_990000[:] = Q
    if episode == final_episode:
        Q_1000000[:] = Q

# Convergence check at t=0, W ≈ 100: compare the Q snapshot taken after
# 990,000 episodes with the one taken after the final episode, action by action.
W_idx_100 = get_W_idx(100)
print(f"\nt=0, W ≈ {W_grid[W_idx_100]:.2f} 的 Q 值收敛性分析:")
for a_idx in range(n_actions):
    a = a_idx / 10.0
    q_early = Q_990000[0, W_idx_100, a_idx]
    q_final = Q_1000000[0, W_idx_100, a_idx]
    diff = abs(q_final - q_early)
    status = "收敛良好" if diff < convergence_threshold else "未充分收敛"
    # The label now reads Q_990000 (it previously said Q_900000) so that it
    # matches the snapshot that is actually being printed.
    print(f"动作 a={a:.1f}: Q_990000 = {q_early:.4f}, Q_1000000 = {q_final:.4f}, 差值 = {diff:.4f}, {status}")

# Print the final Q value of every action at t=0, W ≈ 100
print(f"\nt=0, W ≈ {W_grid[W_idx_100]:.2f} 的最终 Q 值:")
for a_idx, q_value in enumerate(Q[0, W_idx_100]):
    a = a_idx / 10.0
    print(f"动作 a={a:.1f}: Q = {q_value:.4f}")

# For every time step, print a small sample of the final Q table: grid points
# with wealth in [59, 259] are listed until the first one above 61 has been
# printed, then the remainder is skipped.
for t in range(T):
    print(f"\nt={t}, 财富 W 在 59 到 259 之间的最终 Q 值（部分）:")
    for w_idx, W in enumerate(W_grid):
        if not (59 <= W <= 259):
            continue
        print(f"\n财富 W={W:.2f}:")
        for a_idx, q_value in enumerate(Q[t, w_idx]):
            a = a_idx / 10.0
            print(f"  动作 a={a:.1f}: Q = {q_value:.4f}")
        if W > 61:
            print("（仅展示部分 W 值，后续省略）")
            break

# Export the full Q table (t = 0 .. T-1) to a CSV file in long format:
# one row per (time step, wealth grid point, action).
data = [
    [t, W_grid[w_idx], a_idx / 10.0, Q[t, w_idx, a_idx]]
    for t in range(T)
    for w_idx in range(n_W)
    for a_idx in range(n_actions)
]

# Build the DataFrame and write it out.
# NOTE(review): the destination is an absolute, machine-specific Windows
# path; the write fails if that directory does not exist.
df = pd.DataFrame(data, columns=["Time", "Wealth", "Action", "Q_Value"])
df.to_csv("C:/Users/ASUS/Desktop/Q_final_all_t.csv", index=False)
print("\n已将完整的 Q 表保存到 'Q_final_all_t.csv' 文件中")


t=0, W ≈ 100.20 的 Q 值收敛性分析:
动作 a=0.0: Q_900000 = -0.0702, Q_1000000 = -0.0705, 差值 = 0.0003, 收敛良好
动作 a=0.1: Q_900000 = -0.0706, Q_1000000 = -0.0703, 差值 = 0.0003, 收敛良好
动作 a=0.2: Q_900000 = -0.0699, Q_1000000 = -0.0706, 差值 = 0.0007, 收敛良好
动作 a=0.3: Q_900000 = -0.0703, Q_1000000 = -0.0691, 差值 = 0.0012, 收敛良好
动作 a=0.4: Q_900000 = -0.0706, Q_1000000 = -0.0709, 差值 = 0.0003, 收敛良好
动作 a=0.5: Q_900000 = -0.0722, Q_1000000 = -0.0732, 差值 = 0.0010, 收敛良好
动作 a=0.6: Q_900000 = -0.0729, Q_1000000 = -0.0706, 差值 = 0.0023, 收敛良好
动作 a=0.7: Q_900000 = -0.0702, Q_1000000 = -0.0721, 差值 = 0.0019, 收敛良好
动作 a=0.8: Q_900000 = -0.0706, Q_1000000 = -0.0731, 差值 = 0.0025, 收敛良好
动作 a=0.9: Q_900000 = -0.0687, Q_1000000 = -0.0730, 差值 = 0.0043, 收敛良好
动作 a=1.0: Q_900000 = -0.0707, Q_1000000 = -0.0711, 差值 = 0.0004, 收敛良好

t=0, W ≈ 100.20 的最终 Q 值:
动作 a=0.0: Q = -0.0705
动作 a=0.1: Q = -0.0703
动作 a=0.2: Q = -0.0706
动作 a=0.3: Q = -0.0691
动作 a=0.4: Q = -0.0709
动作 a=0.5: Q = -0.0732
动作 a=0.6: Q = -0.0706
动作 a=0.7: Q = -0.0721
动作 a=0.8: 