In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class SpeculationVsLaborEnv(gym.Env):
    """Daily life simulation: each day the agent works, speculates, or rests.

    One environment step is one day.

    Actions (``Discrete(3)``):
        0 = labor      -> wealth += labor_income, health -= labor_health_cost
        1 = speculate  -> with probability p_win wealth += speculative_win_return,
                          otherwise wealth -= speculative_loss;
                          health -= invest_health_cost
        2 = rest       -> health += rest_health_gain (capped at 100)

    Observation (``Box``, float32): ``[wealth, health, age, debt, savings]``.

    After the action, positive wealth accrues interest at ``r_save`` and
    negative wealth (debt) accrues interest at ``r_debt``.

    The episode terminates when ``age >= max_age``, ``health <= 0`` or
    ``wealth < -1e6``.

    Parameters
    ----------
    max_age : int
        Maximum number of steps (days) in an episode.
    labor_income : float
        Wealth gained by one day of labor.
    labor_health_cost : float
        Health lost by one day of labor.
    p_win : float
        Probability that a speculation succeeds.
    speculative_win_return : float
        Wealth gained on a successful speculation.
    speculative_loss : float
        Wealth lost on a failed speculation.
    invest_health_cost : float
        Health lost by one day of speculation.
    rest_health_gain : float
        Health regained by one day of rest.
    r_save : float
        Daily interest rate applied to non-negative wealth.
    r_debt : float
        Daily interest rate applied to negative wealth.
    """

    def __init__(self,
                 max_age=365*80,
                 labor_income=100,
                 labor_health_cost=2,
                 p_win=0.4,
                 speculative_win_return=500,
                 speculative_loss=400,
                 invest_health_cost=1,
                 rest_health_gain=3,
                 r_save=0.0001,   # daily saving interest
                 r_debt=0.0005):  # daily debt interest
        super().__init__()

        # --- Parameters ---
        self.max_age = max_age
        self.labor_income = labor_income
        self.labor_health_cost = labor_health_cost
        self.p_win = p_win
        self.speculative_win_return = speculative_win_return
        self.speculative_loss = speculative_loss
        self.invest_health_cost = invest_health_cost
        self.rest_health_gain = rest_health_gain
        self.r_save = r_save
        self.r_debt = r_debt

        # --- Action space: 0 = Labor, 1 = Speculate, 2 = Rest ---
        self.action_space = spaces.Discrete(3)

        # --- Observation space: [wealth, health, age, debt, savings] ---
        high = np.array([1e9, 100, max_age, 1e9, 1e9], dtype=np.float32)
        low  = np.array([-1e9, 0, 0, 0, 0], dtype=np.float32)
        self.observation_space = spaces.Box(low, high, dtype=np.float32)

        # Initialize state so every attribute exists before the first real reset.
        self.reset()

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Returns
        -------
        tuple
            ``(observation, info)`` as required by the Gymnasium API; ``info``
            is an empty dict.
        """
        # Seeds ``self.np_random`` when a seed is supplied.
        super().reset(seed=seed)
        self.wealth = 0.0
        self.health = 100.0
        self.age = 0
        self.debt = 0.0
        self.savings = 0.0

        # --- Reset action counters ---
        self.work_count = 0
        self.speculate_count = 0
        self.rest_count = 0

        return self._get_obs(), {}

    def _get_obs(self):
        """Return the current state as a float32 vector [wealth, health, age, debt, savings]."""
        return np.array(
            [self.wealth, self.health, self.age, self.debt, self.savings],
            dtype=np.float32,
        )

    def step(self, action):
        """Advance the simulation by one day.

        Returns
        -------
        tuple
            ``(observation, reward, terminated, truncated, info)``.
            ``reward`` is the wealth level after interest; ``truncated`` is
            always ``False``. ``info`` always carries the action counters and,
            on the terminating step, an ``"episode"`` summary dict.
        """
        # --- Take action ---
        if action == 0:  # Labor
            self.wealth += self.labor_income
            # Clamp at 0 so the observation stays inside the declared Box.
            self.health = max(0.0, self.health - self.labor_health_cost)
            self.work_count += 1

        elif action == 1:  # Speculate
            # Use the environment's own generator so reset(seed=...) makes
            # the outcome sequence reproducible.
            if self.np_random.random() < self.p_win:
                self.wealth += self.speculative_win_return
            else:
                self.wealth -= self.speculative_loss
            self.health = max(0.0, self.health - self.invest_health_cost)
            self.speculate_count += 1

        elif action == 2:  # Rest
            self.health = min(100, self.health + self.rest_health_gain)
            self.rest_count += 1

        # --- Daily financial updates ---
        # Exactly one of savings/debt is non-zero; the other is cleared so the
        # observation never reports a stale balance after wealth changes sign.
        if self.wealth >= 0:
            self.savings = self.wealth * (1 + self.r_save)
            self.debt = 0.0
            self.wealth = self.savings
        else:
            self.debt = -self.wealth * (1 + self.r_debt)
            self.savings = 0.0
            self.wealth = -self.debt

        # --- Advance time ---
        self.age += 1

        # --- Reward: current wealth level (not the per-step change) ---
        # NOTE(review): a level-based reward grows with accumulated wealth, so
        # returns can become very large; consider a delta or a normalised reward.
        reward = self.wealth

        # --- Termination conditions ---
        terminated = bool(
            self.age >= self.max_age
            or self.health <= 0
            or self.wealth < -1e6
        )

        obs = self._get_obs()

        # --- Info dictionary with counters ---
        info = {
            "work_count": self.work_count,
            "speculate_count": self.speculate_count,
            "rest_count": self.rest_count
        }

        # --- Episode stats (only when terminated) ---
        # NOTE(review): Stable-Baselines3 Monitor/VecMonitor wrappers write
        # their own dict to info["episode"], replacing this one.
        if terminated:
            info["episode"] = {
                "r": self.wealth,                    # final wealth
                "l": self.age,                       # days survived
                "days_worked": self.work_count,
                "days_speculated": self.speculate_count,
                "days_rested": self.rest_count,
                "final_age": self.age / 365,
                "final_wealth": self.wealth
            }

        return obs, reward, terminated, False, info



In [14]:
from stable_baselines3.common.callbacks import BaseCallback

class EpisodeLoggerCallback(BaseCallback):
    """Record one summary dict per finished episode in ``self.episode_logs``.

    An episode is considered finished when a step's info dict contains an
    ``"episode"`` entry. That entry may come from the environment itself or
    from a Monitor/VecMonitor wrapper; the wrapper version only carries
    ``r``, ``l`` and ``t``, so the environment-specific fields fall back to
    other data available in the same info dict instead of silently
    becoming 0.

    Parameters
    ----------
    verbose : int
        When greater than 0, print a one-line summary for every episode.
    """

    # Used to convert an episode length in days into years.
    _DAYS_PER_YEAR = 365

    def __init__(self, verbose=1):
        super().__init__(verbose)
        self.episode_logs = []   # one record (dict) per finished episode

    @staticmethod
    def _counter(ep, info, ep_key, info_key):
        """Return a per-episode counter from the first place it is available."""
        if ep_key in ep:
            return ep[ep_key]
        if info_key in ep:  # e.g. forwarded via VecMonitor(info_keywords=...)
            return ep[info_key]
        return info.get(info_key, 0)

    def _on_step(self) -> bool:
        """Inspect this step's info dicts and log any episode that ended.

        Always returns True so training is never interrupted.
        """
        # ``infos`` is a list of dicts, one per parallel environment.
        for info in self.locals.get("infos", []):
            ep = info.get("episode")
            if ep is None:
                continue

            episode_length = ep.get("l", 0)

            # Final age in years: taken directly if present, otherwise derived
            # from the episode length (one step == one day).
            if "final_age" in ep:
                final_age = ep["final_age"]
            else:
                final_age = episode_length / self._DAYS_PER_YEAR

            # Final wealth: taken directly if present, otherwise read from the
            # last observation of the episode.
            # Assumes observation index 0 is wealth, as in
            # SpeculationVsLaborEnv._get_obs.
            if "final_wealth" in ep:
                final_wealth = ep["final_wealth"]
            else:
                terminal_obs = info.get("terminal_observation")
                final_wealth = float(terminal_obs[0]) if terminal_obs is not None else 0

            record = {
                "r": ep.get("r", 0),                 # episode return as reported in info["episode"]
                "l": episode_length,                 # episode length in steps (days)
                "t": self.num_timesteps,             # global training step count
                "days_worked": self._counter(ep, info, "days_worked", "work_count"),
                "days_speculated": self._counter(ep, info, "days_speculated", "speculate_count"),
                "days_rested": self._counter(ep, info, "days_rested", "rest_count"),
                "final_age": final_age,
                "final_wealth": final_wealth
            }
            self.episode_logs.append(record)

            if self.verbose > 0:
                print(f"[EpisodeLogger] steps={record['l']} "
                      f"return={record['r']:.2f} "
                      f"worked={record['days_worked']} "
                      f"speculated={record['days_speculated']} "
                      f"rested={record['days_rested']} "
                      f"age={record['final_age']:.2f} "
                      f"wealth={record['final_wealth']:.2f} "
                      f"total_steps={record['t']}")
        return True


In [15]:
import numpy as np
import pandas as pd
import gymnasium as gym
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from sb3_contrib import RecurrentPPO

# === 1) Environment factory (one step per day; max_age is expressed in days) ===
def make_env():
    """Build a SpeculationVsLaborEnv with the day-based settings used in this notebook."""
    env_settings = {
        "max_age": 80*365,               # lifespan cap: 80 years, in days
        "labor_income": 100,             # wealth earned per day of labor
        "labor_health_cost": 2,          # health lost per day of labor
        "p_win": 0.4,                    # probability a speculation succeeds
        "speculative_win_return": 500,   # gain on a successful speculation
        "speculative_loss": 400,         # loss on a failed speculation
        "invest_health_cost": 1,         # health lost per day of speculation
        "rest_health_gain": 3,           # health regained per day of rest
        "r_save": 0.0001,                # daily interest rate on savings
        "r_debt": 0.0005,                # daily interest rate on debt
    }
    return SpeculationVsLaborEnv(**env_settings)

# Vectorised env plus monitoring (RecurrentPPO is given a VecEnv; VecMonitor
# records per-episode statistics in memory because filename is None).
env = VecMonitor(DummyVecEnv([make_env]), filename=None)

# === 2) Configure and create RecurrentPPO (LSTM policy) ===
policy_kwargs = {
    "lstm_hidden_size": 128,   # LSTM hidden units (64 would use fewer resources)
    "n_lstm_layers": 1,        # number of LSTM layers
    "shared_lstm": False,      # whether actor and critic share one LSTM
}

ppo_settings = {
    "policy": "MlpLstmPolicy",
    "env": env,
    "verbose": 1,
    # The original notes recommend CPU for a small MLP/LSTM model; this run
    # requests the GPU.
    "device": "cuda",
    "n_steps": 1024,           # rollout length per update
    "batch_size": 256,         # minibatch size
    "learning_rate": 3e-4,
    "gamma": 0.999,            # per-day discount, close to 1 for long horizons
    "gae_lambda": 0.95,
    "ent_coef": 0.0,
    "clip_range": 0.2,
    "policy_kwargs": policy_kwargs,
    "seed": 42,
}
model = RecurrentPPO(**ppo_settings)

# === 3) Train ===
# The step budget is expressed as "simulated years x nominal episodes".
# Real episodes may end much earlier than steps_per_episode.
years_to_simulate = 40
episodes = 3
steps_per_episode = 365 * years_to_simulate
total_timesteps = episodes * steps_per_episode

print(f"开始训练，总步数：{total_timesteps}")

episode_logger = EpisodeLoggerCallback(verbose=1)
model.learn(total_timesteps=total_timesteps, callback=episode_logger)

print("训练完成 ✅")
print("一共记录了", len(episode_logger.episode_logs), "个 episode")

# Build the episode table once, preview it, then persist it.
df = pd.DataFrame(episode_logger.episode_logs)
print(df.head())

# Output path for the per-episode log; index=False leaves the row index out.
csv_path = "episode_logs.csv"
df.to_csv(csv_path, index=False)

print(f"已保存到 CSV 文件：{csv_path}")


Using cuda device
开始训练，总步数：43800
-----------------------------
| time/              |      |
|    fps             | 1196 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 1024 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 705           |
|    iterations           | 2             |
|    time_elapsed         | 2             |
|    total_timesteps      | 2048          |
| train/                  |               |
|    approx_kl            | 1.1234079e-08 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.1          |
|    explained_variance   | -5.96e-07     |
|    learning_rate        | 0.0003        |
|    loss                 | 4.1e+10       |
|    n_updates            | 10            |
|    policy_gradient_loss | 1.02e-07      |
|    value_loss           | 1.59e+11      |
---------

In [5]:
# === 4) Evaluate the trained policy and save the results (LSTM state is carried across steps) ===
results = []
test_env = make_env()

num_eval_episodes = 200  # number of simulated lifetimes to evaluate

for ep in range(num_eval_episodes):
    obs, _ = test_env.reset()

    # RecurrentPPO needs the LSTM state and an episode-start flag on every call.
    lstm_state = None
    episode_start = True

    total_reward = 0.0
    action_tally = {"Work_Count": 0, "Invest_Count": 0, "Rest_Count": 0}

    while True:
        action, lstm_state = model.predict(
            obs,
            state=lstm_state,
            episode_start=np.array([episode_start]),
            deterministic=True,
        )
        episode_start = False  # only the first step after reset is flagged

        # 0 -> work, 1 -> invest, anything else is counted as rest.
        action_id = np.asarray(action).item()
        if action_id == 0:
            tally_key = "Work_Count"
        elif action_id == 1:
            tally_key = "Invest_Count"
        else:
            tally_key = "Rest_Count"
        action_tally[tally_key] += 1

        obs, reward, terminated, truncated, info = test_env.step(action)
        total_reward += float(reward)

        # Gymnasium convention: the episode ends on either flag.
        if terminated or truncated:
            break

    # Unpack the final observation: [wealth, health, age, debt, savings].
    final_wealth, final_health, final_age_days, final_debt, final_savings = obs.astype(float)

    episode_row = {
        "Episode": ep,
        "Final_Wealth": final_wealth,
        "Final_Health": final_health,
        "Final_Age_years": final_age_days / 365.0,  # days -> years
        "Debt": final_debt,
        "Savings": final_savings,
        "Total_Reward": total_reward,
    }
    episode_row.update(action_tally)
    results.append(episode_row)

df = pd.DataFrame(results)
df.to_csv("svl_lstm_eval.csv", index=False)
print("已保存到 svl_lstm_eval.csv")
df.head()


已保存到 svl_lstm_eval.csv


Unnamed: 0,Episode,Final_Wealth,Final_Health,Final_Age_years,Debt,Savings,Total_Reward,Work_Count,Invest_Count,Rest_Count
0,0,5012.770996,0.0,0.136986,0.0,5012.770996,127721.270985,50,0,0
1,1,5012.770996,0.0,0.136986,0.0,5012.770996,127721.270985,50,0,0
2,2,5012.770996,0.0,0.136986,0.0,5012.770996,127721.270985,50,0,0
3,3,5012.770996,0.0,0.136986,0.0,5012.770996,127721.270985,50,0,0
4,4,5012.770996,0.0,0.136986,0.0,5012.770996,127721.270985,50,0,0
