In [1]:
# 完整最终版 LifeSimEnv
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pandas as pd

class LifeSimEnv(gym.Env):
    """Day-by-day life simulation with three actions: rest, work, speculate.

    Observation: ``[cash, health, day]`` as a ``float32`` vector.
    Actions: ``0`` = rest, ``1`` = work, ``2`` = speculate.
    Reward: ``1.0`` for every step taken (survival reward).
    Termination: health reaches 0 or ``day`` reaches ``max_days``.
    ``truncated`` is always ``False``.

    Each finished episode is summarised (days alive, action counts, final
    cash) and appended to ``all_episode_logs``; ``save_logs_to_csv`` writes
    those summaries to disk.

    Parameters
    ----------
    init_cash : starting cash.
    init_health : starting health.
    max_days : maximum number of simulated days per episode.
    labor_income : cash earned by one day of work.
    labor_health_loss : health lost by one day of work.
    spec_gain : cash won when a speculation succeeds.
    spec_loss : cash lost when a speculation fails.
    spec_prob : probability that a speculation succeeds.
    spec_health_loss : health lost by one day of speculation.
    rest_recover : health regained by one day of rest.
    deposit_rate : daily interest rate applied to positive cash.
    debt_rate : daily interest rate applied to negative cash (debt).
    """

    # Gymnasium reads "render_modes"; the legacy Gym key is kept so that any
    # tooling still looking up "render.modes" keeps working.
    metadata = {"render_modes": ["human"], "render.modes": ["human"]}

    # Upper bound of the health state; also the upper bound of the
    # health component of the observation space.
    MAX_HEALTH = 100.0

    def __init__(self,
                 init_cash=10000,
                 init_health=100,
                 max_days=365*80,
                 labor_income=100,
                 labor_health_loss=1,
                 spec_gain=500,
                 spec_loss=700,
                 spec_prob=0.5,
                 spec_health_loss=0.5,
                 rest_recover=2,
                 deposit_rate=0.0001,
                 debt_rate=0.0005):

        super().__init__()

        # === Parameters ===
        self.init_cash = init_cash
        self.init_health = init_health
        self.max_days = max_days
        self.labor_income = labor_income
        self.labor_health_loss = labor_health_loss
        self.spec_gain = spec_gain
        self.spec_loss = spec_loss
        self.spec_prob = spec_prob
        self.spec_health_loss = spec_health_loss
        self.rest_recover = rest_recover
        self.deposit_rate = deposit_rate
        self.debt_rate = debt_rate

        # === Observation space (cash, health, day) ===
        self.observation_space = spaces.Box(
            low=np.array([-np.inf, 0, 0], dtype=np.float32),
            high=np.array([np.inf, self.MAX_HEALTH, self.max_days], dtype=np.float32),
            dtype=np.float32
        )

        # === Action space (0 = rest, 1 = work, 2 = speculate) ===
        self.action_space = spaces.Discrete(3)

        # === Internal state (populated by reset) ===
        self.cash = None
        self.health = None
        self.day = None

        # Logs: the running episode summary and the list of finished ones
        self.episode_log = None
        self.all_episode_logs = []
        # True once the current episode_log has been appended, so that a
        # second record_episode() call cannot store the same episode twice.
        self._episode_recorded = False

    def _get_obs(self):
        """Return the current state as the ``float32`` observation vector."""
        return np.array([self.cash, self.health, self.day], dtype=np.float32)

    def reset_episode_log(self):
        """Start a fresh summary for a new episode."""
        self.episode_log = {
            "days_alive": None,  # None means no day has been lived yet
            "work_days": 0,
            "spec_days": 0,
            "rest_days": 0,
            "final_cash": None
        }
        self._episode_recorded = False

    def reset(self, seed=None, options=None):
        """Reset state and log; return ``(observation, info)``.

        ``seed`` is forwarded to ``gym.Env.reset`` which seeds
        ``self.np_random``, the generator used for speculation outcomes.
        """
        super().reset(seed=seed)

        # Initial state
        self.cash = float(self.init_cash)
        self.health = float(self.init_health)
        self.day = 0

        # Fresh episode summary
        self.reset_episode_log()

        return self._get_obs(), {}

    def step(self, action):
        """Advance one day.

        Returns ``(obs, reward, terminated, truncated, info)``.

        Raises
        ------
        ValueError
            If ``action`` is not 0, 1 or 2. Previously such an action was
            silently treated as "do nothing" while still earning the reward.
        """
        # Accept Python ints as well as NumPy scalars / size-1 arrays.
        act = np.asarray(action).item()
        if act not in (0, 1, 2):
            raise ValueError(
                f"Invalid action {action!r}; expected 0 (rest), 1 (work) or 2 (speculate)."
            )

        self.day += 1
        reward = 1.0  # survival reward

        # === Action effects ===
        if act == 0:  # rest
            self.health = min(self.MAX_HEALTH, self.health + self.rest_recover)
            self.episode_log["rest_days"] += 1

        elif act == 1:  # work
            self.cash += self.labor_income
            self.health -= self.labor_health_loss
            self.episode_log["work_days"] += 1

        else:  # speculate
            # Use the environment's own generator so reset(seed=...) makes
            # the outcome sequence reproducible.
            if self.np_random.random() < self.spec_prob:
                self.cash += self.spec_gain
            else:
                self.cash -= self.spec_loss
            self.health -= self.spec_health_loss
            self.episode_log["spec_days"] += 1

        # Keep health inside the declared observation bounds; the
        # termination test below still sees health <= 0.
        self.health = max(0.0, self.health)

        # === Interest: deposits grow, debt grows more negative ===
        if self.cash > 0:
            self.cash *= (1 + self.deposit_rate)
        elif self.cash < 0:
            self.cash *= (1 + self.debt_rate)

        # === Update the running summary ===
        self.episode_log["days_alive"] = self.day
        self.episode_log["final_cash"] = self.cash

        obs = self._get_obs()

        # === Episode end check ===
        done = self.health <= 0 or self.day >= self.max_days
        if done:
            # Record only when the episode really ends, so no empty
            # summaries are stored.
            self.record_episode()

        truncated = False
        return obs, reward, done, truncated, {}

    def render(self, mode="human"):
        """Print the current day, cash and health to stdout."""
        print(f"Day {self.day} | Cash: {self.cash:.2f} | Health: {self.health:.1f}")

    def record_episode(self):
        """Append the current episode summary to ``all_episode_logs``.

        Does nothing when there is no summary, when no day has been lived
        yet, or when this episode's summary was already recorded.
        """
        log = self.episode_log
        if log is None or log["days_alive"] is None or self._episode_recorded:
            return
        self.all_episode_logs.append(log.copy())
        self._episode_recorded = True

    def save_logs_to_csv(self, filename="life_logs.csv"):
        """Write all recorded episode summaries to ``filename`` as CSV.

        Prints a warning and writes nothing when no episode was recorded.
        """
        if len(self.all_episode_logs) == 0:
            print("⚠️ 没有日志可保存！")
            return
        df = pd.DataFrame(self.all_episode_logs)
        df.to_csv(filename, index=False)
        print(f"✅ Logs saved to {filename}")


In [2]:
# 第二个 cell: 训练 RecurrentPPO 智能体，并记录每轮人生

import torch
from stable_baselines3 import PPO
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor

# === 1. Wrap the environment in a DummyVecEnv ===
def make_env():
    """Factory passed to DummyVecEnv: builds one LifeSimEnv with default parameters."""
    return LifeSimEnv()

# VecMonitor automatically records episode reward and related info
env = VecMonitor(DummyVecEnv([make_env]))

# === 2. Define the RecurrentPPO agent ===
# Fixed seed so that Restart & Run All reproduces the same training run.
SEED = 42

policy_kwargs = dict(
    # MLP layer sizes.
    # NOTE(review): in sb3-contrib's recurrent policy these layers appear to be
    # applied to the LSTM output, not before the LSTM — confirm against the docs.
    net_arch=[64, 64],
    lstm_hidden_size=64,
    n_lstm_layers=1,
)

model = RecurrentPPO(
    "MlpLstmPolicy",
    env,
    policy_kwargs=policy_kwargs,
    verbose=1,
    batch_size=32,
    n_steps=128,       # steps collected per rollout
    learning_rate=3e-4,
    gamma=0.99,
    seed=SEED,
    # NOTE(review): the recorded run printed "Using cpu device", so CUDA was
    # evidently not available there and training ran on CPU.
    device="cuda"
)

# === 3. 自定义 Callback，用于记录每轮人生 ===
from stable_baselines3.common.callbacks import BaseCallback

class EpisodeLoggerCallback(BaseCallback):
    """Ask a sub-environment to record its episode log when an episode finishes.

    An episode is considered finished for sub-environment ``i`` when the
    ``infos[i]`` dict of the current step contains an ``"episode"`` key.

    NOTE(review): LifeSimEnv.step already calls record_episode() when an
    episode ends, and the vectorized env has presumably auto-reset the
    sub-environment by the time this callback runs, in which case
    record_episode() finds a fresh log (days_alive is None) and does
    nothing — confirm whether this callback is still needed.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)

    def _on_rollout_end(self):
        # Nothing to do at rollout end; recording happens per finished episode.
        return None

    def _on_step(self) -> bool:
        step_infos = self.locals.get("infos", [])
        # Indices of the sub-environments whose episode just finished
        finished = [idx for idx, step_info in enumerate(step_infos) if "episode" in step_info]
        for idx in finished:
            self.training_env.envs[idx].record_episode()
        # Returning True keeps training running
        return True

callback = EpisodeLoggerCallback()

# === 4. Start training ===
total_timesteps = 100_000  # adjust to the available compute
model.learn(total_timesteps=total_timesteps, callback=callback)

# === 5. After training, save the model and the logs ===
model.save("lifesim_recurrentppo")

# Save the episode logs recorded by the first (only) sub-environment to CSV
env.envs[0].save_logs_to_csv("lifesim_training_results.csv")


Using cpu device
----------------------------
| time/              |     |
|    fps             | 793 |
|    iterations      | 1   |
|    time_elapsed    | 0   |
|    total_timesteps | 128 |
----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 257          |
|    iterations           | 2            |
|    time_elapsed         | 0            |
|    total_timesteps      | 256          |
| train/                  |              |
|    approx_kl            | 0.0032632817 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.1         |
|    explained_variance   | 7.24e-05     |
|    learning_rate        | 0.0003       |
|    loss                 | 100          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00209     |
|    value_loss           | 194          |
------------------------------------------
------