In [1]:
import gymnasium as gym
import numpy as np
import pandas as pd
from gymnasium import spaces  # 兼容 gymnasium

class StockTradingEnv(gym.Env):
    def __init__(self, df):
        super(StockTradingEnv, self).__init__()

        self.df = df  # 传入的股票数据
        self.current_step = 0  # 当前时间步
        self.initial_balance = 10000  # 初始资金
        self.balance = self.initial_balance  # 账户余额
        self.shares_held = 0  # 持有股票数
        self.account_history = []  # 记录资金变动

        # 观察空间（收盘价, RSI, MACD, 布林中轨）
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32)

        # 动作空间（0: 持有, 1: 买入, 2: 卖出）
        self.action_space = spaces.Discrete(3)
    
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)  # 确保兼容 gymnasium 的 `reset`
        self.current_step = 0
        self.balance = self.initial_balance
        self.shares_held = 0
        self.account_history = [self.initial_balance]
        return self._next_observation(), {}

    def _next_observation(self):
        """
        获取当前的观察值（收盘价, RSI, MACD, 布林中轨）
        """
        obs = self.df.iloc[self.current_step][["Close", "RSI", "MACD", "Bollinger_Middle"]].values
        return obs.astype(np.float32)
    
    def step(self, action):
        prev_price = self.df.iloc[self.current_step]["Close"]
        prev_value = self.balance + self.shares_held * prev_price  # 计算上一步的资产总值
        self.current_step += 1
        terminated = self.current_step >= len(self.df) - 1
        truncated = False  # 可根据最大步数设定

        if action == 1:  # 买入
            shares_bought = self.balance // prev_price
            self.shares_held += shares_bought
            self.balance -= shares_bought * prev_price
        elif action == 2 and self.shares_held > 0:  # 卖出
            self.balance += self.shares_held * prev_price
            self.shares_held = 0

        current_price = self.df.iloc[self.current_step]["Close"]
        current_value = self.balance + self.shares_held * current_price  # 当前资产总值
        reward = current_value - prev_value  # 资产变化作为奖励

        # 计算夏普比率，衡量收益稳定性
        self.account_history.append(current_value)
        returns = np.diff(self.account_history) / self.account_history[:-1] if len(self.account_history) > 1 else [0]
        sharpe_ratio = np.mean(returns) / (np.std(returns) + 1e-8) if len(returns) > 1 else 0
        reward += sharpe_ratio * 10  # 适当放大夏普比率的影响

        # 引入时间折扣因子，鼓励长期持有
        gamma = 0.99
        future_reward = self.balance + self.shares_held * current_price
        reward = gamma * future_reward + (1 - gamma) * reward

        obs = self._next_observation()
        return obs, reward, terminated, truncated, {}

    def render(self):
        """
        打印当前状态信息
        """
        print(f"Step: {self.current_step}, Balance: {self.balance}, Held Shares: {self.shares_held}, Total Value: {self.balance + self.shares_held * self.df.iloc[self.current_step]['Close']}")


In [2]:
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_vec_env
import gymnasium as gym

# 读取数据
df = pd.read_csv("/home/jesse/Projects/RL_Testing/Q_Learning/Testing/AAPL_Preprocessed.csv", index_col="Date", parse_dates=True)

# 使用 `gymnasium.make` 创建环境
env = make_vec_env(lambda: StockTradingEnv(df), n_envs=1)

# 训练 DQN 代理
model = DQN("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=2000000)

# 保存训练好的模型
model.save("Q2_AAPL_trading_model")


Using cuda device
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.11e+04 |
|    ep_rew_mean      | 2.51e+09 |
|    exploration_rate | 0.79     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 684      |
|    time_elapsed     | 64       |
|    total_timesteps  | 44292    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 2.29e+05 |
|    n_updates        | 11047    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.11e+04 |
|    ep_rew_mean      | 2.32e+09 |
|    exploration_rate | 0.579    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 633      |
|    time_elapsed     | 139      |
|    total_timesteps  | 88584    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.04e+05 |
| 

In [3]:
aapl_df = pd.read_csv("/home/jesse/Projects/RL_Testing/Q_Learning/Training/NVDA_Preprocessed.csv")

env = StockTradingEnv(aapl_df)
obs, _ = env.reset()  # `gymnasium` 的 `reset` 现在返回 (obs, info)

for _ in range(len(aapl_df)):
    action, _states = model.predict(obs)
    obs, reward, terminated, truncated, _ = env.step(action)  # `gymnasium` 需要接收 5 个返回值
    env.render()
    if terminated or truncated:
        break


Step: 1, Balance: 0.18948166072368622, Held Shares: 49709.0, Total Value: 9652.037035554644
Step: 2, Balance: 0.18948166072368622, Held Shares: 49709.0, Total Value: 9266.79206037521
Step: 3, Balance: 0.0030646622180939276, Held Shares: 49710.0, Total Value: 9320.628064662218
Step: 4, Balance: 0.0030646622180939276, Held Shares: 49710.0, Total Value: 9382.76550540328
Step: 5, Balance: 0.0030646622180939276, Held Shares: 49710.0, Total Value: 9217.081959635017
Step: 6, Balance: 0.0030646622180939276, Held Shares: 49710.0, Total Value: 9353.784921854734
Step: 7, Balance: 0.0030646622180939276, Held Shares: 49710.0, Total Value: 9179.799939632412
Step: 8, Balance: 0.0030646622180939276, Held Shares: 49710.0, Total Value: 9059.650220960379
Step: 9, Balance: 0.0030646622180939276, Held Shares: 49710.0, Total Value: 9258.490623921156
Step: 10, Balance: 0.0030646622180939276, Held Shares: 49710.0, Total Value: 9192.22727963328
Step: 11, Balance: 0.0030646622180939276, Held Shares: 49710.0, To