In [None]:
import gymnasium as gym
import pandas as pd
import numpy as np
from stable_baselines3 import SAC
from stable_baselines3.common.env_util import make_vec_env
from sklearn.preprocessing import StandardScaler

# ✅ 1️⃣ 加载股票数据，并进行预处理
df = pd.read_csv("/home/jesse/Projects/RL_Testing/SAC_Testing/TandT/nvidia_stock_with_indicators.csv",
                 index_col="Date", parse_dates=True)

# ⚠️ 数据预处理：归一化（防止梯度爆炸）
scaler = StandardScaler()
df[df.columns] = scaler.fit_transform(df[df.columns])  

# ✅ 2️⃣ 定义交易环境
class StockTradingEnv(gym.Env):
    def __init__(self, df, window_size=10, initial_balance=10000, transaction_cost=0.001):
        super(StockTradingEnv, self).__init__()
        self.df = df
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.transaction_cost = transaction_cost  # 交易成本
        self.current_step = window_size

        # 账户变量
        self.balance = initial_balance
        self.shares_held = 0
        self.total_asset_value = initial_balance

        # 观测空间
        self.feature_columns = [col for col in df.columns if col != "Date"]
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, len(self.feature_columns)), dtype=np.float32
        )

        # 动作空间：-1 (卖出) 到 1 (买入)
        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)

    def _next_observation(self):
        return self.df.iloc[self.current_step - self.window_size:self.current_step][self.feature_columns].values.astype(np.float32)

    def step(self, action):
        prev_asset_value = self.total_asset_value
        current_price = self.df.iloc[self.current_step]["Close"]

        # 计算交易量，考虑交易成本
        trade_amount = action[0] * self.balance  
        if trade_amount > 0:  # 买
            shares_bought = trade_amount / current_price
            self.shares_held += shares_bought
            self.balance -= trade_amount
        elif trade_amount < 0:  # 卖
            shares_sold = min(abs(trade_amount) / current_price, self.shares_held)
            self.shares_held -= shares_sold
            self.balance += shares_sold * current_price

        # 更新总资产
        self.total_asset_value = self.balance + (self.shares_held * current_price)
        
        # 计算奖励（收益率 - 交易成本）
        reward = (self.total_asset_value - prev_asset_value) / prev_asset_value
        cost = self.transaction_cost * abs(action[0])  # 交易成本与交易量成正比
        reward -= cost # I need to work on this.
        ####### the reward is going to always be negative because it is profit rate instead of profit.

        self.current_step += 1
        terminated = self.current_step >= len(self.df) - 1
        truncated = False  

        return self._next_observation(), reward, terminated, truncated, {}

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = self.window_size
        self.balance = self.initial_balance
        self.shares_held = 0
        self.total_asset_value = self.initial_balance
        return self._next_observation(), {}

    def render(self):
        print(f"Step: {self.current_step}, Balance: {self.balance}, Portfolio Value: {self.total_asset_value}")

# ✅ 3️⃣ 创建多进程训练环境（SAC 可支持并行环境）
def create_env():
    return StockTradingEnv(df)  

env = make_vec_env(create_env, n_envs=1)

print("✅ 交易环境搭建完成！")


✅ 交易环境搭建完成！


In [2]:
from stable_baselines3 import SAC
from stable_baselines3.common.env_util import make_vec_env
import pandas as pd

# ✅ 4️⃣ 训练 SAC 代理
policy_kwargs = dict(net_arch=[512, 512])  # 更大的网络结构
model = SAC(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=1e-6,  # 降低学习率，防止梯度爆炸
    ent_coef=0.01,  # 限制温度系数，避免过度探索
    policy_kwargs=policy_kwargs,
    tensorboard_log="./sac_logs/"
)

# ✅ 5️⃣ 训练 100 万步
model.learn(total_timesteps=100)

# ✅ 6️⃣ 保存模型
model.save("sac_NVDA")

# ✅ 7️⃣ 训练完成后评估
print("🎉 训练完成！模型已保存为 'sac_NVDA' 🎉")




Using cuda device
Logging to ./sac_logs/SAC_7
🎉 训练完成！模型已保存为 'sac_NVDA' 🎉


In [None]:
aapl_df = pd.read_csv("/home/jesse/Projects/RL_Testing/SAC_Testing/TandT//microsoft_stock_with_indicators.csv")

env = StockTradingEnv(aapl_df)
obs, _ = env.reset()  # `gymnasium` 的 `reset` 现在返回 (obs, info)

for _ in range(len(aapl_df)):
    action, _states = model.predict(obs)
    obs, reward, terminated, truncated, _ = env.step(action)  # `gymnasium` 需要接收 5 个返回值
    env.render()
    if terminated or truncated:
        break


In [None]:
aapl_df = pd.read_csv("/home/jesse/Projects/RL_Testing/SAC_Testing/TandT//apple_stock_with_indicators.csv")

env = StockTradingEnv(aapl_df)
obs, _ = env.reset()  # `gymnasium` 的 `reset` 现在返回 (obs, info)

for _ in range(len(aapl_df)):
    action, _states = model.predict(obs)
    obs, reward, terminated, truncated, _ = env.step(action)  # `gymnasium` 需要接收 5 个返回值
    env.render()
    if terminated or truncated:
        break


In [None]:
import numpy as np

# 加载训练好的模型
model = SAC.load("sac_NVDA")

# 运行测试
obs, _ = env.reset()
done = False
total_reward = 0
while not done:
    action, _ = model.predict(obs, deterministic=True)  # 采用训练好的策略
    obs, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    total_reward += reward

print(f"最终收益: {total_reward}")