In [1]:
import pandas as pd

file_path = "/home/jesse/Projects/RL_Testing/LSTM_Attention/combined_10_stocks_data.csv"
df = pd.read_csv(file_path)

df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

# 确保其他列是数值类型，去掉可能的字符串类型列（例如日期）
df = df.apply(pd.to_numeric, errors='coerce')

# 检查数据的头部
print(df.head())

            TSLA_Open  TSLA_High  TSLA_Low  TSLA_Close  TSLA_Volume  \
Date                                                                  
2010-07-19   1.424667   1.483333  1.394667    1.460667   37297500.0   
2010-07-20   1.456667   1.456667  1.336667    1.353333   27379500.0   
2010-07-21   1.377333   1.393333  1.300000    1.348000   18787500.0   
2010-07-22   1.366667   1.416667  1.358000    1.400000   14367000.0   
2010-07-23   1.412667   1.437333  1.404000    1.419333    9804000.0   

            TSLA_RSI_7  TSLA_RSI_14  TSLA_MACD  TSLA_MACD_Signal  AAPL_Open  \
Date                                                                          
2010-07-19   92.220196    43.408794  -0.064685         -0.083391   7.519788   
2010-07-20   71.260949    39.206237  -0.058209         -0.078355   7.309736   
2010-07-21   74.198423    39.159154  -0.052897         -0.073263   7.977515   
2010-07-22   72.916658    46.915168  -0.043984         -0.067407   7.754518   
2010-07-23   65.010249    57

In [2]:
rows, cols = df.shape
print(f"DataFrame 共有 {rows} 行, {cols} 列")


DataFrame 共有 2633 行, 90 列


In [3]:
# 检查数据的列名
print(df.columns)

# 检查是否有缺失值
print(df.isnull().sum())


Index(['TSLA_Open', 'TSLA_High', 'TSLA_Low', 'TSLA_Close', 'TSLA_Volume',
       'TSLA_RSI_7', 'TSLA_RSI_14', 'TSLA_MACD', 'TSLA_MACD_Signal',
       'AAPL_Open', 'AAPL_High', 'AAPL_Low', 'AAPL_Close', 'AAPL_Volume',
       'AAPL_RSI_7', 'AAPL_RSI_14', 'AAPL_MACD', 'AAPL_MACD_Signal', 'GE_Open',
       'GE_High', 'GE_Low', 'GE_Close', 'GE_Volume', 'GE_RSI_7', 'GE_RSI_14',
       'GE_MACD', 'GE_MACD_Signal', 'QQQ_Open', 'QQQ_High', 'QQQ_Low',
       'QQQ_Close', 'QQQ_Volume', 'QQQ_RSI_7', 'QQQ_RSI_14', 'QQQ_MACD',
       'QQQ_MACD_Signal', 'NVDA_Open', 'NVDA_High', 'NVDA_Low', 'NVDA_Close',
       'NVDA_Volume', 'NVDA_RSI_7', 'NVDA_RSI_14', 'NVDA_MACD',
       'NVDA_MACD_Signal', 'UNH_Open', 'UNH_High', 'UNH_Low', 'UNH_Close',
       'UNH_Volume', 'UNH_RSI_7', 'UNH_RSI_14', 'UNH_MACD', 'UNH_MACD_Signal',
       'CAT_Open', 'CAT_High', 'CAT_Low', 'CAT_Close', 'CAT_Volume',
       'CAT_RSI_7', 'CAT_RSI_14', 'CAT_MACD', 'CAT_MACD_Signal', 'AMZN_Open',
       'AMZN_High', 'AMZN_Low', 'AMZN_

In [4]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pandas as pd

class TradingEnv(gym.Env):
    metadata = {"render_modes": ["human"], "render_fps": 30}
    
    def __init__(self, df, window_size=10, initial_balance=10000, max_drawdown=0.2, risk_free_rate=0.01, transaction_cost=0.001):
        super(TradingEnv, self).__init__()
        
        self.df = df
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.max_drawdown = max_drawdown
        self.risk_free_rate = risk_free_rate
        self.transaction_cost = transaction_cost  # 每笔交易的手续费比例
        
        # 获取股票代码列表
        self.tickers = sorted(set(col.split('_')[0] for col in df.columns if '_' in col))
        self.num_stocks = len(self.tickers)
        self.features_per_stock = 9  # 9个特征 (Open, High, Low, Close, Volume, RSI_7, RSI_14, MACD, MACD_Signal)
        
        # 定义状态空间 (window_size 天 * 10 只股票 * 9 特征 + 账户状态)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(window_size * 90,), dtype=np.float32)


        # 动作空间 (-1 ~ 1 代表买入 / 卖出比例)
        self.action_space = spaces.Box(low=-1, high=1, shape=(self.num_stocks,), dtype=np.float32)

        self.reset()
    
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        
        self.current_step = self.window_size
        self.balance = self.initial_balance
        self.shares_held = np.zeros(self.num_stocks)  # 每只股票的持仓
        self.total_profit = 0
        self.peak_value = self.initial_balance  # 初始的峰值（用于回撤计算）
        self.done = False
        self.history = []  # 资产历史
        
        return self._next_observation(), {}
    
    def _next_observation(self):
        obs = self.df.iloc[self.current_step - self.window_size:self.current_step].values.flatten()
        return obs

        
    def step(self, action):
        if self.done:
            return self._next_observation(), 0, True, False, {}

        prev_balance = self.balance
        prev_total_value = self.balance + np.sum(self.shares_held * self.df.iloc[self.current_step][[f"{t}_Close" for t in self.tickers]])
        
        # 执行买入/卖出操作
        for i, ticker in enumerate(self.tickers):
            current_price = self.df.iloc[self.current_step][f"{ticker}_Close"]
            cost = abs(action[i]) * current_price * self.transaction_cost  # 交易成本
            
            if action[i] > 0:  # 买入
                max_shares = self.balance // (current_price + cost)  # 计算最多能买多少股
                self.shares_held[i] += max_shares * action[i]
                self.balance -= max_shares * action[i] * current_price + cost
                
            elif action[i] < 0:  # 卖出
                shares_to_sell = min(abs(action[i] * self.shares_held[i]), self.shares_held[i])  # 不能卖空
                self.shares_held[i] -= shares_to_sell
                self.balance += shares_to_sell * current_price - cost

        # 计算新资产总值
        total_value = self.balance + np.sum(self.shares_held * self.df.iloc[self.current_step][[f"{t}_Close" for t in self.tickers]])
        daily_return = (total_value - prev_total_value) / prev_total_value
        self.total_profit = total_value - self.initial_balance
        
        # 计算回撤
        self.peak_value = max(self.peak_value, total_value)
        drawdown = (self.peak_value - total_value) / self.peak_value if self.peak_value > 0 else 0
        
        # 计算夏普率
        self.history.append(total_value)
        sharpe_ratio = (np.mean(self.history[-30:]) - self.risk_free_rate) / (np.std(self.history[-30:]) + 1e-6) if len(self.history) > 1 else 0
        
        # 奖励函数
        reward = (
            self.total_profit * 0.3  # 总利润
            + daily_return * 0.3  # 每日收益
            - max(drawdown - self.max_drawdown, 0) * 10  # 超过最大回撤的惩罚
            + sharpe_ratio * 0.2  # 夏普比率奖励
        )
        
        # 结束条件
        self.current_step += 1
        if self.current_step >= len(self.df) - 1:
            self.done = True
        
        return self._next_observation(), reward, self.done, False, {}
    
    def render(self, mode="human"):
        print(f'Step: {self.current_step}, Balance: {self.balance:.2f}, Shares: {self.shares_held}, Profit: {self.total_profit:.2f}, Drawdown: {self.peak_value - (self.balance + np.sum(self.shares_held * self.df.iloc[self.current_step][[f"{t}_Close" for t in self.tickers]])):.2f}')


In [5]:
import gymnasium as gym
import numpy as np

# 假设 df 是你已经加载并预处理好的 DataFrame，包含所有需要的特征
# df = pd.read_csv('your_stock_data.csv')

# 创建环境实例
env = TradingEnv(df)  # 使用你的数据

# 测试环境初始化
print("Observation Space:", env.observation_space)
print("Action Space:", env.action_space)

# 测试 reset 和 _next_observation
obs, info = env.reset()
print("Initial Observation:", obs)


Observation Space: Box(-inf, inf, (900,), float32)
Action Space: Box(-1.0, 1.0, (10,), float32)
Initial Observation: [ 1.42466700e+00  1.48333299e+00  1.39466703e+00  1.46066701e+00
  3.72975000e+07  9.22201958e+01  4.34087939e+01 -6.46850475e-02
 -8.33913485e-02  7.51978768e+00  7.51978768e+00  7.21042568e+00
  7.39038467e+00  1.02447800e+09  1.41131177e+01  2.77687992e+01
 -6.74227762e-02 -2.62854937e-02  5.17271821e+01  5.20805074e+01
  5.11265207e+01  5.16565170e+01  1.02396750e+07  4.16002405e+01
  4.42772806e+01 -9.42787484e-01 -1.24506084e+00  3.90347177e+01
  3.93331618e+01  3.87099381e+01  3.92541618e+01  8.18008000e+07
  5.85528404e+01  4.73685939e+01 -1.46860036e-01 -2.96142211e-01
  2.33627455e-01  2.40734842e-01  2.32022536e-01  2.39817768e-01
  7.98440000e+08  5.35545608e+01  4.24170504e+01 -1.02271684e-02
 -1.21285023e-02  2.44886240e+01  2.48742713e+01  2.43359730e+01
  2.47617912e+01  8.57550000e+06  7.56757364e+01  6.51205698e+01
  9.71859826e-02 -5.71592235e-02  4.45

In [6]:
# 测试执行一步操作
action = np.random.uniform(-1, 1, size=(env.num_stocks,))  # 随机生成动作
print("Test Action:", action)

# 进行一步操作并获取返回的观察值、奖励等
obs, reward, done, truncated, info = env.step(action)
print("New Observation:", obs)
print("Reward:", reward)
print("Done:", done)


Test Action: [ 0.84706677 -0.52458539 -0.468915   -0.3759919   0.99908207  0.68401213
 -0.53997152  0.08707199  0.9775846   0.69108824]
New Observation: [ 1.45666695e+00  1.45666695e+00  1.33666694e+00  1.35333300e+00
  2.73795000e+07  7.12609494e+01  3.92062370e+01 -5.82086295e-02
 -7.83548047e-02  7.30973612e+00  7.61067180e+00  7.22276563e+00
  7.58027697e+00  1.07495080e+09  3.25978810e+01  4.52738465e+01
 -6.54673290e-02 -3.41218608e-02  5.07378649e+01  5.30344933e+01
  5.07378649e+01  5.27871628e+01  1.40386600e+07  4.96552612e+01
  5.73717565e+01 -8.47157186e-01 -1.16548011e+00  3.86221343e+01
  3.97281303e+01  3.84992463e+01  3.97281303e+01  8.90612000e+07
  6.01265797e+01  6.52172300e+01 -9.48656587e-02 -2.55886900e-01
  2.33627428e-01  2.47154409e-01  2.31563973e-01  2.45778799e-01
  1.16098800e+09  5.79399228e+01  5.31088026e+01 -9.26366088e-03
 -1.15555340e-02  2.51876103e+01  2.53000903e+01  2.42957997e+01
  2.47698250e+01  1.34939000e+07  7.21649522e+01  7.27272237e+01
  

In [7]:
# 测试多步操作
for i in range(10):
    action = np.random.uniform(-1, 1, size=(env.num_stocks,))
    obs, reward, done, truncated, info = env.step(action)
    print(f"Step {i+1}, Reward: {reward}, Total Profit: {env.total_profit}, Done: {done}")
    if done:
        break


Step 1, Reward: 114.02103793572687, Total Profit: -32.37864490492757, Done: False
Step 2, Reward: 70.92140961613643, Total Profit: 58.018026098276096, Done: False
Step 3, Reward: 72.28141653000134, Total Profit: 136.91270304669888, Done: False
Step 4, Reward: 46.03261752728234, Total Profit: 37.1792069100793, Done: False
Step 5, Reward: 63.46858076169917, Total Profit: 91.97918832482355, Done: False
Step 6, Reward: 40.494562339562385, Total Profit: 10.046281758581245, Done: False
Step 7, Reward: -92.97771463053874, Total Profit: -357.00781929613004, Done: False
Step 8, Reward: -133.2728451011142, Total Profit: -477.51632488226824, Done: False
Step 9, Reward: -117.51281228044719, Total Profit: -421.9907415643029, Done: False
Step 10, Reward: -96.67213324529105, Total Profit: -352.100938912301, Done: False


In [8]:
# 测试环境是否能正常结束
env.reset()
while not env.done:
    action = np.random.uniform(-1, 1, size=(env.num_stocks,))
    obs, reward, done, truncated, info = env.step(action)
    print(f"Balance: {env.balance}, Total Profit: {env.total_profit}")


Balance: 1203.628861503167, Total Profit: -0.1182191128427803
Balance: 6.965186267659941, Total Profit: -19.551445100209094
Balance: 23.823081122382415, Total Profit: 35.04400338297455
Balance: 26.593218002033314, Total Profit: -3.6001786821343558
Balance: 41.56418545082458, Total Profit: -50.67120948665979
Balance: 139.90693425350324, Total Profit: 3.5755115997162648
Balance: 121.32363156656845, Total Profit: -88.24193520835252
Balance: 1207.362932281526, Total Profit: -442.5518995360908
Balance: 32.235627197708325, Total Profit: -430.5303587544331
Balance: 88.928245591193, Total Profit: -417.7958212843314
Balance: 2359.7155605774333, Total Profit: -435.2549035745178
Balance: 34.56684238756047, Total Profit: -290.23989015378174
Balance: 1817.9209780313686, Total Profit: -232.94565604514355
Balance: 843.3664740946504, Total Profit: -285.16313184174214
Balance: 57.0472720305155, Total Profit: -281.4713399710654
Balance: 240.1326454612619, Total Profit: -145.34040464335885
Balance: 1489.

In [9]:
from stable_baselines3 import PPO

# 创建模型
model = PPO("MlpPolicy", env, verbose=1)

# 训练模型
model.learn(total_timesteps=10000)

# 测试训练结果
obs, _ = env.reset()
for i in range(10):  # 运行10步
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, truncated, info = env.step(action)
    print(f"Step {i+1}, Reward: {reward}, Total Profit: {env.total_profit}")
    if done:
        break


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 369  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 2.62e+03     |
|    ep_rew_mean          | 2.01e+07     |
| time/                   |              |
|    fps                  | 341          |
|    iterations           | 2            |
|    time_elapsed         | 12           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 2.997904e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -14.2        |
|    explained_variance   | -2.74e-06    |
|    learning_rate        | 0.0003       |
|    lo

In [12]:
file_path = "/home/jesse/Projects/RL_Testing/LSTM_Attention/combined_10_stocks_data.csv"

df_check = pd.read_csv(file_path)

rows, cols = df_check.shape
print(f"DataFrame 共有 {rows} 行, {cols} 列")


DataFrame 共有 2633 行, 91 列
