In [1]:
import gymnasium as gym
import numpy as np
import pandas as pd
from gymnasium import spaces  # Compatible with gymnasium

class StockTradingEnv(gym.Env):
    def __init__(self, df):
        super(StockTradingEnv, self).__init__()

        self.df = df  # Stock data passed in
        self.current_step = 0  # Current time step
        self.initial_balance = 10000  # Initial capital
        self.balance = self.initial_balance  # Account balance
        self.shares_held = 0  # Number of shares held
        self.account_history = []  # Record account balance changes

        # Observation space (Closing price, RSI, MACD, Bollinger Middle Band)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32)

        # Action space (0: Hold, 1: Buy, 2: Sell)
        self.action_space = spaces.Discrete(3)
    
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)  # Ensure compatibility with gymnasium's `reset`
        self.current_step = 0
        self.balance = self.initial_balance
        self.shares_held = 0
        self.account_history = [self.initial_balance]
        return self._next_observation(), {}

    def _next_observation(self):
        """
        Get the current observation (Closing price, RSI, MACD, Bollinger Middle Band)
        """
        obs = self.df.iloc[self.current_step][["Close", "RSI", "MACD", "Bollinger_Middle"]].values
        return obs.astype(np.float32)
    
    def step(self, action):
        prev_price = self.df.iloc[self.current_step]["Close"]
        prev_value = self.balance + self.shares_held * prev_price  # Compute previous total asset value
        self.current_step += 1
        terminated = self.current_step >= len(self.df) - 1
        truncated = False  # Can be set based on maximum steps

        if action == 1:  # Buy
            shares_bought = self.balance // prev_price
            self.shares_held += shares_bought
            self.balance -= shares_bought * prev_price
        elif action == 2 and self.shares_held > 0:  # Sell
            self.balance += self.shares_held * prev_price
            self.shares_held = 0

        current_price = self.df.iloc[self.current_step]["Close"]
        current_value = self.balance + self.shares_held * current_price  # Current total asset value
        reward = current_value - prev_value  # Asset change as reward

        # Compute Sharpe ratio to measure return stability
        self.account_history.append(current_value)
        returns = np.diff(self.account_history) / self.account_history[:-1] if len(self.account_history) > 1 else [0]
        sharpe_ratio = np.mean(returns) / (np.std(returns) + 1e-8) if len(returns) > 1 else 0
        reward += sharpe_ratio * 10  # Scale up the impact of the Sharpe ratio

        # Introduce time discount factor to encourage long-term holding
        gamma = 0.99
        future_reward = self.balance + self.shares_held * current_price
        reward = gamma * future_reward + (1 - gamma) * reward

        obs = self._next_observation()
        return obs, reward, terminated, truncated, {}

    def render(self):
        """
        Print current state information
        """
        print(f"Step: {self.current_step}, Balance: {self.balance}, Held Shares: {self.shares_held}, Total Value: {self.balance + self.shares_held * self.df.iloc[self.current_step]['Close']}")


In [2]:
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_vec_env
import gymnasium as gym

# Read data
df = pd.read_csv("/home/jesse/Projects/RL_Testing/Q_Learning/Training/NVDA_Preprocessed.csv", index_col="Date", parse_dates=True)

# Use `gymnasium.make` to create the environment
env = make_vec_env(lambda: StockTradingEnv(df), n_envs=1)

# Train the DQN agent
model = DQN("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100000)

# Save the trained model
model.save("Q2_trading_model")


Using cuda device
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 5e+03    |
|    ep_rew_mean      | 6.63e+07 |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 353      |
|    time_elapsed     | 56       |
|    total_timesteps  | 19996    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 4.57e+03 |
|    n_updates        | 4973     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 5e+03    |
|    ep_rew_mean      | 6.15e+07 |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 328      |
|    time_elapsed     | 121      |
|    total_timesteps  | 39992    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 4.45e+03 |
| 

In [3]:
aapl_df = pd.read_csv("/home/jesse/Projects/RL_Testing/Q_Learning/Testing/AAPL_Preprocessed.csv")

env = StockTradingEnv(aapl_df)
obs, _ = env.reset()  # `gymnasium`'s `reset` now returns (obs, info)

for _ in range(len(aapl_df)):
    action, _states = model.predict(obs)
    obs, reward, terminated, truncated, _ = env.step(action)  # `gymnasium` requires 5 return values
    env.render()
    if terminated or truncated:
        break


Step: 1, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 2, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 3, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 4, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 5, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 6, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 7, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 8, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 9, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 10, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 11, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 12, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 13, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 14, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 15, Balance: 10000, Held Shares: 0, Total Value: 10000.0
Step: 16, Balance: 10000, Held Shares: 0, Total Value: 10000.0
S