In [1]:
import gymnasium as gym  # Replace gym
import numpy as np
import pandas as pd
from gymnasium import spaces  # Replace gym.spaces

class StockTradingEnv(gym.Env):
    def __init__(self, df):
        super(StockTradingEnv, self).__init__()

        self.df = df
        self.current_step = 0
        self.initial_balance = 10000  # Initial capital
        self.balance = self.initial_balance
        self.shares_held = 0
        self.total_shares_sold = 0
        self.total_profit = 0

        # Observation space (Closing price, RSI, MACD, Bollinger Middle Band)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32)

        # Action space (0: Hold, 1: Buy, 2: Sell)
        self.action_space = spaces.Discrete(3)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)  # Ensure compatibility with gymnasium `reset`
        self.current_step = 0
        self.balance = self.initial_balance
        self.shares_held = 0
        self.total_profit = 0
        return self._next_observation(), {}  # `reset` now needs to return (obs, info)

    def _next_observation(self):
        obs = self.df.iloc[self.current_step][["Close", "RSI", "MACD", "Bollinger_Middle"]].values
        return obs.astype(np.float32)

    def step(self, action):
        prev_price = self.df.iloc[self.current_step]["Close"]
        self.current_step += 1
        terminated = self.current_step >= len(self.df) - 1  # `gymnasium` requires `terminated`
        truncated = False  # Can be set based on max steps, e.g., `truncated = self.current_step > 10000`

        reward = 0
        if action == 1:  # Buy
            shares_bought = self.balance // prev_price
            self.shares_held += shares_bought
            self.balance -= shares_bought * prev_price
        elif action == 2 and self.shares_held > 0:  # Sell
            self.balance += self.shares_held * prev_price
            self.shares_held = 0
            self.total_profit += self.balance - self.initial_balance
            reward = self.total_profit  # Use total profit as reward

        obs = self._next_observation()
        return obs, reward, terminated, truncated, {}  # `gymnasium` step needs to return 5 values

    def render(self):
        print(f"Step: {self.current_step}, Balance: {self.balance}, Profit: {self.total_profit}")


In [2]:
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_vec_env
import gymnasium as gym

# Read Data
df = pd.read_csv("/home/jesse/Projects/RL_Testing/Q_Learning/Training/NVDA_Preprocessed.csv", index_col="Date", parse_dates=True)

# Create env with `gymnasium.make` 
env = make_vec_env(lambda: StockTradingEnv(df), n_envs=1)

# Train DQN Agents
model = DQN("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100000)

# Save model
model.save("Q1_trading_model")


Using cuda device
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 5e+03    |
|    ep_rew_mean      | 1.65e+08 |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 392      |
|    time_elapsed     | 50       |
|    total_timesteps  | 19996    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 2.89e+04 |
|    n_updates        | 4973     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 5e+03    |
|    ep_rew_mean      | 9.51e+07 |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 484      |
|    time_elapsed     | 82       |
|    total_timesteps  | 39992    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0102   |
| 

In [3]:
aapl_df = pd.read_csv("/home/jesse/Projects/RL_Testing/Q_Learning/Testing/AAPL_Preprocessed.csv")

env = StockTradingEnv(aapl_df)
obs, _ = env.reset()  #  'reset' in 'gymnasium' that returns (obs, info)

for _ in range(len(aapl_df)):
    action, _states = model.predict(obs)
    obs, reward, terminated, truncated, _ = env.step(action)  # 'gymnasium' requires 5 returned values
    env.render()
    if terminated or truncated:
        break


Step: 1, Balance: 0.010709583746574936, Profit: 0
Step: 2, Balance: 0.010709583746574936, Profit: 0
Step: 3, Balance: 0.010709583746574936, Profit: 0
Step: 4, Balance: 0.010709583746574936, Profit: 0
Step: 5, Balance: 0.010709583746574936, Profit: 0
Step: 6, Balance: 0.010709583746574936, Profit: 0
Step: 7, Balance: 0.010709583746574936, Profit: 0
Step: 8, Balance: 0.010709583746574936, Profit: 0
Step: 9, Balance: 0.010709583746574936, Profit: 0
Step: 10, Balance: 0.010709583746574936, Profit: 0
Step: 11, Balance: 0.010709583746574936, Profit: 0
Step: 12, Balance: 0.010709583746574936, Profit: 0
Step: 13, Balance: 0.010709583746574936, Profit: 0
Step: 14, Balance: 0.010709583746574936, Profit: 0
Step: 15, Balance: 0.010709583746574936, Profit: 0
Step: 16, Balance: 0.010709583746574936, Profit: 0
Step: 17, Balance: 0.010709583746574936, Profit: 0
Step: 18, Balance: 0.010709583746574936, Profit: 0
Step: 19, Balance: 0.010709583746574936, Profit: 0
Step: 20, Balance: 0.010709583746574936,