In [4]:
import gymnasium as gym
import numpy as np
import pandas as pd
from gymnasium import spaces  # Compatible with gymnasium

class StockTradingEnv(gym.Env):
    def __init__(self, df):
        super(StockTradingEnv, self).__init__()

        self.df = df  # Input stock data
        self.current_step = 0  # Current time step
        self.initial_balance = 10000  # Initial capital
        self.balance = self.initial_balance  # Account balance
        self.shares_held = 0  # Number of shares held
        self.account_history = []  # Record balance changes

        # Observation space (including additional technical indicators)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(10, 20), dtype=np.float32)

        # Action space (0: Hold, 1: Buy, 2: Sell)
        self.action_space = spaces.Discrete(3)
    
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)  # Ensure compatibility with `gymnasium`'s `reset`
        self.current_step = 0
        self.balance = self.initial_balance
        self.shares_held = 0
        self.account_history = [self.initial_balance]
        return self._next_observation(), {}

    def _next_observation(self, window_size=10):
        """
        Get the observation values for the current and past `window_size` steps
        """
        start = max(0, self.current_step - window_size)
        end = self.current_step + 1
        # Use all technical indicators as observation values
        obs = self.df.iloc[start:end][[
            "Open", "High", "Low", "Close", "Volume", "SMA_20", "SMA_50", "EMA_20", "EMA_50", 
            "MACD", "MACD_signal", "MACD_hist", "RSI_14", "BB_upper", 
            "BB_middle", "BB_lower", "ATR_14", "MFI_14", "slowk", "slowd"
        ]].values
        # If the available data is smaller than the window size, pad it accordingly
        if obs.shape[0] < window_size:
            pad_size = window_size - obs.shape[0]
            obs = np.pad(obs, ((pad_size, 0), (0, 0)), 'edge')  # Pad only at the beginning
        elif obs.shape[0] > window_size:
            obs = obs[-window_size:]  # If data exceeds window size, take the last `window_size` values

        return obs.astype(np.float32)

    
    def step(self, action):
        prev_price = self.df.iloc[self.current_step]["Close"]
        prev_value = self.balance + self.shares_held * prev_price  # Calculate total asset value from the previous step
        self.current_step += 1
        terminated = self.current_step >= len(self.df) - 1
        truncated = False

        if action == 1:  # Buy
            shares_bought = self.balance // prev_price
            self.shares_held += shares_bought
            self.balance -= shares_bought * prev_price
        elif action == 2 and self.shares_held > 0:  # Sell
            self.balance += self.shares_held * prev_price
            self.shares_held = 0

        current_price = self.df.iloc[self.current_step]["Close"]
        current_value = self.balance + self.shares_held * current_price  # Current total asset value
        reward = current_value - prev_value  # Reward based on asset value change

        # Additional reward factor: Encourage holding positions longer
        reward -= 0.1 * (action == 0)  # Encourage holding to avoid excessive trading

        # Introduce penalty to prevent excessive losses
        if current_value < prev_value * 0.95:
            reward -= 5  # Penalty for significant loss

        # Introduce time discount factor to encourage long-term holding
        gamma = 0.99
        future_reward = self.balance + self.shares_held * current_price
        reward = gamma * future_reward + (1 - gamma) * reward

        obs = self._next_observation()
        return obs, reward, terminated, truncated, {}

    def render(self):
        """
        Print current state information
        """
        print(f"Step: {self.current_step}, Balance: {self.balance}, Held Shares: {self.shares_held}, Total Value: {self.balance + self.shares_held * self.df.iloc[self.current_step]['Close']}")


In [5]:
# Training Code (Cell 2)
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_vec_env
import gymnasium as gym

# Read data
df = pd.read_csv("/home/jesse/Projects/RL_Testing/Q_Learning/Testing/AAPL_full.csv", index_col="Date", parse_dates=True)

# Create the environment using `gymnasium.make`
env = make_vec_env(lambda: StockTradingEnv(df), n_envs=1)

# Train the DQN agent
model = DQN("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=1000000)

# Save the trained model
model.save("Q3_trading_model")


Using cuda device
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.11e+04 |
|    ep_rew_mean      | 5.19e+09 |
|    exploration_rate | 0.58     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 877      |
|    time_elapsed     | 50       |
|    total_timesteps  | 44228    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 7e+05    |
|    n_updates        | 11031    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.11e+04 |
|    ep_rew_mean      | 3.83e+09 |
|    exploration_rate | 0.16     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 835      |
|    time_elapsed     | 105      |
|    total_timesteps  | 88456    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 2.03e+06 |
| 

In [6]:
# Load AAPL stock data
aapl_df = pd.read_csv("/home/jesse/Projects/RL_Testing/Q_Learning/Training/NVDA_full.csv")

# Create the stock trading environment
env = StockTradingEnv(aapl_df)
obs, _ = env.reset()  # `gymnasium`'s `reset` now returns (obs, info)

# Run the trained model on the AAPL dataset
for _ in range(len(aapl_df)):
    action, _states = model.predict(obs)
    obs, reward, terminated, truncated, _ = env.step(action)  # `gymnasium` requires 5 return values
    env.render()
    if terminated or truncated:
        break


Step: 1, Balance: 0.08140654862108931, Held Shares: 57803.0, Total Value: 10428.72532965243
Step: 2, Balance: 0.08140654862108931, Held Shares: 57803.0, Total Value: 10683.983598649502
Step: 3, Balance: 0.08140654862108931, Held Shares: 57803.0, Total Value: 10804.44516040385
Step: 4, Balance: 0.08140654862108931, Held Shares: 57803.0, Total Value: 10679.185980409386
Step: 5, Balance: 0.08140654862108931, Held Shares: 57803.0, Total Value: 10597.278492271898
Step: 6, Balance: 0.08140654862108931, Held Shares: 57803.0, Total Value: 10640.631045460703
Step: 7, Balance: 0.08140654862108931, Held Shares: 57803.0, Total Value: 11117.505685210226
Step: 8, Balance: 0.08140654862108931, Held Shares: 57803.0, Total Value: 11762.992058143016
Step: 9, Balance: 0.08140654862108931, Held Shares: 57803.0, Total Value: 11729.292450666424
Step: 10, Balance: 0.08140654862108931, Held Shares: 57803.0, Total Value: 12235.06907965243
Step: 11, Balance: 0.08140654862108931, Held Shares: 57803.0, Total Valu