In [1]:
import gymnasium as gym
import pandas as pd
import numpy as np
from stable_baselines3 import SAC
from stable_baselines3.common.env_util import make_vec_env
from sklearn.preprocessing import StandardScaler

# Load dataframe
df = pd.read_csv("/home/jesse/Projects/RL_Testing/SAC_Testing/TandT/nvidia_stock_with_indicators.csv",
                 index_col="Date", parse_dates=True)

# Normalization to prevent gradient explosion
scaler = StandardScaler()
df[df.columns] = scaler.fit_transform(df[df.columns])  

# Define trading environment
class StockTradingEnv(gym.Env):
    def __init__(self, df, window_size=10, initial_balance=10000, transaction_cost=0.001):
        super(StockTradingEnv, self).__init__()
        self.df = df
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.transaction_cost = transaction_cost  # cost of transaction
        self.current_step = window_size

        # Account variables
        self.balance = initial_balance
        self.shares_held = 0
        self.total_asset_value = initial_balance

        # Observation space
        self.feature_columns = [col for col in df.columns if col != "Date"]
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, len(self.feature_columns)), dtype=np.float32
        )

        # Action space: -1 (sell) to 1 (buy)
        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)

    def _next_observation(self):
        return self.df.iloc[self.current_step - self.window_size:self.current_step][self.feature_columns].values.astype(np.float32)

    def step(self, action):
        prev_asset_value = self.total_asset_value
        current_price = self.df.iloc[self.current_step]["Close"]

        # ompute trade amount considering available balance
        trade_amount = action[0] * self.balance  
        if trade_amount > 0:  # buy
            shares_bought = trade_amount / current_price
            self.shares_held += shares_bought
            self.balance -= trade_amount
        elif trade_amount < 0:  # sell
            shares_sold = min(abs(trade_amount) / current_price, self.shares_held)
            self.shares_held -= shares_sold
            self.balance += shares_sold * current_price

        # Update total asset value
        self.total_asset_value = self.balance + (self.shares_held * current_price)
        
        # Compute reward (return - transaction cost) 
        reward = (self.total_asset_value - prev_asset_value) / prev_asset_value
        cost = self.transaction_cost * abs(action[0]) / self.total_asset_value 
        reward -= cost # I need to work on this.
    

        self.current_step += 1
        terminated = self.current_step >= len(self.df) - 1
        truncated = False  

        return self._next_observation(), reward, terminated, truncated, {}

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = self.window_size
        self.balance = self.initial_balance
        self.shares_held = 0
        self.total_asset_value = self.initial_balance
        return self._next_observation(), {}

    def render(self):
        print(f"Step: {self.current_step}, Balance: {self.balance}, Portfolio Value: {self.total_asset_value}")

# create a vertorized environment for training
def create_env():
    return StockTradingEnv(df)  

env = make_vec_env(create_env, n_envs=1)

print("✅ Trading environment successfully created!")


✅ Trading environment successfully created!


In [2]:
from stable_baselines3 import SAC
from stable_baselines3.common.env_util import make_vec_env
import pandas as pd

# ✅ 4️⃣ Train the SAC Agent
policy_kwargs = dict(net_arch=[512, 512])  # more neurons and more parameters
model = SAC(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=1e-6,  # Optimized LR to avoid gradient explosion
    ent_coef=0.01,  # Controls exploration-exploitation tradeoff
    policy_kwargs=policy_kwargs,
    tensorboard_log="./sac_logs/"
)

# ✅ 5️⃣ Train for 5,000 steps
model.learn(total_timesteps=20000)

# ✅ 6️⃣ sSave trained model
model.save("sac_NVDA")

# ✅ 7️⃣ Evaluate after training
print("🎉 Training complete! Model saved as 'sac_NVD' 🎉")




Using cuda device
Logging to ./sac_logs/SAC_23
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 4.52e+03 |
|    ep_rew_mean     | 114      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 102      |
|    time_elapsed    | 177      |
|    total_timesteps | 18092    |
| train/             |          |
|    actor_loss      | -0.145   |
|    critic_loss     | 0.0248   |
|    ent_coef        | 0.01     |
|    learning_rate   | 1e-06    |
|    n_updates       | 17991    |
---------------------------------
🎉 Training complete! Model saved as 'sac_NVD' 🎉


In [3]:
# Load stock data
aapl_df = pd.read_csv("/home/jesse/Projects/RL_Testing/SAC_Testing/TandT/apple_stock_with_indicators.csv")

# Initialize the trading environment
env = StockTradingEnv(aapl_df)
obs, _ = env.reset()  # `gymnasium` requires reset() to return (obs, info)

# Run the trained model in the environment
for _ in range(len(aapl_df)):
    action, _states = model.predict(obs)
    obs, reward, terminated, truncated, _ = env.step(action)  # `gymnasium` returns 5 values
    env.render()
    
    if terminated or truncated:
        break

Step: 11, Balance: 10000.0, Portfolio Value: 10000.0
Step: 12, Balance: 10000.0, Portfolio Value: 10000.0
Step: 13, Balance: 10000.0, Portfolio Value: 10000.0
Step: 14, Balance: 10000.0, Portfolio Value: 10000.0
Step: 15, Balance: 10000.0, Portfolio Value: 10000.0
Step: 16, Balance: 10000.0, Portfolio Value: 10000.0
Step: 17, Balance: 10000.0, Portfolio Value: 10000.0
Step: 18, Balance: 10000.0, Portfolio Value: 10000.0
Step: 19, Balance: 10000.0, Portfolio Value: 10000.0
Step: 20, Balance: 10000.0, Portfolio Value: 10000.0
Step: 21, Balance: 10000.0, Portfolio Value: 10000.0
Step: 22, Balance: 10000.0, Portfolio Value: 10000.0
Step: 23, Balance: 10000.0, Portfolio Value: 10000.0
Step: 24, Balance: 10000.0, Portfolio Value: 10000.0
Step: 25, Balance: 10000.0, Portfolio Value: 10000.0
Step: 26, Balance: 10000.0, Portfolio Value: 10000.0
Step: 27, Balance: 10000.0, Portfolio Value: 10000.0
Step: 28, Balance: 10000.0, Portfolio Value: 10000.0
Step: 29, Balance: 10000.0, Portfolio Value: 1

In [4]:
# Load stock data
msft_df = pd.read_csv("/home/jesse/Projects/RL_Testing/SAC_Testing/TandT/microsoft_stock_with_indicators.csv")

# Initialize the trading environment
env = StockTradingEnv(msft_df)
obs, _ = env.reset()  # `gymnasium` requires reset() to return (obs, info)

# Run the trained model in the environment
for _ in range(len(msft_df)):
    action, _states = model.predict(obs)
    obs, reward, terminated, truncated, _ = env.step(action)  # `gymnasium` returns 5 values
    env.render()
    
    if terminated or truncated:
        break

Step: 11, Balance: 10000.0, Portfolio Value: 10000.0
Step: 12, Balance: 10000.0, Portfolio Value: 10000.0
Step: 13, Balance: 10000.0, Portfolio Value: 10000.0
Step: 14, Balance: 10000.0, Portfolio Value: 10000.0
Step: 15, Balance: 10000.0, Portfolio Value: 10000.0
Step: 16, Balance: 10000.0, Portfolio Value: 10000.0
Step: 17, Balance: 10000.0, Portfolio Value: 10000.0
Step: 18, Balance: 10000.0, Portfolio Value: 10000.0
Step: 19, Balance: 10000.0, Portfolio Value: 10000.0
Step: 20, Balance: 10000.0, Portfolio Value: 10000.0
Step: 21, Balance: 10000.0, Portfolio Value: 10000.0
Step: 22, Balance: 10000.0, Portfolio Value: 10000.0
Step: 23, Balance: 10000.0, Portfolio Value: 10000.0
Step: 24, Balance: 10000.0, Portfolio Value: 10000.0
Step: 25, Balance: 10000.0, Portfolio Value: 10000.0
Step: 26, Balance: 10000.0, Portfolio Value: 10000.0
Step: 27, Balance: 10000.0, Portfolio Value: 10000.0
Step: 28, Balance: 10000.0, Portfolio Value: 10000.0
Step: 29, Balance: 10000.0, Portfolio Value: 1

In [5]:
import numpy as np

# Load model
model = SAC.load("sac_NVDA")

# Validating
obs, _ = env.reset()
done = False
total_reward = 0
while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    total_reward += reward

print(f"Final Profits: {total_reward}")

Final Profits: 8.219434032938908
