In [5]:
import gymnasium as gym
import pandas as pd
import numpy as np
from stable_baselines3 import SAC
from stable_baselines3.common.env_util import make_vec_env
from sklearn.preprocessing import StandardScaler

# ✅ 1️⃣ Load stock data and preprocess
df = pd.read_csv("/home/jesse/Projects/RL_Testing/SAC_Testing/TandT/nvidia_stock_with_indicators.csv",
                 index_col="Date", parse_dates=True)

# Normalize the data to prevent gradient explosion
scaler = StandardScaler()
df[df.columns] = scaler.fit_transform(df[df.columns])  

# ✅ 2️⃣ Define the stock trading environment
class StockTradingEnv(gym.Env):
    def __init__(self, df, window_size=10, initial_balance=10000, transaction_cost=0.001):
        super(StockTradingEnv, self).__init__()
        self.df = df
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.transaction_cost = transaction_cost  # Transaction cost factor
        self.current_step = window_size

        # Account variables
        self.balance = initial_balance
        self.shares_held = 0
        self.total_asset_value = initial_balance

        # Observation space
        self.feature_columns = [col for col in df.columns if col != "Date"]
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, len(self.feature_columns)), dtype=np.float32
        )

        # Action space: -1 (sell) to 1 (buy)
        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)

    def _next_observation(self):
        return self.df.iloc[self.current_step - self.window_size:self.current_step][self.feature_columns].values.astype(np.float32)

    def step(self, action):
        prev_asset_value = self.total_asset_value
        current_price = self.df.iloc[self.current_step]["Close"]

        # Compute trade amount considering available balance
        trade_amount = action[0] * self.balance  
        if trade_amount > 0:  # Buy
            shares_bought = trade_amount / current_price
            self.shares_held += shares_bought
            self.balance -= trade_amount
        elif trade_amount < 0:  # Sell
            shares_sold = min(abs(trade_amount) / current_price, self.shares_held)
            self.shares_held -= shares_sold
            self.balance += shares_sold * current_price

        # Update total asset value
        self.total_asset_value = self.balance + (self.shares_held * current_price)
        
        # Compute reward (return - transaction cost)
        reward = (self.total_asset_value - prev_asset_value) / prev_asset_value
        cost = self.transaction_cost * abs(action[0]) / self.total_asset_value  
        reward -= cost  

        self.current_step += 1
        terminated = self.current_step >= len(self.df) - 1
        truncated = False  

        return self._next_observation(), reward, terminated, truncated, {}

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = self.window_size
        self.balance = self.initial_balance
        self.shares_held = 0
        self.total_asset_value = self.initial_balance
        return self._next_observation(), {}

    def render(self):
        print(f"Step: {self.current_step}, Balance: {self.balance:.2f}, Portfolio Value: {self.total_asset_value:.2f}")

# ✅ 3️⃣ Create a vectorized environment for training
def create_env():
    return StockTradingEnv(df)  

env = make_vec_env(create_env, n_envs=1)

print("✅ Trading environment successfully created!")

✅ Trading environment successfully created!


In [None]:
from stable_baselines3 import SAC
from stable_baselines3.common.env_util import make_vec_env
import pandas as pd

# ✅ 4️⃣ Train SAC agent
policy_kwargs = dict(net_arch=[512, 512, 256])  # Deeper network for better feature extraction
model = SAC(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=3e-5,  # Optimized learning rate
    ent_coef=0.01,  # Controls exploration-exploitation tradeoff
    policy_kwargs=policy_kwargs,
    batch_size=512,  # Larger batch size for efficient GPU utilization
    tensorboard_log="./sac_logs/"
)

# ✅ 5️⃣ Train for 200,000 steps
model.learn(total_timesteps=200000)

# ✅ 6️⃣ Save trained model
model.save("sac_NVDA")

# ✅ 7️⃣ Evaluate after training
print("🎉 Training complete! Model saved as 'sac_NVDA_10' 🎉")

Using cuda device
Logging to ./sac_logs/SAC_7
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 4.52e+03 |
|    ep_rew_mean     | -27.1    |
| time/              |          |
|    episodes        | 4        |
|    fps             | 70       |
|    time_elapsed    | 257      |
|    total_timesteps | 18092    |
| train/             |          |
|    actor_loss      | 2.95     |
|    critic_loss     | 0.0117   |
|    ent_coef        | 0.01     |
|    learning_rate   | 3e-05    |
|    n_updates       | 17991    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 4.52e+03 |
|    ep_rew_mean     | -41.7    |
| time/              |          |
|    episodes        | 8        |
|    fps             | 67       |
|    time_elapsed    | 535      |
|    total_timesteps | 36184    |
| train/             |          |
|    actor_loss      | 2.58     |
|    critic_loss     | 0.129    |
| 

In [7]:
import pandas as pd

# Load stock data
msft_df = pd.read_csv("/home/jesse/Projects/RL_Testing/SAC_Testing/TandT/microsoft_stock_with_indicators.csv")

# Initialize the trading environment
env = StockTradingEnv(msft_df)
obs, _ = env.reset()  # `gymnasium` requires reset() to return (obs, info)

# Run the trained model in the environment
for _ in range(len(msft_df)):
    action, _states = model.predict(obs)
    obs, reward, terminated, truncated, _ = env.step(action)  # `gymnasium` returns 5 values
    env.render()
    
    if terminated or truncated:
        break


Step: 11, Balance: 0.00, Portfolio Value: 10000.00
Step: 12, Balance: 0.00, Portfolio Value: 9119.38
Step: 13, Balance: 0.00, Portfolio Value: 9197.66
Step: 14, Balance: 0.00, Portfolio Value: 9432.50
Step: 15, Balance: 0.00, Portfolio Value: 9236.79
Step: 16, Balance: 0.00, Portfolio Value: 8845.41
Step: 17, Balance: 0.00, Portfolio Value: 8786.70
Step: 18, Balance: 0.00, Portfolio Value: 8904.11
Step: 19, Balance: 0.00, Portfolio Value: 8845.41
Step: 20, Balance: 0.00, Portfolio Value: 8571.43
Step: 21, Balance: 0.00, Portfolio Value: 8845.41
Step: 22, Balance: 0.00, Portfolio Value: 8649.72
Step: 23, Balance: 0.00, Portfolio Value: 8767.13
Step: 24, Balance: 0.00, Portfolio Value: 9099.82
Step: 25, Balance: 0.00, Portfolio Value: 8962.82
Step: 26, Balance: 0.00, Portfolio Value: 9080.25
Step: 27, Balance: 0.00, Portfolio Value: 9001.97
Step: 28, Balance: 0.00, Portfolio Value: 8728.00
Step: 29, Balance: 0.00, Portfolio Value: 8923.68
Step: 30, Balance: 0.00, Portfolio Value: 8297.47

In [8]:
import pandas as pd

# Load stock data
aapl_df = pd.read_csv("/home/jesse/Projects/RL_Testing/SAC_Testing/TandT/apple_stock_with_indicators.csv")

# Initialize the trading environment
env = StockTradingEnv(aapl_df)
obs, _ = env.reset()  # `gymnasium` requires reset() to return (obs, info)

# Run the trained model in the environment
for _ in range(len(aapl_df)):
    action, _states = model.predict(obs)
    obs, reward, terminated, truncated, _ = env.step(action)  # `gymnasium` returns 5 values
    env.render()
    
    if terminated or truncated:
        break



Step: 11, Balance: 0.00, Portfolio Value: 10000.00
Step: 12, Balance: 0.00, Portfolio Value: 9958.51
Step: 13, Balance: 0.00, Portfolio Value: 10041.49
Step: 14, Balance: 0.00, Portfolio Value: 10331.93
Step: 15, Balance: 0.00, Portfolio Value: 10622.36
Step: 16, Balance: 0.00, Portfolio Value: 10871.31
Step: 17, Balance: 0.00, Portfolio Value: 10788.33
Step: 18, Balance: 0.00, Portfolio Value: 10414.91
Step: 19, Balance: 0.00, Portfolio Value: 9958.51
Step: 20, Balance: 0.00, Portfolio Value: 9917.02
Step: 21, Balance: 0.00, Portfolio Value: 9460.54
Step: 22, Balance: 0.00, Portfolio Value: 10000.00
Step: 23, Balance: 0.00, Portfolio Value: 10041.49
Step: 24, Balance: 0.00, Portfolio Value: 9668.00
Step: 25, Balance: 0.00, Portfolio Value: 9128.61
Step: 26, Balance: 0.00, Portfolio Value: 9543.52
Step: 27, Balance: 0.00, Portfolio Value: 9668.00
Step: 28, Balance: 0.00, Portfolio Value: 10207.46
Step: 29, Balance: 0.00, Portfolio Value: 10207.46
Step: 30, Balance: 0.00, Portfolio Valu

In [9]:
import numpy as np

# load model
model = SAC.load("sac_NVDA")

# Validating
obs, _ = env.reset()
done = False
total_reward = 0
while not done:
    action, _ = model.predict(obs, deterministic=True) 
    obs, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    total_reward += reward

print(f"Final Porfits: {total_reward}")

Final Porfits: 11.573875182873147
