In [1]:
#ppo algorithm single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    """Read the train/test CSVs and standardise their OHLCV columns.

    The ``timestamp`` column of each frame is parsed to datetimes.  The
    scaler is fitted on the training frame only and then applied to the test
    frame, so no test-set statistics leak into the normalisation.

    Args:
        train_file: Path of the training CSV.
        test_file: Path of the testing CSV.

    Returns:
        Tuple ``(train_frame, test_frame, scaler)`` where both frames hold the
        scaled feature columns and ``scaler`` is the fitted ``StandardScaler``.
    """
    feature_cols = ['open', 'high', 'low', 'close', 'volume']

    train_frame, test_frame = (pd.read_csv(path) for path in (train_file, test_file))
    for frame in (train_frame, test_frame):
        frame['timestamp'] = pd.to_datetime(frame['timestamp'])

    scaler = StandardScaler()
    # Fit on train only; the test frame reuses the training mean/variance.
    train_frame[feature_cols] = scaler.fit_transform(train_frame[feature_cols])
    test_frame[feature_cols] = scaler.transform(test_frame[feature_cols])

    return train_frame, test_frame, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    """Single-asset trading environment with one-unit long/short positions.

    Each observation is a ``(window_size, 5)`` float32 window of the
    open/high/low/close/volume columns of ``data``.  Actions are 0 = hold,
    1 = buy, 2 = sell: buying/selling from a neutral position opens a
    long/short, and the opposite action closes it.  The reward is the realised
    profit of a closed trade in the (scaled) units of ``data``; the balance,
    trade list and log use original price units obtained through ``scaler``.

    NOTE(review): the observation does not contain the current position, and a
    position still open when the episode ends is never closed or marked to
    market, so its profit/loss is absent from ``balance`` and ``trades``.
    """

    # Columns of ``data`` exposed to the agent, in observation order.  The
    # scaler is expected to have been fitted on these columns in this order.
    FEATURE_COLUMNS = ['open', 'high', 'low', 'close', 'volume']

    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        """Store the price data and declare the action/observation spaces.

        Args:
            data: DataFrame containing at least ``FEATURE_COLUMNS``.
            window_size: Number of consecutive rows in one observation.
            initial_balance: Balance at the start of every episode.
            scaler: Optional fitted scaler used to convert scaled close prices
                back to original units.  When ``None`` prices are used as-is.
        """
        super().__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: window of (open, high, low, close, volume) rows
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(window_size, len(self.FEATURE_COLUMNS)), dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Start a new episode at the first row and return ``(obs, info)``."""
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        """Return the feature window starting at ``current_step`` as float32."""
        window = self.data.iloc[self.current_step:self.current_step + self.window_size]
        return window[self.FEATURE_COLUMNS].values.astype(np.float32)

    def inverse_scale_price(self, price):
        """Convert a scaled close price back to original units.

        Returns ``price`` unchanged when the environment has no scaler, so the
        default ``scaler=None`` no longer crashes ``step``.
        """
        if self.scaler is None:
            return price
        close_idx = self.FEATURE_COLUMNS.index('close')
        # The scaler works on full feature rows; only the close slot matters.
        row = [0] * len(self.FEATURE_COLUMNS)
        row[close_idx] = price
        return self.scaler.inverse_transform([row])[0][close_idx]

    def _close_position(self, profit, side, price):
        """Book ``profit`` (original units) and return to a neutral position."""
        self.balance += profit
        self.position = 0
        self.trades.append(profit)
        self.log.append(f"Agent closes {side} at {price}, profit: {profit}, Current Balance: {self.balance}, Holdings: 0")

    def step(self, action):
        """Apply ``action`` and advance one row.

        Returns:
            ``(observation, reward, terminated, truncated, info)``; ``reward``
            is the scaled realised profit of a trade closed on this step,
            otherwise 0.
        """
        reward = 0.0
        # Fill at the close of the *latest* bar in the window the agent just
        # observed.  Filling at row ``current_step`` (the oldest bar) would let
        # the agent see window_size - 1 future bars before choosing its price.
        price_index = self.current_step + self.window_size - 1
        current_price = self.data['close'].iloc[price_index]
        original_price = self.inverse_scale_price(current_price)

        if action == 1:  # buy
            if self.position == 0:  # Only open a long if neutral
                self.position = 1
                self.entry_price = current_price
                self.log.append(f"Agent buys at {original_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                profit = self.inverse_scale_price(self.entry_price) - original_price
                self._close_position(profit, "short", original_price)

        elif action == 2:  # sell
            if self.position == 0:  # Only open a short if neutral
                self.position = -1
                self.entry_price = current_price
                self.log.append(f"Agent sells (short) at {original_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                profit = original_price - self.inverse_scale_price(self.entry_price)
                self._close_position(profit, "long", original_price)

        self.current_step += 1
        # Stop once the next window would run past the end of the data.
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), float(reward), terminated, truncated, {}

    def generate_report(self):
        """Print the trade log, final balance, profit and open holdings."""
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate additional metrics
def calculate_metrics(trades, initial_balance, final_balance):
    """Summarise a trading episode from its closed-trade profits.

    Args:
        trades: Sequence of per-trade profit/loss values (positive = win), in
            the order the trades were closed.  May be empty.
        initial_balance: Account balance at the start of the episode.
        final_balance: Account balance at the end of the episode.

    Returns:
        dict with the keys "Total Profit", "Cumulative Return", "Win Rate",
        "Profit Factor", "Sharpe Ratio", "Sortino Ratio" and
        "Maximum Drawdown".  Ratios whose denominator is zero (including the
        no-trades case) are reported as 0, except "Profit Factor", which is
        ``np.inf`` when there are no losing trades.
    """
    trades = list(trades)
    returns = np.asarray(trades, dtype=float)
    has_trades = returns.size > 0

    # Total profit and cumulative return
    total_profit = final_balance - initial_balance
    cumulative_return = total_profit / initial_balance

    # Win rate
    winning = [trade for trade in trades if trade > 0]
    losing = [trade for trade in trades if trade < 0]
    win_rate = len(winning) / len(trades) if trades else 0

    # Profit factor
    gross_profit = sum(winning)
    gross_loss = -sum(losing)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe ratio (per trade, no risk-free rate).  Guard the empty case:
    # np.mean/np.std of an empty array warn and return NaN.
    mean_return = returns.mean() if has_trades else 0.0
    std_return = returns.std() if has_trades else 0.0
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino ratio: the denominator is the downside deviation, i.e. the
    # root-mean-square of the negative returns measured against a target of 0.
    # (np.std of the clipped returns would subtract their mean and report zero
    # risk for a run of identical losses.)
    downside = np.minimum(returns, 0.0)
    downside_dev = np.sqrt(np.mean(downside ** 2)) if has_trades else 0.0
    sortino_ratio = mean_return / downside_dev if downside_dev != 0 else 0

    # Maximum drawdown over the balance path implied by the trades; the series
    # always contains at least the initial balance.
    balance_series = np.cumsum([initial_balance] + trades)
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns)

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

def _run_episode(model, env):
    """Play one full episode of ``env`` using the model's deterministic action."""
    obs, _ = env.reset()
    done = False
    while not done:
        # deterministic=True: evaluate the learned policy itself rather than a
        # random sample from it, so repeated evaluations agree.
        action, _ = model.predict(obs, deterministic=True)
        obs, _, terminated, truncated, _ = env.step(action)
        done = terminated or truncated


def _report_metrics(env, title):
    """Print the environment's trade report followed by its metrics."""
    env.generate_report()
    metrics = calculate_metrics(env.trades, env.initial_balance, env.balance)
    print(f"\n--- {title} Metrics ---")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")


# Train and evaluate the model with all metrics
def train_and_evaluate(train_file='AMZN_TRAINING.csv', test_file='AMZN_TESTING.csv',
                       total_timesteps=100000, seed=None):
    """Train a PPO agent and report its performance on train and test data.

    Args:
        train_file: CSV used to fit the scaler and train the agent.
        test_file: CSV used for out-of-sample evaluation.
        total_timesteps: Training budget passed to ``model.learn``.
        seed: Optional seed forwarded to PPO for reproducible training.
    """
    # Load and normalize the data
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize the PPO model and train
    model = PPO("MlpPolicy", env_train, verbose=1, seed=seed)
    model.learn(total_timesteps=total_timesteps)

    # Replay the trained policy on the training data
    _run_episode(model, env_train)
    _report_metrics(env_train, "Training")

    # Replay the trained policy on the unseen testing data
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    _run_episode(model, env_test)
    _report_metrics(env_test, "Testing")

# Run the full pipeline: load the data, train the PPO agent, then replay it on
# the training and testing sets and print the reports and metrics.
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 1083 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 898         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013525862 |
|    clip_fraction        | 0.142       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -9.45       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00473     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00942    |
|    value_loss         

In [2]:
#dqn algorithm single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import DQN
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    """Read the train/test CSVs and standardise their OHLCV columns.

    The ``timestamp`` column of each frame is parsed to datetimes.  The
    scaler is fitted on the training frame only and then applied to the test
    frame, so no test-set statistics leak into the normalisation.

    Args:
        train_file: Path of the training CSV.
        test_file: Path of the testing CSV.

    Returns:
        Tuple ``(train_frame, test_frame, scaler)`` where both frames hold the
        scaled feature columns and ``scaler`` is the fitted ``StandardScaler``.
    """
    feature_cols = ['open', 'high', 'low', 'close', 'volume']

    train_frame, test_frame = (pd.read_csv(path) for path in (train_file, test_file))
    for frame in (train_frame, test_frame):
        frame['timestamp'] = pd.to_datetime(frame['timestamp'])

    scaler = StandardScaler()
    # Fit on train only; the test frame reuses the training mean/variance.
    train_frame[feature_cols] = scaler.fit_transform(train_frame[feature_cols])
    test_frame[feature_cols] = scaler.transform(test_frame[feature_cols])

    return train_frame, test_frame, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    """Single-asset trading environment with one-unit long/short positions.

    Each observation is a ``(window_size, 5)`` float32 window of the
    open/high/low/close/volume columns of ``data``.  Actions are 0 = hold,
    1 = buy, 2 = sell: buying/selling from a neutral position opens a
    long/short, and the opposite action closes it.  The reward is the realised
    profit of a closed trade in the (scaled) units of ``data``; the balance,
    trade list and log use original price units obtained through ``scaler``.

    NOTE(review): the observation does not contain the current position, and a
    position still open when the episode ends is never closed or marked to
    market, so its profit/loss is absent from ``balance`` and ``trades``.
    """

    # Columns of ``data`` exposed to the agent, in observation order.  The
    # scaler is expected to have been fitted on these columns in this order.
    FEATURE_COLUMNS = ['open', 'high', 'low', 'close', 'volume']

    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        """Store the price data and declare the action/observation spaces.

        Args:
            data: DataFrame containing at least ``FEATURE_COLUMNS``.
            window_size: Number of consecutive rows in one observation.
            initial_balance: Balance at the start of every episode.
            scaler: Optional fitted scaler used to convert scaled close prices
                back to original units.  When ``None`` prices are used as-is.
        """
        super().__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: window of (open, high, low, close, volume) rows
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(window_size, len(self.FEATURE_COLUMNS)), dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Start a new episode at the first row and return ``(obs, info)``."""
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        """Return the feature window starting at ``current_step`` as float32."""
        window = self.data.iloc[self.current_step:self.current_step + self.window_size]
        return window[self.FEATURE_COLUMNS].values.astype(np.float32)

    def inverse_scale_price(self, price):
        """Convert a scaled close price back to original units.

        Returns ``price`` unchanged when the environment has no scaler, so the
        default ``scaler=None`` no longer crashes ``step``.
        """
        if self.scaler is None:
            return price
        close_idx = self.FEATURE_COLUMNS.index('close')
        # The scaler works on full feature rows; only the close slot matters.
        row = [0] * len(self.FEATURE_COLUMNS)
        row[close_idx] = price
        return self.scaler.inverse_transform([row])[0][close_idx]

    def _close_position(self, profit, side, price):
        """Book ``profit`` (original units) and return to a neutral position."""
        self.balance += profit
        self.position = 0
        self.trades.append(profit)
        self.log.append(f"Agent closes {side} at {price}, profit: {profit}, Current Balance: {self.balance}, Holdings: 0")

    def step(self, action):
        """Apply ``action`` and advance one row.

        Returns:
            ``(observation, reward, terminated, truncated, info)``; ``reward``
            is the scaled realised profit of a trade closed on this step,
            otherwise 0.
        """
        reward = 0.0
        # Fill at the close of the *latest* bar in the window the agent just
        # observed.  Filling at row ``current_step`` (the oldest bar) would let
        # the agent see window_size - 1 future bars before choosing its price.
        price_index = self.current_step + self.window_size - 1
        current_price = self.data['close'].iloc[price_index]
        original_price = self.inverse_scale_price(current_price)

        if action == 1:  # buy
            if self.position == 0:  # Only open a long if neutral
                self.position = 1
                self.entry_price = current_price
                self.log.append(f"Agent buys at {original_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                profit = self.inverse_scale_price(self.entry_price) - original_price
                self._close_position(profit, "short", original_price)

        elif action == 2:  # sell
            if self.position == 0:  # Only open a short if neutral
                self.position = -1
                self.entry_price = current_price
                self.log.append(f"Agent sells (short) at {original_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                profit = original_price - self.inverse_scale_price(self.entry_price)
                self._close_position(profit, "long", original_price)

        self.current_step += 1
        # Stop once the next window would run past the end of the data.
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), float(reward), terminated, truncated, {}

    def generate_report(self):
        """Print the trade log, final balance, profit and open holdings."""
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate additional metrics
def calculate_metrics(trades, initial_balance, final_balance):
    """Summarise a trading episode from its closed-trade profits.

    Args:
        trades: Sequence of per-trade profit/loss values (positive = win), in
            the order the trades were closed.  May be empty.
        initial_balance: Account balance at the start of the episode.
        final_balance: Account balance at the end of the episode.

    Returns:
        dict with the keys "Total Profit", "Cumulative Return", "Win Rate",
        "Profit Factor", "Sharpe Ratio", "Sortino Ratio" and
        "Maximum Drawdown".  Ratios whose denominator is zero (including the
        no-trades case) are reported as 0, except "Profit Factor", which is
        ``np.inf`` when there are no losing trades.
    """
    trades = list(trades)
    returns = np.asarray(trades, dtype=float)
    has_trades = returns.size > 0

    # Total profit and cumulative return
    total_profit = final_balance - initial_balance
    cumulative_return = total_profit / initial_balance

    # Win rate
    winning = [trade for trade in trades if trade > 0]
    losing = [trade for trade in trades if trade < 0]
    win_rate = len(winning) / len(trades) if trades else 0

    # Profit factor
    gross_profit = sum(winning)
    gross_loss = -sum(losing)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe ratio (per trade, no risk-free rate).  Guard the empty case:
    # np.mean/np.std of an empty array warn and return NaN.
    mean_return = returns.mean() if has_trades else 0.0
    std_return = returns.std() if has_trades else 0.0
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino ratio: the denominator is the downside deviation, i.e. the
    # root-mean-square of the negative returns measured against a target of 0.
    # (np.std of the clipped returns would subtract their mean and report zero
    # risk for a run of identical losses.)
    downside = np.minimum(returns, 0.0)
    downside_dev = np.sqrt(np.mean(downside ** 2)) if has_trades else 0.0
    sortino_ratio = mean_return / downside_dev if downside_dev != 0 else 0

    # Maximum drawdown over the balance path implied by the trades; the series
    # always contains at least the initial balance.
    balance_series = np.cumsum([initial_balance] + trades)
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns)

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

def _run_episode(model, env):
    """Play one full episode of ``env`` using the model's deterministic action."""
    obs, _ = env.reset()
    done = False
    while not done:
        # deterministic=True: act greedily on the learned Q-values instead of
        # mixing in exploration actions, so repeated evaluations agree.
        action, _ = model.predict(obs, deterministic=True)
        obs, _, terminated, truncated, _ = env.step(action)
        done = terminated or truncated


def _report_metrics(env, title):
    """Print the environment's trade report followed by its metrics."""
    env.generate_report()
    metrics = calculate_metrics(env.trades, env.initial_balance, env.balance)
    print(f"\n--- {title} Metrics ---")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")


# Train and evaluate the model with all metrics
def train_and_evaluate(train_file='AMZN_TRAINING.csv', test_file='AMZN_TESTING.csv',
                       total_timesteps=100000, seed=None):
    """Train a DQN agent and report its performance on train and test data.

    Args:
        train_file: CSV used to fit the scaler and train the agent.
        test_file: CSV used for out-of-sample evaluation.
        total_timesteps: Training budget passed to ``model.learn``.
        seed: Optional seed forwarded to DQN for reproducible training.
    """
    # Load and normalize the data
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize the DQN model and train
    model = DQN("MlpPolicy", env_train, verbose=1, seed=seed)
    model.learn(total_timesteps=total_timesteps)

    # Replay the trained policy on the training data
    _run_episode(model, env_train)
    _report_metrics(env_train, "Training")

    # Replay the trained policy on the unseen testing data
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    _run_episode(model, env_test)
    _report_metrics(env_test, "Testing")

# Run the full pipeline: load the data, train the DQN agent, then replay it on
# the training and testing sets and print the reports and metrics.
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 7.44e+03 |
|    ep_rew_mean      | -6.48    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 701      |
|    time_elapsed     | 42       |
|    total_timesteps  | 29780    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000273 |
|    n_updates        | 7419     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 7.44e+03 |
|    ep_rew_mean      | -8.88    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 712      |
|    time_elapsed     | 83       |
|    total_timesteps  | 59560    |
| train/              |        

In [5]:
#singleagent a2c
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import A2C
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    """Read the train/test CSVs and standardise their OHLCV columns.

    The ``timestamp`` column of each frame is parsed to datetimes.  The
    scaler is fitted on the training frame only and then applied to the test
    frame, so no test-set statistics leak into the normalisation.

    Args:
        train_file: Path of the training CSV.
        test_file: Path of the testing CSV.

    Returns:
        Tuple ``(train_frame, test_frame, scaler)`` where both frames hold the
        scaled feature columns and ``scaler`` is the fitted ``StandardScaler``.
    """
    feature_cols = ['open', 'high', 'low', 'close', 'volume']

    train_frame, test_frame = (pd.read_csv(path) for path in (train_file, test_file))
    for frame in (train_frame, test_frame):
        frame['timestamp'] = pd.to_datetime(frame['timestamp'])

    scaler = StandardScaler()
    # Fit on train only; the test frame reuses the training mean/variance.
    train_frame[feature_cols] = scaler.fit_transform(train_frame[feature_cols])
    test_frame[feature_cols] = scaler.transform(test_frame[feature_cols])

    return train_frame, test_frame, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    """Single-asset trading environment with a shaped reward.

    Each observation is a ``(window_size, 5)`` float32 window of the
    open/high/low/close/volume columns of ``data``.  Actions are 0 = hold,
    1 = buy, 2 = sell: buying/selling from a neutral position opens a
    long/short, and the opposite action closes it.

    Reward shaping (all in scaled units): every step costs ``STEP_PENALTY``,
    opening a position earns ``ENTRY_BONUS``, and closing one adds the realised
    scaled profit multiplied by ``PROFIT_SCALE`` (losses are scaled the same
    way).  The balance, trade list and log use original price units obtained
    through ``scaler``.

    NOTE(review): the observation does not contain the current position, and a
    position still open when the episode ends is never closed or marked to
    market, so its profit/loss is absent from ``balance`` and ``trades``.
    """

    # Columns of ``data`` exposed to the agent, in observation order.  The
    # scaler is expected to have been fitted on these columns in this order.
    FEATURE_COLUMNS = ['open', 'high', 'low', 'close', 'volume']

    # Reward-shaping constants.
    STEP_PENALTY = 0.1   # charged on every step
    ENTRY_BONUS = 0.5    # granted when a position is opened
    PROFIT_SCALE = 10    # multiplier on realised scaled profit/loss

    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        """Store the price data and declare the action/observation spaces.

        Args:
            data: DataFrame containing at least ``FEATURE_COLUMNS``.
            window_size: Number of consecutive rows in one observation.
            initial_balance: Balance at the start of every episode.
            scaler: Optional fitted scaler used to convert scaled close prices
                back to original units.  When ``None`` prices are used as-is.
        """
        super().__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: window of (open, high, low, close, volume) rows
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(window_size, len(self.FEATURE_COLUMNS)), dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Start a new episode at the first row and return ``(obs, info)``."""
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        """Return the feature window starting at ``current_step`` as float32."""
        window = self.data.iloc[self.current_step:self.current_step + self.window_size]
        return window[self.FEATURE_COLUMNS].values.astype(np.float32)

    def inverse_scale_price(self, price):
        """Convert a scaled close price back to original units.

        Returns ``price`` unchanged when the environment has no scaler, so the
        default ``scaler=None`` no longer crashes ``step``.
        """
        if self.scaler is None:
            return price
        close_idx = self.FEATURE_COLUMNS.index('close')
        # The scaler works on full feature rows; only the close slot matters.
        row = [0] * len(self.FEATURE_COLUMNS)
        row[close_idx] = price
        return self.scaler.inverse_transform([row])[0][close_idx]

    def _close_position(self, profit, side, price):
        """Book ``profit`` (original units) and return to a neutral position."""
        self.balance += profit
        self.position = 0
        self.trades.append(profit)
        self.log.append(f"Agent closes {side} at {price}, profit: {profit}, Current Balance: {self.balance}, Holdings: 0")

    def step(self, action):
        """Apply ``action`` and advance one row.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` with the
            shaped reward described in the class docstring.
        """
        reward = -self.STEP_PENALTY  # Small penalty on every step
        # Fill at the close of the *latest* bar in the window the agent just
        # observed.  Filling at row ``current_step`` (the oldest bar) would let
        # the agent see window_size - 1 future bars before choosing its price.
        price_index = self.current_step + self.window_size - 1
        current_price = self.data['close'].iloc[price_index]
        original_price = self.inverse_scale_price(current_price)

        if action == 1:  # buy
            if self.position == 0:  # Only open a long if neutral
                self.position = 1
                self.entry_price = current_price
                self.log.append(f"Agent buys at {original_price}, Current Balance: {self.balance}, Holdings: 1 Long")
                reward += self.ENTRY_BONUS  # Reward for taking a position
            elif self.position == -1:  # Close short position
                scaled_profit = self.entry_price - current_price
                profit = self.inverse_scale_price(self.entry_price) - original_price
                reward += scaled_profit * self.PROFIT_SCALE  # Amplify realised P&L
                self._close_position(profit, "short", original_price)

        elif action == 2:  # sell
            if self.position == 0:  # Only open a short if neutral
                self.position = -1
                self.entry_price = current_price
                self.log.append(f"Agent sells (short) at {original_price}, Current Balance: {self.balance}, Holdings: 1 Short")
                reward += self.ENTRY_BONUS  # Reward for taking a position
            elif self.position == 1:  # Close long position
                scaled_profit = current_price - self.entry_price
                profit = original_price - self.inverse_scale_price(self.entry_price)
                reward += scaled_profit * self.PROFIT_SCALE  # Amplify realised P&L
                self._close_position(profit, "long", original_price)

        self.current_step += 1
        # Stop once the next window would run past the end of the data.
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), float(reward), terminated, truncated, {}

    def generate_report(self):
        """Print the trade log, final balance, profit and open holdings."""
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate additional metrics
def calculate_metrics(trades, initial_balance, final_balance):
    """Summarise a trading episode from its closed-trade profits.

    Args:
        trades: Sequence of per-trade profit/loss values (positive = win), in
            the order the trades were closed.  May be empty.
        initial_balance: Account balance at the start of the episode.
        final_balance: Account balance at the end of the episode.

    Returns:
        dict with the keys "Total Profit", "Cumulative Return", "Win Rate",
        "Profit Factor", "Sharpe Ratio", "Sortino Ratio" and
        "Maximum Drawdown".  Ratios whose denominator is zero (including the
        no-trades case) are reported as 0, except "Profit Factor", which is
        ``np.inf`` when there are no losing trades.
    """
    trades = list(trades)
    returns = np.asarray(trades, dtype=float)
    has_trades = returns.size > 0

    # Total profit and cumulative return
    total_profit = final_balance - initial_balance
    cumulative_return = total_profit / initial_balance

    # Win rate
    winning = [trade for trade in trades if trade > 0]
    losing = [trade for trade in trades if trade < 0]
    win_rate = len(winning) / len(trades) if trades else 0

    # Profit factor
    gross_profit = sum(winning)
    gross_loss = -sum(losing)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe ratio (per trade, no risk-free rate).  Guard the empty case:
    # np.mean/np.std of an empty array warn and return NaN.
    mean_return = returns.mean() if has_trades else 0.0
    std_return = returns.std() if has_trades else 0.0
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino ratio: the denominator is the downside deviation, i.e. the
    # root-mean-square of the negative returns measured against a target of 0.
    # (np.std of the clipped returns would subtract their mean and report zero
    # risk for a run of identical losses.)
    downside = np.minimum(returns, 0.0)
    downside_dev = np.sqrt(np.mean(downside ** 2)) if has_trades else 0.0
    sortino_ratio = mean_return / downside_dev if downside_dev != 0 else 0

    # Maximum drawdown over the balance path implied by the trades; the series
    # always contains at least the initial balance.
    balance_series = np.cumsum([initial_balance] + trades)
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns)

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

def _run_episode(model, env, log_steps=True):
    """Play one full episode of ``env`` using the model's deterministic action.

    When ``log_steps`` is true, every step's action and reward is printed
    (useful for debugging, but produces one line per data row).
    """
    obs, _ = env.reset()
    done = False
    while not done:
        # deterministic=True: evaluate the learned policy itself rather than a
        # random sample from it, so repeated evaluations agree.
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        if log_steps:
            print(f"Step: {env.current_step}, Action: {action}, Reward: {reward}")


def _report_metrics(env, title):
    """Print the environment's trade report followed by its metrics."""
    env.generate_report()
    metrics = calculate_metrics(env.trades, env.initial_balance, env.balance)
    print(f"\n--- {title} Metrics ---")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")


# Train and evaluate the model with action logging for debugging
def train_and_evaluate(train_file='AMZN_TRAINING.csv', test_file='AMZN_TESTING.csv',
                       total_timesteps=1000000, seed=None, log_steps=True):
    """Train an A2C agent and report its performance on train and test data.

    Args:
        train_file: CSV used to fit the scaler and train the agent.
        test_file: CSV used for out-of-sample evaluation.
        total_timesteps: Training budget passed to ``model.learn``.
        seed: Optional seed forwarded to A2C for reproducible training.
        log_steps: Print every evaluation step's action and reward.  Set to
            ``False`` to keep the notebook output readable.
    """
    # Load and normalize the data
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize the A2C model and train
    model = A2C("MlpPolicy", env_train, verbose=1, seed=seed)
    model.learn(total_timesteps=total_timesteps)

    # Replay the trained policy on the training data
    print("\n--- Training Session ---")
    _run_episode(model, env_train, log_steps=log_steps)
    _report_metrics(env_train, "Training")

    # Replay the trained policy on the unseen testing data
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    print("\n--- Testing Session ---")
    _run_episode(model, env_test, log_steps=log_steps)
    _report_metrics(env_test, "Testing")

# Run the full pipeline: load the data, train the A2C agent, then replay it on
# the training and testing sets and print the reports and metrics.
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| time/                 |          |
|    fps                | 473      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.844   |
|    explained_variance | -0.103   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.354   |
|    value_loss         | 0.322    |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 508      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -0.755   |
|    explained_variance | -0.00136 |
|    learning_rate      | 0.0007   |
|    n_updates    

In [3]:
#ensemble learning single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO, DQN, A2C
from sklearn.preprocessing import StandardScaler
from collections import Counter

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    """Read the train/test CSVs and standardise their OHLCV columns.

    The ``timestamp`` column of each frame is parsed to datetimes.  The
    scaler is fitted on the training frame only and then applied to the test
    frame, so no test-set statistics leak into the normalisation.

    Args:
        train_file: Path of the training CSV.
        test_file: Path of the testing CSV.

    Returns:
        Tuple ``(train_frame, test_frame, scaler)`` where both frames hold the
        scaled feature columns and ``scaler`` is the fitted ``StandardScaler``.
    """
    feature_cols = ['open', 'high', 'low', 'close', 'volume']

    train_frame, test_frame = (pd.read_csv(path) for path in (train_file, test_file))
    for frame in (train_frame, test_frame):
        frame['timestamp'] = pd.to_datetime(frame['timestamp'])

    scaler = StandardScaler()
    # Fit on train only; the test frame reuses the training mean/variance.
    train_frame[feature_cols] = scaler.fit_transform(train_frame[feature_cols])
    test_frame[feature_cols] = scaler.transform(test_frame[feature_cols])

    return train_frame, test_frame, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    """Trading environment in which one agent holds at most one long or short unit.

    Observations are a window of ``window_size`` consecutive rows of the
    'open', 'high', 'low', 'close' and 'volume' columns of ``data``.
    Actions are hold (0), buy (1) and sell (2). A buy while neutral opens a
    long, a sell while neutral opens a short, and the opposite action closes
    the open position. The reward is non-zero only on the step that closes a
    position and is expressed in the (scaled) units of ``data``; the balance
    and the trade list are tracked in original price units obtained through
    ``scaler``.

    Args:
        data: DataFrame with 'open', 'high', 'low', 'close', 'volume' columns.
        window_size: Number of rows in each observation.
        initial_balance: Starting balance restored on every reset.
        scaler: Fitted scaler used to map scaled close prices back to original
            prices. When None, prices in ``data`` are used as-is.
    """

    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super().__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []  # Realized profit of every closed trade (original units)
        self.entry_price = 0  # Scaled close price at which the open position was entered
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Restore the initial state and return ``(first_observation, info)``."""
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        """Return the float32 OHLCV window that starts at the current step."""
        window = self.data.iloc[self.current_step:self.current_step + self.window_size]
        return window[['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)

    def inverse_scale_price(self, price):
        """Map a scaled 'close' value back to its original price.

        If no scaler was supplied, the price is returned unchanged instead of
        failing on ``None.inverse_transform``.
        """
        if self.scaler is None:
            return price
        # The scaler works on full 5-column rows; only the 'close' slot (index 3) matters here.
        return self.scaler.inverse_transform([[0, 0, 0, price, 0]])[0][3]

    def _close_position(self, side, exit_price, profit):
        """Book the realized profit of a closed position and log the trade."""
        self.balance += profit
        self.position = 0
        self.trades.append(profit)
        self.log.append(f"Agent closes {side} at {exit_price}, profit: {profit}, Current Balance: {self.balance}, Holdings: 0")

    def step(self, action):
        """Apply ``action`` at the current row and advance one step.

        Returns:
            ``(observation, reward, terminated, truncated, info)``. ``reward``
            is the scaled price difference realized when a position is closed
            and 0 otherwise; ``truncated`` is always False.
        """
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        original_price = self.inverse_scale_price(current_price)  # Get original (inverse-scaled) price

        # If agent buys
        if action == 1:
            if self.position == 0:  # Only buy if neutral
                self.position = 1
                self.entry_price = current_price
                self.log.append(f"Agent buys at {original_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                profit = self.inverse_scale_price(self.entry_price) - original_price
                self._close_position("short", original_price, profit)

        # If agent sells
        elif action == 2:
            if self.position == 0:  # Only sell if neutral
                self.position = -1
                self.entry_price = current_price
                self.log.append(f"Agent sells (short) at {original_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                profit = original_price - self.inverse_scale_price(self.entry_price)
                self._close_position("long", original_price, profit)

        self.current_step += 1
        # Stop once the next window would no longer fit inside the data.
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

    def generate_report(self):
        """Print the trade log, final balance, total profit and open holdings."""
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate metrics
def calculate_metrics(trades, initial_balance, final_balance):
    """Summarize trading performance from a sequence of realized trade profits.

    Args:
        trades: Iterable of per-trade profits (list, tuple or array). May be empty.
        initial_balance: Balance at the start of the session.
        final_balance: Balance at the end of the session.

    Returns:
        dict with the keys "Total Profit", "Cumulative Return", "Win Rate",
        "Profit Factor", "Sharpe Ratio", "Sortino Ratio" and "Maximum Drawdown".
        With no trades, the win rate and both ratios are 0 (not NaN) and the
        maximum drawdown is 0. The profit factor is ``np.inf`` whenever there
        are no losing trades.
    """
    # Materialize once so tuples, arrays and generators behave like lists below.
    trade_list = list(trades)

    # Total Profit
    total_profit = final_balance - initial_balance

    # Cumulative Return
    cumulative_return = (final_balance - initial_balance) / initial_balance

    # Win Rate
    winning = [trade for trade in trade_list if trade > 0]
    losing = [trade for trade in trade_list if trade < 0]
    win_rate = len(winning) / len(trade_list) if trade_list else 0

    # Profit Factor
    gross_profit = sum(winning)
    gross_loss = -sum(losing)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    if trade_list:
        returns = np.asarray(trade_list, dtype=float)
        mean_return = returns.mean()

        # Sharpe Ratio
        std_return = returns.std()
        sharpe_ratio = mean_return / std_return if std_return != 0 else 0

        # Sortino Ratio (standard deviation of the returns clipped at zero)
        downside_std = np.minimum(returns, 0).std()
        sortino_ratio = mean_return / downside_std if downside_std != 0 else 0
    else:
        # Mean/std of an empty array are NaN (with warnings); report 0 instead.
        sharpe_ratio = 0
        sortino_ratio = 0

    # Maximum Drawdown
    balance_series = np.cumsum([initial_balance] + trade_list)  # Series of balance over time
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns) if drawdowns.size > 0 else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Ensemble model function
def ensemble_predict(actions):
    """Return the action chosen by majority vote among ``actions``.

    Every entry is converted with ``int()`` first, so scalar numpy arrays are
    accepted. On a tie, the action that was seen first among the tied ones wins.
    """
    tally = Counter()
    for raw_action in actions:
        # Normalize each vote to a plain int so equal actions share one counter key.
        tally[int(raw_action)] += 1
    top_entries = tally.most_common(1)
    winner, _vote_count = top_entries[0]
    return winner

# Train and evaluate the ensemble model
def _run_ensemble_episode(env, models):
    """Play one full episode on ``env``, acting by majority vote of ``models``."""
    obs, _ = env.reset()
    done = False
    while not done:
        # deterministic=True keeps the evaluation repeatable: each model votes
        # with its greedy action instead of sampling from its policy.
        votes = [model.predict(obs, deterministic=True)[0] for model in models]

        # Aggregate the actions through majority voting and step the environment
        obs, _reward, terminated, truncated, _info = env.step(ensemble_predict(votes))
        done = terminated or truncated


def _print_metrics(title, metrics):
    """Print a titled block with one ``name: value`` line per metric."""
    print(f"\n--- {title} ---")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")


def train_and_evaluate(train_file='AMZN_TRAINING.csv', test_file='AMZN_TESTING.csv',
                       total_timesteps=50000, window_size=10, seed=None):
    """Train PPO, DQN and A2C agents and evaluate their majority-vote ensemble.

    The three models are trained one after another on the training data. The
    ensemble is then replayed on the training data and on the testing data,
    and the metrics of both runs (plus the trade report of the testing run)
    are printed.

    Args:
        train_file: CSV file with the training data.
        test_file: CSV file with the testing data.
        total_timesteps: Training budget given to each of the three models.
        window_size: Observation window length of the environments.
        seed: Optional seed forwarded to every model; None leaves them unseeded.
    """
    # Load and normalize the data
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=window_size, scaler=scaler)

    # Initialize each model; the list order (PPO, DQN, A2C) also decides vote ties
    models = [
        PPO("MlpPolicy", env_train, verbose=1, seed=seed),
        DQN("MlpPolicy", env_train, verbose=1, seed=seed),
        A2C("MlpPolicy", env_train, verbose=1, seed=seed),
    ]

    # Train each model separately
    for model in models:
        model.learn(total_timesteps=total_timesteps)

    # Test the ensemble model on the training data
    _run_ensemble_episode(env_train, models)
    training_metrics = calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance)
    _print_metrics("Training Metrics", training_metrics)

    # Test the ensemble model on the testing data
    env_test = SingleAgentEnv(df_test_normalized, window_size=window_size, scaler=scaler)
    _run_ensemble_episode(env_test, models)

    # Generate report for the testing session
    env_test.generate_report()

    # Calculate and display testing metrics
    testing_metrics = calculate_metrics(env_test.trades, env_test.initial_balance, env_test.balance)
    _print_metrics("Testing Metrics", testing_metrics)

# Run the full pipeline when this cell executes: train_and_evaluate() trains the
# PPO, DQN and A2C models, replays their majority-vote ensemble on the training
# and testing data, and prints the metrics plus the testing-session report.
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 854  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 655          |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0075964513 |
|    clip_fraction        | 0.0423       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.09        |
|    explained_variance   | 0.124  