In [46]:
import gymnasium as gym
import gym_anytrading
from gym_anytrading.envs import TradingEnv, StocksEnv, Actions, Positions
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

In [75]:
# Load the dataset: `t` selects which CSV under data/ is read
# (presumably a ticker symbol -- TODO confirm).
# The frame must provide "price", "volume" and "sma" columns, because
# mprocess_data selects them by name.
t = "A"
df = pd.read_csv(f"data/{t}.csv")
# Preview the first rows as a quick sanity check of the loaded columns.
print(df.head())

   price   volume    sma
0  37.35  3138230  44.90
1  37.31  2044994  44.95
2  36.22  2235402  45.01
3  37.35  2123003  45.06
4  35.77  2015776  45.12


In [76]:
# setup env
def mprocess_data(env):
    """Build the price series and feature matrix for a trading environment.

    Rows are taken from ``env.df`` starting ``env.window_size`` rows before
    ``env.frame_bound[0]`` and stopping just before ``env.frame_bound[1]``.

    Args:
        env: Object exposing ``df`` (a DataFrame with "price", "volume" and
            "sma" columns), ``frame_bound`` and ``window_size``.

    Returns:
        A ``(prices, signal_features)`` tuple: the 1-D "price" values and a
        2-D array whose columns are price, volume and sma, both restricted
        to the same row range.
    """
    first_bound, last_bound = env.frame_bound[0], env.frame_bound[1]
    # One slice object so prices and features are guaranteed to stay aligned.
    rows = slice(first_bound - env.window_size, last_bound)
    feature_columns = ["price", "volume", "sma"]

    prices = env.df["price"].to_numpy()[rows]
    signal_features = env.df[feature_columns].to_numpy()[rows]
    return prices, signal_features


class MStocksEnv(StocksEnv):
    """StocksEnv subclass whose data preparation is done by ``mprocess_data``."""

    def _process_data(self):
        """Return the ``(prices, signal_features)`` pair built by ``mprocess_data``."""
        return mprocess_data(self)


# Build the environment over the whole frame. frame_bound starts at 200, the
# same value as window_size, so the slice start computed in mprocess_data
# (frame_bound[0] - window_size) is row 0.
env = MStocksEnv(df=df, window_size=200, frame_bound=(200, len(df)))

In [42]:
# Report the environment's observation shape, data sizes and profit bound.
# The unwrapped environment is looked up once and reused for every line.
_unwrapped_env = env.unwrapped
print("env information:")
print("> shape:", _unwrapped_env.shape)
print("> df.shape:", _unwrapped_env.df.shape)
print("> prices.shape:", _unwrapped_env.prices.shape)
print("> signal_features.shape:", _unwrapped_env.signal_features.shape)
print("> max_possible_profit:", _unwrapped_env.max_possible_profit())

env information:
> shape: (200, 3)
> df.shape: (6097, 3)
> prices.shape: (6097,)
> signal_features.shape: (6097, 3)
> max_possible_profit: 1.928644751063614e+19


In [79]:
# Hyperparameters (all read by train())
# Number of episodes the training loop runs.
NUM_EPISODES = 10
# Transitions sampled per gradient step; learning starts once the replay
# memory holds at least this many transitions.
BATCH_SIZE = 64
# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.99
# Step size passed to the Adam optimizer.
LEARNING_RATE = 1e-3
# NOTE(review): this equals NUM_EPISODES, so the target network is synced only
# once, after the final episode -- confirm that is intended.
TARGET_UPDATE_FREQ = 10  # episodes between target-network syncs
# Maximum number of transitions kept in the replay memory.
MEMORY_CAPACITY = 10000
# Epsilon-greedy exploration: initial value, lower bound, and the factor
# epsilon is multiplied by after each episode.
# NOTE(review): 1.0 * 0.995**10 is about 0.951, so over NUM_EPISODES episodes
# the agent still acts almost entirely at random -- confirm that is intended.
EPS_START = 1.0
EPS_END = 0.01
EPS_DECAY = 0.995

In [70]:
# Define the Q-Network
class DQN(nn.Module):
    """Fully connected Q-network: flat state vector in, one Q-value per action out.

    Layout: input_dim -> 128 -> 64 -> output_dim, with a ReLU after each
    hidden linear layer. Customize ``hidden_sizes`` to change the architecture.
    """

    def __init__(self, input_dim, output_dim):
        """Create the layer stack.

        Args:
            input_dim: Length of the flattened observation vector.
            output_dim: Number of actions (one output unit per action).
        """
        super().__init__()
        hidden_sizes = (128, 64)

        stack = []
        fan_in = input_dim
        for width in hidden_sizes:
            stack.append(nn.Linear(fan_in, width))
            stack.append(nn.ReLU())
            fan_in = width
        stack.append(nn.Linear(fan_in, output_dim))

        # The attribute must stay named `fc` so state_dict keys ("fc.0.weight", ...)
        # remain compatible with previously saved checkpoints.
        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        """Return the Q-values for the batch of flat states ``x``."""
        q_values = self.fc(x)
        return q_values

In [71]:
# Replay Memory
class ReplayMemory:
    """Bounded FIFO store of transitions with uniform random sampling.

    Once ``capacity`` items are held, each new push discards the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty memory holding at most ``capacity`` transitions."""
        self.memory = deque(maxlen=capacity)

    def push(self, transition):
        """Append ``transition``; the deque evicts the oldest entry when full."""
        buffer = self.memory
        buffer.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored items
                (raised by ``random.sample``).
        """
        stored = self.memory
        return random.sample(stored, batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        stored = self.memory
        return len(stored)

In [72]:
# Preprocess environment observation
def preprocess_obs(obs):
    """Return ``obs`` as a new 1-D NumPy array in row-major order.

    ``obs`` is assumed to be a 2-D (window_size, num_features) observation,
    but any array-like input is accepted. The result never shares memory
    with ``obs``.
    """
    copied = np.array(obs)  # np.array copies, so the caller's data is untouched
    return copied.reshape(-1)

In [None]:
# Main Training Loop
def train():
    """Train a DQN agent on the module-level ``env`` and save its weights.

    Runs ``NUM_EPISODES`` episodes with epsilon-greedy exploration, stores
    every transition in a ``ReplayMemory`` and, once at least ``BATCH_SIZE``
    transitions are available, performs one gradient step per environment
    step against a separate target network.

    Reads the module-level names ``env``, ``DQN``, ``ReplayMemory``,
    ``preprocess_obs`` and the hyperparameter constants.

    Side effects:
        Prints one progress line per episode, writes the policy network's
        state dict to ``dqn_trading_model.pth`` and closes ``env``.
    """
    obs_space_shape = env.observation_space.shape  # e.g. (window_size, num_features)
    # np.prod and action_space.n yield NumPy integers; layer sizes should be plain ints.
    input_dim = int(np.prod(obs_space_shape))  # flatten the observation
    output_dim = int(env.action_space.n)       # 2 actions: Sell (0) and Buy (1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    policy_net = DQN(input_dim, output_dim).to(device)
    target_net = DQN(input_dim, output_dim).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)
    loss_fn = nn.MSELoss()  # built once instead of on every gradient step
    memory = ReplayMemory(MEMORY_CAPACITY)
    epsilon = EPS_START

    for episode in range(NUM_EPISODES):
        obs, info = env.reset(seed=episode)
        obs = preprocess_obs(obs)
        total_reward = 0
        done = False

        while not done:
            # Epsilon-greedy action selection
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    state_tensor = torch.as_tensor(
                        obs, dtype=torch.float32, device=device
                    ).unsqueeze(0)
                    q_values = policy_net(state_tensor)
                    action = q_values.argmax().item()

            next_obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            next_obs_flat = preprocess_obs(next_obs)
            total_reward += reward

            # Save transition. Only `terminated` is stored as the bootstrap mask:
            # a truncated episode (time limit / end of data) still has a valid
            # next state, so its value must not be zeroed in the TD target.
            memory.push((obs, action, reward, next_obs_flat, terminated))
            obs = next_obs_flat

            # Perform a training step
            if len(memory) >= BATCH_SIZE:
                transitions = memory.sample(BATCH_SIZE)
                (batch_obs, batch_actions, batch_rewards,
                 batch_next_obs, batch_terminated) = zip(*transitions)

                # Stack into single NumPy arrays first: building a tensor directly
                # from a sequence of ndarrays is very slow.
                batch_obs = torch.as_tensor(
                    np.stack(batch_obs), dtype=torch.float32, device=device
                )
                batch_next_obs = torch.as_tensor(
                    np.stack(batch_next_obs), dtype=torch.float32, device=device
                )
                batch_actions = torch.as_tensor(
                    np.asarray(batch_actions), dtype=torch.int64, device=device
                ).unsqueeze(1)
                batch_rewards = torch.as_tensor(
                    np.asarray(batch_rewards), dtype=torch.float32, device=device
                ).unsqueeze(1)
                batch_terminated = torch.as_tensor(
                    np.asarray(batch_terminated), dtype=torch.float32, device=device
                ).unsqueeze(1)

                # Compute current Q values for the actions that were taken
                current_q = policy_net(batch_obs).gather(1, batch_actions)

                # Compute target Q values using the target network
                with torch.no_grad():
                    max_next_q = target_net(batch_next_obs).max(1, keepdim=True)[0]
                    target_q = batch_rewards + GAMMA * max_next_q * (1 - batch_terminated)

                # Compute loss (MSE)
                loss = loss_fn(current_q, target_q)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # NOTE(review): epsilon decay and the target sync below both advance once
        # per episode, so with few episodes they barely move -- confirm intended.
        # Decay epsilon
        epsilon = max(EPS_END, epsilon * EPS_DECAY)
        print(f"Episode {episode+1}/{NUM_EPISODES}, Total Reward: {total_reward:.2f}, Epsilon: {epsilon:.3f}")

        # Update the target network periodically
        if (episode + 1) % TARGET_UPDATE_FREQ == 0:
            target_net.load_state_dict(policy_net.state_dict())

    # Save the trained model if desired
    torch.save(policy_net.state_dict(), "dqn_trading_model.pth")
    env.close()

In [81]:
# Run training only when this code executes as the main module (which is the
# case inside a notebook kernel), not when it is imported.
if __name__ == "__main__":
    train()

Episode 1/10, Total Reward: -1.22, Epsilon: 0.995
Episode 2/10, Total Reward: 1.21, Epsilon: 0.990
Episode 3/10, Total Reward: 32.52, Epsilon: 0.985
Episode 4/10, Total Reward: 30.76, Epsilon: 0.980
Episode 5/10, Total Reward: 152.89, Epsilon: 0.975
Episode 6/10, Total Reward: 12.34, Epsilon: 0.970
Episode 7/10, Total Reward: 135.65, Epsilon: 0.966
Episode 8/10, Total Reward: 47.42, Epsilon: 0.961
Episode 9/10, Total Reward: 64.32, Epsilon: 0.956
Episode 10/10, Total Reward: 94.06, Epsilon: 0.951
