In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)


Sat Nov  4 21:05:27 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    26W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


# Define a custom environment
class CustomEnvironment:
    """Toy environment with three discrete actions and a 3-element state vector.

    Actions (per the original author's notes): 0 = Buy, 1 = Sell, 2 = Hold.
    State slots (per the original author's notes): 0 = short, 1 = neutral,
    2 = long.

    NOTE(review): Buy/Sell move slots 0 and 2 in the same direction, which does
    not obviously match the short/long labels -- confirm the intended dynamics.

    Episodes never terminate on their own: `step` always reports `done=False`.
    """

    def __init__(self, imba=200):
        """Create the environment.

        Args:
            imba: Reward scale. Buy pays `imba`, Sell pays `-imba / 100`.
                Presumably short for "imbalance" -- TODO confirm.
        """
        self.imba = imba
        self.state = np.zeros(3)
        self.action_space = 3  # number of discrete actions

    def reset(self):
        """Zero the state and return a snapshot of it.

        Returns:
            A fresh copy of the state array, so later `step` calls cannot
            alter the value the caller is holding.
        """
        self.state = np.zeros(3)
        return self.state.copy()

    def step(self, action):
        """Apply `action` and return `(state, reward, done)`.

        Any action other than 0 or 1 is treated as Hold (state unchanged,
        reward 0).

        Returns:
            state: Copy of the updated state array.
            reward: `imba` for Buy, `-imba / 100` for Sell, 0 otherwise.
            done: Always False.
        """
        if action == 0:
            self.state[0] += 1
            self.state[2] += 1
            reward = self.imba
        elif action == 1:
            self.state[0] -= 1
            self.state[2] -= 1
            reward = -1 * (self.imba / 100)
        else:
            reward = 0

        done = False
        # Return a copy: the internal array is updated in place on every step,
        # so handing it out directly would make a caller's previous `state`
        # and new `next_state` the same (already mutated) object.
        return self.state.copy(), reward, done

# Define a DQN agent using PyTorch
class DQNAgent:
    """Minimal online Q-learning agent backed by a small fully connected network.

    Every `train` call performs one gradient step on a single transition. The
    same network produces both the current Q-values and the bootstrap values
    for the next state; there is no replay buffer or separate target network.
    """

    def __init__(self, state_dim, action_dim, learning_rate=0.001, discount_factor=0.99, epsilon=0.1):
        """Build the Q-network and its Adam optimizer.

        Args:
            state_dim: Size of the last axis of a state.
            action_dim: Number of discrete actions (size of the network output).
            learning_rate: Adam learning rate.
            discount_factor: Multiplier applied to the best next-state Q-value.
            epsilon: Probability of picking a uniformly random action.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon

        self.model = self.build_model()
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def build_model(self):
        """Return a two-hidden-layer MLP mapping a state to one Q-value per action."""
        return nn.Sequential(
            nn.Linear(self.state_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, self.action_dim),
        )

    def choose_action(self, state):
        """Pick an action for a single state using an epsilon-greedy policy.

        Args:
            state: Array-like whose last axis has `state_dim` entries. Leading
                axes are expected to be singleton, because the greedy choice
                is the argmax over the flattened network output.

        Returns:
            The chosen action index as a Python int.
        """
        if np.random.rand() < self.epsilon:
            return int(np.random.choice(self.action_dim))

        # Action selection needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            q_values = self.model(torch.tensor(state, dtype=torch.float32))
        return torch.argmax(q_values).item()

    def train(self, state, action, reward, next_state, done):
        """Take one gradient step toward the one-step TD target for a transition.

        Args:
            state: State the action was taken in (last axis of size `state_dim`).
            action: Index of the action taken.
            reward: Scalar reward received.
            next_state: Resulting state, same layout as `state`.
            done: Whether the transition ended the episode; if so the target is
                just `reward`, with no bootstrapped term.
        """
        action = int(action)
        state_tensor = torch.tensor(state, dtype=torch.float32)
        next_state_tensor = torch.tensor(next_state, dtype=torch.float32)

        q_values = self.model(state_tensor)
        with torch.no_grad():
            q_values_next = self.model(next_state_tensor)

        if done:
            td_target = float(reward)
        else:
            td_target = reward + self.discount_factor * torch.max(q_values_next)

        # The target equals the current prediction everywhere except the taken
        # action, so only that entry contributes to the loss. It is detached
        # because a regression target must not carry gradients.
        target = q_values.detach().clone()
        # Index the action on the LAST axis in both the terminal and the
        # non-terminal case, whatever leading singleton axes the state has.
        target[..., action] = td_target

        loss = nn.MSELoss()(q_values, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

def main(num_episodes=1000, max_steps=200, test_steps=1000):
    """Train a DQN agent on the custom environment, then evaluate it.

    Args:
        num_episodes: Number of training episodes.
        max_steps: Maximum number of steps per training episode.
        test_steps: Maximum number of steps in the evaluation rollout.

    Prints the total reward of every training episode and of the final
    evaluation rollout.
    """
    # Create the environment and agent
    env = CustomEnvironment()
    state_dim = 3
    agent = DQNAgent(state_dim, env.action_space)

    def as_batch(raw_state):
        # Copy the 3-element environment state into a (1, 1, 3) array, the
        # layout this loop has always fed to the agent. The copy also keeps
        # `state` and `next_state` distinct if the environment reuses its array.
        return np.array([[raw_state]])

    # Training loop
    for episode in range(num_episodes):
        state = as_batch(env.reset())
        total_reward = 0

        for _ in range(max_steps):
            action = agent.choose_action(state)
            next_state, reward, done = env.step(action)

            next_state = as_batch(next_state)
            agent.train(state, action, reward, next_state, done)

            total_reward += reward
            state = next_state

            if done:
                break

        print(f"Episode {episode + 1}, Total Reward: {total_reward}")

    # Test the agent. Exploration is switched off for the rollout so the
    # reported reward reflects the learned policy rather than random actions;
    # the agent's epsilon is restored afterwards.
    state = as_batch(env.reset())
    total_reward = 0
    training_epsilon = agent.epsilon
    agent.epsilon = 0.0
    try:
        for _ in range(test_steps):
            action = agent.choose_action(state)
            next_state, reward, done = env.step(action)

            total_reward += reward
            state = as_batch(next_state)

            if done:
                break
    finally:
        agent.epsilon = training_epsilon

    print(f"Test - Total Reward: {total_reward}")

# Entry point: run training and evaluation when this code is executed directly
# (as a script or as a notebook cell) rather than imported as a module.
if __name__ == "__main__":
    main()


Episode 1, Total Reward: 11588.0
Episode 2, Total Reward: 36586.0
Episode 3, Total Reward: 38194.0
Episode 4, Total Reward: 37392.0
Episode 5, Total Reward: 36188.0
Episode 6, Total Reward: 38186.0
Episode 7, Total Reward: 38388.0
Episode 8, Total Reward: 37786.0
Episode 9, Total Reward: 37986.0
Episode 10, Total Reward: 36584.0
Episode 11, Total Reward: 37394.0
Episode 12, Total Reward: 36582.0
Episode 13, Total Reward: 36986.0
Episode 14, Total Reward: 36982.0
Episode 15, Total Reward: 36580.0
Episode 16, Total Reward: 37788.0
Episode 17, Total Reward: 37592.0
Episode 18, Total Reward: 36986.0
Episode 19, Total Reward: 37786.0
Episode 20, Total Reward: 37786.0
Episode 21, Total Reward: 37180.0
Episode 22, Total Reward: 36984.0
Episode 23, Total Reward: 37990.0
Episode 24, Total Reward: 36782.0
Episode 25, Total Reward: 38390.0
Episode 26, Total Reward: 2582.0
Episode 27, Total Reward: 1388.0
Episode 28, Total Reward: 1786.0
Episode 29, Total Reward: 1996.0
Episode 30, Total Reward: 1