In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load the BTC/USD history CSV, indexed by its parsed 'date' column.
DATA_PATH = r'/home/masih/Desktop/btc-usd-max.csv'
FEATURE_COLUMNS = ['price', 'total_volume']  # column order matters: price must stay at index 0
TRAIN_FRACTION = 0.8

# The split below is positional, so sort by date first: an out-of-order file
# would otherwise leak later rows into the training partition.
data = pd.read_csv(DATA_PATH, parse_dates=['date'], index_col='date').sort_index()

# Split chronologically: first TRAIN_FRACTION of the rows for training, the rest for testing.
split_idx = int(len(data) * TRAIN_FRACTION)
train_data = data.iloc[:split_idx]
test_data = data.iloc[split_idx:]

# Normalize features. The scaler is fitted on the training rows only and then
# reused for the test rows, so scaled test values are not guaranteed to lie in [0, 1].
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train_data[FEATURE_COLUMNS])
test_scaled = scaler.transform(test_data[FEATURE_COLUMNS])

In [14]:
# Cell 2: Env Design

class TradingEnv:
    """Single-asset trading environment over a scaled 2-D feature array.

    Actions are 0 (buy with 10% of cash), 1 (sell 10% of the BTC held) and
    2 (hold). Trades execute at the unscaled price of the row at
    ``current_step``; the observation is the flattened window of the
    ``window_size`` rows *before* that row.

    Args:
        data: 2-D array of scaled features, one row per time step, with the
            price in column 0.
        window_size: Number of past rows included in each observation.
        scaler: Object exposing ``inverse_transform`` used to recover unscaled
            prices. When omitted, the module-level ``scaler`` is looked up the
            first time a price is needed (the original behaviour).
        initial_cash: Cash balance at the start of every episode.

    Raises:
        ValueError: If ``window_size`` leaves no row to trade on.
    """

    BUY, SELL, HOLD = 0, 1, 2

    def __init__(self, data, window_size, scaler=None, initial_cash=10000):
        if window_size >= len(data):
            raise ValueError(
                f"window_size ({window_size}) must be smaller than the number "
                f"of rows in data ({len(data)})"
            )
        self.data = data
        self.window_size = window_size
        self.scaler = scaler
        self.initial_cash = initial_cash
        self.max_step = len(data) - 1
        self._prices = None  # unscaled price column, filled lazily on first use

        self._reset_account()

    def _reset_account(self):
        """Rewind to the first tradable step and restore the starting portfolio."""
        self.current_step = self.window_size
        self.cash = self.initial_cash
        self.btc = 0.0
        self.total_value = self.cash

    def reset(self):
        """Start a new episode and return the initial observation."""
        self._reset_account()
        return self._get_state()

    def _get_state(self):
        """Return the ``window_size`` rows preceding ``current_step`` as a 1-D array."""
        state = self.data[self.current_step - self.window_size : self.current_step]
        return state.flatten()  # Flatten to 1D array for the neural net

    def _actual_price(self, step):
        """Return the unscaled price (column 0) of row ``step``."""
        if self._prices is None:
            # Invert the whole array once instead of one row per step.
            price_scaler = self.scaler if self.scaler is not None else scaler
            self._prices = price_scaler.inverse_transform(self.data)[:, 0]
        return self._prices[step]

    def step(self, action):
        """Execute ``action`` at the current price and advance one time step.

        The reward is the change in portfolio value between the price the
        trade executed at and the next step's price. A trade is value-neutral
        at its own execution price, so measuring the reward at that price
        (as was done previously) made it independent of the action taken;
        marking to the next price credits each action with its own outcome
        while leaving the episode's summed reward unchanged.

        Returns:
            Tuple ``(next_state, reward, done)``. ``next_state`` is ``None``
            once the last row has been traded; that final step has no later
            price to mark against, so its reward is zero up to float rounding.
        """
        assert action in (self.BUY, self.SELL, self.HOLD), "Invalid action"

        actual_price = self._actual_price(self.current_step)
        prev_value = self.total_value

        # Execute action
        if action == self.BUY:
            if self.cash > 0:
                # Use 10% of cash to buy BTC
                investment = 0.1 * self.cash
                self.btc += investment / actual_price
                self.cash -= investment
        elif action == self.SELL:
            if self.btc > 0:
                # Sell 10% of BTC
                sell_amount = 0.1 * self.btc
                self.cash += sell_amount * actual_price
                self.btc -= sell_amount

        # Move to next step
        self.current_step += 1
        done = self.current_step > self.max_step

        # Mark the post-trade portfolio to the next available price.
        mark_price = actual_price if done else self._actual_price(self.current_step)
        self.total_value = self.cash + self.btc * mark_price
        reward = self.total_value - prev_value

        next_state = self._get_state() if not done else None
        return next_state, reward, done

In [26]:
# dqn_agent_pt.py
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

# Define a simple fully connected Q-network
class DQN(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers and a linear output layer.

    Args:
        state_size: Number of input features per state.
        action_size: Number of actions; the network emits one Q-value per action.
        hidden_size: Width of both hidden layers. Defaults to 24, the value
            that was previously hard-coded.
    """

    def __init__(self, state_size, action_size, hidden_size=24):
        super().__init__()
        # Layer names are kept as fc1/fc2/fc3 so existing state dicts still load.
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return the Q-values for ``x``; no activation is applied to the output."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

class DQNAgent:
    """Epsilon-greedy DQN agent with a replay memory and a single Q-network.

    Args:
        state_size: Length of the 1-D state vector fed to the network.
        action_size: Number of discrete actions.
        device: Torch device the model and training tensors are placed on.
    """

    def __init__(self, state_size, action_size, device='cpu'):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = []            # Replay memory; grows without bound
        self.gamma = 0.95           # Discount factor
        self.epsilon = 1.0          # Exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.device = device
        self.model = DQN(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, greedy=False):
        """Choose an action for ``state``.

        Args:
            state: 1-D state vector.
            greedy: When True, skip exploration and always return the action
                with the highest predicted Q-value (useful for evaluation).

        Returns:
            The chosen action index as an ``int``.
        """
        if not greedy and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_values = self.model(state_tensor)
        return int(torch.argmax(q_values[0]).item())

    def replay(self, batch_size):
        """Run one gradient step on a random minibatch from the replay memory.

        Does nothing when ``batch_size`` is not positive or fewer than
        ``batch_size`` transitions are stored. Epsilon is decayed once per
        call that actually trains.
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Terminal transitions never bootstrap, and their next_state may be
        # None, so stand in a zero vector to keep the batch rectangular.
        filled_next = [
            np.zeros_like(state) if done else next_state
            for state, next_state, done in zip(states, next_states, dones)
        ]

        states_tensor = torch.as_tensor(
            np.array(states), dtype=torch.float32, device=self.device)
        next_states_tensor = torch.as_tensor(
            np.array(filled_next), dtype=torch.float32, device=self.device)
        actions_tensor = torch.as_tensor(
            np.array(actions, dtype=np.int64), device=self.device)
        rewards_tensor = torch.as_tensor(
            np.array(rewards, dtype=np.float32), device=self.device)
        dones_tensor = torch.as_tensor(
            np.array(dones, dtype=bool), device=self.device)

        # Build the targets with two batched forward passes instead of two
        # passes per sample. Only the taken action's entry is overwritten, so
        # the other actions contribute zero error to the loss.
        with torch.no_grad():
            targets_tensor = self.model(states_tensor)
            best_next_q = self.model(next_states_tensor).max(dim=1).values
            taken_targets = torch.where(
                dones_tensor,
                rewards_tensor,
                rewards_tensor + self.gamma * best_next_q,
            )
            row_index = torch.arange(batch_size, device=self.device)
            targets_tensor[row_index, actions_tensor] = taken_targets

        self.optimizer.zero_grad()
        outputs = self.model(states_tensor)
        loss = self.criterion(outputs, targets_tensor)
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Example usage (stub environment loop)
if __name__ == "__main__":
    # Smoke test: drive the agent with randomly generated transitions instead
    # of a real environment (sizes below are placeholder values).
    state_size, action_size = 4, 2
    episodes, batch_size = 5, 32
    agent = DQNAgent(state_size, action_size, device='cpu')

    for e in range(episodes):
        state = np.random.rand(state_size)
        step, done = 0, False
        # Each episode runs until the 50th transition, which is flagged terminal.
        while step < 50 and not done:
            action = agent.act(state)
            next_state = np.random.rand(state_size)
            reward = 1.0
            done = step == 49
            agent.remember(state, action, reward, next_state, done)
            state, step = next_state, step + 1
        # One training step per episode.
        agent.replay(batch_size)
        print(f"Episode {e+1}/{episodes} completed. Epsilon: {agent.epsilon:.4f}")


Episode 1/5 completed. Epsilon: 0.9950
Episode 2/5 completed. Epsilon: 0.9900
Episode 3/5 completed. Epsilon: 0.9851
Episode 4/5 completed. Epsilon: 0.9801
Episode 5/5 completed. Epsilon: 0.9752


In [28]:
# Hyperparameters
WINDOW_SIZE = 10
BATCH_SIZE = 32
EPISODES = 50
SEED = 42

# Seed every RNG the agent draws from (exploration uses numpy and random,
# minibatch sampling uses random, weight initialisation uses torch) so that a
# fresh Restart & Run All reproduces the same training run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Initialize environment and agent
env = TradingEnv(train_scaled, WINDOW_SIZE)
state_size = WINDOW_SIZE * train_scaled.shape[1]  # window rows x features per row
action_size = 3
agent = DQNAgent(state_size, action_size)

# Training loop
# NOTE: replay() decays epsilon on every call that trains, and it is called
# once per environment step here, so with decay 0.995 exploration reaches its
# floor of 0.01 after roughly 920 training steps.
for episode in range(EPISODES):
    state = env.reset()
    total_reward = 0
    done = False

    while not done:
        action = agent.act(state)
        next_state, reward, done = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        total_reward += reward
        state = next_state

        # Train with replay
        agent.replay(BATCH_SIZE)

    print(f"Episode: {episode+1}/{EPISODES}, Total Reward: ${total_reward:.2f}, Epsilon: {agent.epsilon:.2f}")

Episode: 1/50, Total Reward: $651315.50, Epsilon: 0.01
Episode: 2/50, Total Reward: $41882.73, Epsilon: 0.01
Episode: 3/50, Total Reward: $72409.25, Epsilon: 0.01
Episode: 4/50, Total Reward: $440353.17, Epsilon: 0.01
Episode: 5/50, Total Reward: $792.54, Epsilon: 0.01
Episode: 6/50, Total Reward: $208669.01, Epsilon: 0.01
Episode: 7/50, Total Reward: $70120.39, Epsilon: 0.01
Episode: 8/50, Total Reward: $167700.22, Epsilon: 0.01
Episode: 9/50, Total Reward: $292444.95, Epsilon: 0.01
Episode: 10/50, Total Reward: $161570.85, Epsilon: 0.01
Episode: 11/50, Total Reward: $322406.28, Epsilon: 0.01
Episode: 12/50, Total Reward: $337588.21, Epsilon: 0.01
Episode: 13/50, Total Reward: $180410.69, Epsilon: 0.01
Episode: 14/50, Total Reward: $423283.73, Epsilon: 0.01
Episode: 15/50, Total Reward: $213636.80, Epsilon: 0.01
Episode: 16/50, Total Reward: $3725351.29, Epsilon: 0.01
Episode: 17/50, Total Reward: $201238.96, Epsilon: 0.01
Episode: 18/50, Total Reward: $4312623.91, Epsilon: 0.01
Episo

In [1]:
# Evaluate the trained agent on the held-out rows.
# Requires the earlier cells (data loading, TradingEnv, DQNAgent, training) to
# have been run in this kernel session.
test_env = TradingEnv(test_scaled, WINDOW_SIZE)
state = test_env.reset()
total_reward = 0
done = False

# Switch exploration off for evaluation: with the residual training epsilon a
# fraction of test trades would be random, making the reported reward noisy
# and non-reproducible. The original value is restored afterwards so training
# can be resumed.
trained_epsilon = agent.epsilon
agent.epsilon = 0.0
try:
    while not done:
        action = agent.act(state)
        next_state, reward, done = test_env.step(action)
        total_reward += reward
        state = next_state
finally:
    agent.epsilon = trained_epsilon

print(f"Test Total Reward: ${total_reward:.2f}")

NameError: name 'TradingEnv' is not defined