In [3]:
import torch
import torch.nn as nn
from collections import deque
import random

import torch.nn.functional as F
import numpy as np

In [None]:
# Directory holding the generated data files, relative to this notebook.
# It already ends with a slash, so file names are appended directly.
base_dir = '../Data_Generation/Data_files/'

# Full data set, held-out test rows, and training rows for the Q-learning runs.
base_fp, test_fp, train_fp = (
    base_dir + file_name
    for file_name in (
        'subset_sl2_Z.csv',
        'subset_test_rows_SL2Z_Q_learn.csv',
        'subset_train_rows_SL2Z_Q_learn.csv',
    )
)

### Network Architecture
Input $4\times 1$ $\to$ hidden layers ($128 \to 64 \to 16$ units, each followed by ReLU) $\to$ output $4\times 1$

In [None]:
class DeepQModel(nn.Module):
    """Fully connected network mapping ``input_size`` features to ``output_size`` values.

    The layers are ``input_size -> 128 -> 64 -> 16 -> output_size`` with a ReLU
    after every linear layer except the last one.
    """

    def __init__(self, input_size, output_size):
        """Build the layer stack.

        Args:
            input_size: Number of features in each input row.
            output_size: Number of values produced for each input row.
        """
        super().__init__()

        # Consecutive pairs of widths define the linear layers, in order.
        widths = [input_size, 128, 64, 16, output_size]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.pop()  # the final linear layer has no activation after it

        self.stack = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass ``x`` through the layer stack and return the result."""
        outputs = self.stack(x)
        return outputs

In [None]:
class DeepQAgent:
    """Deep Q-learning agent: a main network, a target network and a replay memory.

    Both networks are ``DeepQModel(4, 4)``. The main network is the one trained
    by ``replay``; the target network only changes when ``copy_main_to_target``
    is called.
    """

    def __init__(self, learning_rate: float, gamma: float, epsilon: float, discount_factor: float,
                 batch_size: int, memory_capacity: int):
        """Create the two networks, the optimizer and an empty replay memory.

        Args:
            learning_rate: Step size of the optimizer that updates the main model.
            gamma: Stored on the instance; not read by any method of this class.
            epsilon: Stored on the instance; not read by any method of this class.
            discount_factor: Weight of the next-state value in the Bellman target.
            batch_size: Number of transitions sampled by each ``replay`` call.
            memory_capacity: Maximum number of transitions kept in memory.
        """
        # Initialize main and target models and set weights to be equivalent
        self.mainModel = DeepQModel(4, 4)
        self.targetModel = DeepQModel(4, 4)

        # load_state_dict copies the parameter values, so both models start identical
        self.copy_main_to_target()

        # Initialize Hyperparameters
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.discount_factor = discount_factor
        self.batch_size = batch_size
        self.memory_capacity = memory_capacity

        # Only the main model is optimized; the target model is refreshed by copying.
        self.optimizer = torch.optim.Adam(self.mainModel.parameters(), lr=learning_rate)

        # Experience Replay Buffers
        self.memory = deque()

    def copy_main_to_target(self):
        """Overwrite the target model's parameters with the main model's."""
        self.targetModel.load_state_dict(self.mainModel.state_dict())

    def replay(self):
        """Train the main model for one step on a random minibatch from memory.

        The regression target for each sampled transition is
        ``reward + discount_factor * max_a' Q_target(next_state, a')``, with the
        second term dropped when the transition is marked done. The prediction
        is the main model's value for the action that was actually taken.

        Returns:
            The minibatch mean-squared-error loss as a float, or ``None`` when
            memory holds fewer than ``batch_size`` transitions (no update made).
        """
        # Ensure there is enough memory for a full batch
        if len(self.memory) < self.batch_size:
            return None

        # Random sample a batch_size's worth of memory
        minibatch = random.sample(self.memory, self.batch_size)

        # Vectorizing data
        states, actions, rewards, next_states, dones = zip(*minibatch)
        states = torch.tensor(states, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)

        # Value the main model assigns to the action taken in each state.
        # Assumes each stored action is an integer index into the model's
        # outputs -- TODO confirm against the code that calls remember().
        q_taken = self.mainModel(states).gather(1, actions.view(-1, 1)).squeeze(1)

        # Bellman target: reward + discount_factor * best next-state value.
        # max(dim=1) returns (values, indices); only the values are needed.
        # No gradient flows through the target model.
        with torch.no_grad():
            next_q = self.targetModel(next_states).max(dim=1).values
            # (1 - done) removes the bootstrap term for terminal transitions.
            bellmans = rewards + self.discount_factor * next_q * (1.0 - dones)

        loss = F.mse_loss(q_taken, bellmans)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def remember(self, state, action, reward, next_state, done: bool):
        """Append one transition to memory, evicting the oldest if over capacity."""
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) > self.memory_capacity:
            self.memory.popleft()



In [None]:
class DeepQAgentTrainer:
    """Holds a DeepQAgent together with a batch size."""

    def __init__(self, agent: DeepQAgent, batch_size) -> None:
        """Store ``agent`` and ``batch_size`` on the instance; nothing else is done."""
        self.agent, self.batch_size = agent, batch_size