In [70]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random

import torch.nn.functional as F


import pandas as pd
from matplotlib import pyplot as plt

In [71]:
# Directory holding the labeled-point CSV files (relative to this notebook).
base_dir = '../Data_Generation/Data_files/labeled_points/'

# Full data set plus the test / train subsets, all located under base_dir.
base_fp = f'{base_dir}sl2_Z_2s_train.csv'
test_fp = f'{base_dir}subset_test_rows_SL2Z_Q_learn.csv'
train_fp = f'{base_dir}subset_train_rows_SL2Z_Q_learn.csv'

### Network Architecture
$4\times 1$ input (a flattened $2\times 2$ matrix) $\to$ hidden layers $\to$ $4\times 1$ output (one value per generator)

In [72]:
class DeepQModel(nn.Module):
    """Fully connected network: input_size -> 128 -> 64 -> 16 -> output_size.

    A ReLU follows every linear layer except the last one, so the final
    outputs are unbounded raw values.
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        # Widths of the input and hidden layers; consecutive pairs become
        # Linear layers, each followed by a ReLU.
        widths = (input_size, 128, 64, 16)
        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))

        # Output layer has no activation.
        layers.append(nn.Linear(widths[-1], output_size))
        self.stack = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass ``x`` through the layer stack and return the result."""
        network = self.stack
        return network(x)

In [73]:
MATRIX_SIZE = 4     # i.e. 2x2
NUM_GENERATORS = 4  # Number of generators in group (including inverses)


class DeepQAgent:
    """Deep Q-learning agent with a main network, a target network and replay memory.

    The main network is trained in ``replay``; the target network is a copy
    that is only refreshed when ``copy_main_to_target`` is called, and is used
    to compute the bootstrapped Bellman targets.

    Args:
        learning_rate: Step size for the Adam optimizer on the main network.
        epsilon: Initial probability of picking a random action.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` after each
            training step.
        min_epsilon: Lower bound for ``epsilon``.
        discount_factor: Discount applied to the next state's best Q-value.
        batch_size: Number of transitions sampled per training step.
        memory_capacity: Maximum number of transitions kept in replay memory.
    """

    def __init__(self, learning_rate: float, epsilon: float, epsilon_decay: float,
                 min_epsilon: float, discount_factor: float, batch_size: int, memory_capacity: int):
        # Initialize main and target models and make their weights identical.
        self.mainModel = DeepQModel(MATRIX_SIZE, NUM_GENERATORS)
        self.targetModel = DeepQModel(MATRIX_SIZE, NUM_GENERATORS)
        self.copy_main_to_target()

        # Initialize hyperparameters
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        self.discount_factor = discount_factor
        self.batch_size = batch_size
        self.memory_capacity = memory_capacity

        # Only the main model is optimized; the target model is updated by copying.
        self.optimizer = optim.Adam(self.mainModel.parameters(), lr=self.learning_rate)

        # Experience replay buffer of (state, action, reward, next_state, done) tuples
        self.memory = deque()

    @staticmethod
    def _stack_states(states) -> torch.Tensor:
        """Stack per-sample states (lists, arrays or tensors) into one float32 batch."""
        return torch.stack([torch.as_tensor(s, dtype=torch.float32) for s in states])

    def epsilon_greedy_search(self, state) -> int:
        """Pick an action index for ``state`` using an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest main-network output.

        Args:
            state: Sequence/array/tensor of ``MATRIX_SIZE`` numbers.

        Returns:
            The chosen action as a plain ``int`` in ``[0, NUM_GENERATORS)``.
        """
        if torch.rand(1).item() <= self.epsilon:
            return int(torch.randint(NUM_GENERATORS, (1,)).item())

        # Force float32 so integer-valued states are accepted by the Linear layers.
        x = torch.as_tensor(state, dtype=torch.float32)
        # Action selection needs no gradients.
        with torch.no_grad():
            q_values = self.mainModel(x)
        # Return an int on both branches so callers (and the replay memory)
        # always see the same type.
        return int(q_values.argmax().item())

    def copy_main_to_target(self):
        """Overwrite the target network's weights with the main network's."""
        self.targetModel.load_state_dict(self.mainModel.state_dict())

    def replay(self):
        """Run one training step on a random minibatch from replay memory.

        Does nothing when fewer than ``batch_size`` transitions are stored.
        Otherwise performs one optimizer step on the main network and decays
        ``epsilon`` (never below ``min_epsilon``).
        """
        # Ensure there is enough memory for a full batch
        if len(self.memory) < self.batch_size:
            return

        # Randomly sample a batch_size's worth of memory
        minibatch = random.sample(self.memory, self.batch_size)

        # Vectorize the sampled transitions
        states, actions, rewards, next_states, dones = zip(*minibatch)
        states = self._stack_states(states)
        next_states = self._stack_states(next_states)
        # int()/float() also unwrap 0-d tensors that may have been stored.
        actions = torch.tensor([int(a) for a in actions], dtype=torch.int64)
        rewards = torch.tensor([float(r) for r in rewards], dtype=torch.float32)
        dones = torch.tensor([float(d) for d in dones], dtype=torch.float32)

        # Bellman target: reward + discount_factor * max_a' Q_target(next_state, a').
        # No gradient flows into the target network, and terminal transitions
        # (done == 1) do not bootstrap from the next state.
        with torch.no_grad():
            # Tensor.max(dim=...) returns (values, indices); only the values are needed.
            best_next_q = self.targetModel(next_states).max(dim=1).values
            bellmans = rewards + self.discount_factor * best_next_q * (1.0 - dones)

        # Q-value the main network assigns to the action that was actually taken.
        taken_q = self.mainModel(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Calculate loss and perform optimization step
        loss = F.mse_loss(taken_q, bellmans)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Perform epsilon decay for each training step
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

    def remember(self, state, action, reward, next_state, done: bool):
        """Store one transition, evicting the oldest once capacity is exceeded."""
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) > self.memory_capacity:
            self.memory.popleft()



In [74]:
class DeepQAgentTrainer:
    """Pairs a DeepQAgent with a batch size."""

    def __init__(self, agent: DeepQAgent, batch_size) -> None:
        """Store the given agent and batch size on the instance."""
        self.agent, self.batch_size = agent, batch_size

In [75]:
# Load the CSV at base_fp into a DataFrame.
df = pd.read_csv(base_fp)
# NOTE(review): print() shows the frame as truncated plain text; ending the
# cell with `df.head()` would give the rich notebook table instead.
print(df)

         val1   val2   val3   val4  num_moves_Q_learning_needs  \
0        29.0   12.0   70.0   29.0                           5   
1      1657.0  298.0 -506.0  -91.0                          12   
2       -43.0   16.0    8.0   -3.0                           6   
3        65.0 -112.0  148.0 -255.0                           8   
4      -147.0   86.0   94.0  -55.0                           8   
...       ...    ...    ...    ...                         ...   
69994  -167.0  304.0 -128.0  233.0                          10   
69995    53.0   72.0  304.0  413.0                          10   
69996  -331.0 -154.0 -144.0  -67.0                           9   
69997   -39.0   28.0 -124.0   89.0                          10   
69998   -15.0  -94.0    4.0   25.0                           7   

       first_move_by_Q_learning  
0                             3  
1                             3  
2                             1  
3                             0  
4                             1  
...