## DQN

---

> Internship neural networks
>
> Group 4: Reinforcement learning
>
> Deadline 28.02.23 23:59

---

In [3]:
import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

### Define network

This is the convolutional network used to estimate the Q-values of every action for a given board state.

In [7]:
class DQN(nn.Module):
    '''
    Convolutional feed-forward network that predicts one Q-value per action
    for a batch of board states.

    n_actions (int): number of actions, i.e. the size of the output layer.
    board_shape (tuple of two ints): (rows, cols) of the board. Defaults to
        (6, 7), the size that was previously hard-coded.
    '''
    def __init__(self, n_actions, board_shape=(6, 7)) -> None:
        super().__init__()
        rows, cols = board_shape

        # Convolution layers. kernel_size=5 with padding=2 keeps the spatial
        # size unchanged, so the feature map stays rows x cols throughout.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5, padding=2)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=5, padding=2)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, padding=2)

        # 32 output channels of the last convolution, one value per board cell
        linear_input_size = rows * cols * 32

        # fc layers (attribute names are kept so existing state dicts still load)
        self.MLP1 = nn.Linear(linear_input_size, 50)
        self.MLP3 = nn.Linear(50, 50)
        self.MLP4 = nn.Linear(50, n_actions)

        # Dropout (probability 0.0, i.e. currently a no-op)
        self.dropout = nn.Dropout(0.0)
        self.dropout_cnn = nn.Dropout(0.0)

    def forward(self, x) -> torch.Tensor:
        '''
        Feeds a batch of board states through the model.

        x (torch.Tensor, shape (batch, 1, rows, cols)): board states with a
            single input channel.

        returns (torch.Tensor, shape (batch, n_actions)): the predicted
            Q-value of every action for every state in the batch.
        '''
        x = self.dropout_cnn(F.leaky_relu(self.conv1(x)))
        x = self.dropout_cnn(F.leaky_relu(self.conv2(x)))
        x = self.dropout_cnn(F.leaky_relu(self.conv3(x)))

        # Flatten everything except the batch dimension; unlike .view() this
        # also accepts non-contiguous tensors.
        x = torch.flatten(x, start_dim=1)
        x = self.dropout(F.leaky_relu(self.MLP1(x)))
        x = self.dropout(F.leaky_relu(self.MLP3(x)))
        return self.MLP4(x)

In [8]:
class DQNAgent():
    '''
    This is the class for the DQN agent.
    
    n_actions (int): number of actions which determine the output dimensionality of the dqn.
    lr (float): the learning rate for the optimizer
    replay_size (int): the size of the replay buffer
    '''
    def __init__(self, n_actions, lr, replay_size) -> None:
        '''
        Creates two independent learning setups (policy net, target net,
        optimizer, replay memory): the un-suffixed one and the one suffixed
        with "2", intended for playing as player 1 and player 2 respectively.

        n_actions (int): output size of every DQN that is created.
        lr (float): learning rate passed to both AdamW optimizers.
        replay_size (int): argument passed to both ReplayMemory instances.
        '''
        def build_pair():
            # The policy net is created first, then its target net, which
            # starts as an exact copy of the policy weights. (Per the original
            # note, the target is later updated with a polyak average during
            # training; that code is not part of this method.)
            online = DQN(n_actions).to(device)
            target = DQN(n_actions).to(device)
            target.load_state_dict(online.state_dict())
            return online, target

        self.policy_net, self.target_net = build_pair()
        self.policy_net2, self.target_net2 = build_pair()

        # Only the policy nets are optimized directly.
        self.optimizer, self.optimizer2 = (
            optim.AdamW(net.parameters(), lr=lr, amsgrad=True)
            for net in (self.policy_net, self.policy_net2)
        )

        # One replay buffer per setup.
        self.memory, self.memory2 = (ReplayMemory(replay_size) for _ in range(2))

    def select_action(self, state, available_actions, EPS = 0, steps_done=None, training=True) -> int:
        '''
        This function selects the action for the given states in an epsilon greedy way.
        
        state (ndarray 6x7): the current state that is observerd.
        available_actions (ndarray 6): the allowed actions in the environment.
        EPS (float): the epsilon parameter for exploration
        steps_done (int): the number of steps that are already done.
        training (boolean): controls the epsilon greedy policy.
        
        returns (int): the action with the highest q-value for the given state.
        '''
        # set the state
        if state.sum() == 1:
            state = -state
        state = torch.tensor(state, dtype=torch.float, device=device).unsqueeze(dim=0).unsqueeze(dim=0)
        
        # Decide for greedy or random decision
        if training == True:
            act = np.random.choice(['model','random'], 1, p=[1-EPS, EPS])[0]
        else:
            act = 'model'
        
        # follow epsilon-greedy policy
        if act == 'model':
            with torch.no_grad():
                # action recommendations from policy net
                if state.sum() == 0:
                    r_actions = self.policy_net(state)[0, :]
                else:
                    r_actions = self.policy_net2(state)[0, :]
                state_action_values = [r_actions[action] for action in available_actions]
                for i in range(len(state_action_values)):
                    state_action_values[i] = state_action_values[i].cpu()
                argmax_action = np.argmax(state_action_values)
                greedy_action = available_actions[argmax_action]
                return greedy_action
        # choose random action
        else:
            return random.choice(available_actions)