## Training and evaluation of DQN

---

> Internship neural networks
>
> Group 4: Reinforcement learning
>
> Deadline 28.02.23 23:59

---

In [24]:
import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
import os

from itertools import count

In [25]:
# Execute the project notebooks that provide the names used below but not defined
# in this notebook (e.g. Connect4, DQNAgent, RandomAgent, NegaMaxAgent,
# win_rate_test, write_list, create_plots).
%run "../Environment/Connect4.ipynb"
%run "../utils/utils.ipynb"
%run "../OtherAgents/Agents.ipynb"
# NOTE(review): ../utils/utils.ipynb is executed a second time here; this looks
# redundant unless Agents.ipynb overwrites names from it - confirm before removing.
%run "../utils/utils.ipynb"
%run "DQN.ipynb"
%run "utils.ipynb"

# Hyperparameters and Configuration

In [26]:
# Epsilon-greedy exploration schedule: start value, lower bound and the
# multiplicative decay applied after every episode
EPS_START = 1
EPS_MIN = 0.05
EPS_DELTA = 0.9998

BATCH_SIZE = 64 # number of samples to draw from the replay buffer per update
GAMMA = 0.9 # discount of the rewards
TAU = 0.005 # polyak average update rate for the target network
lr = 1e-4 # learning rate handed to the DQNAgent

REPLAY_SIZE = 10000 # number of transitions that can be stored in the replay buffer; shouldn't be too small

NUM_EPISODES = 40000 # the number of games in the training

# Seed every random number generator used by the notebook to reproduce results.
# The training loop draws the player order (and the NegaMax depth) with
# np.random.choice, so NumPy has to be seeded as well.
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

# Set the device (cuda or cpu)

In [27]:
# Prefer the first CUDA device and fall back to the CPU when CUDA is unavailable
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


## Setup Environment

Define environment, dqnagent and opponents

In [28]:
# Shared game environment used by the training loop below
env = Connect4()

# get max no. of actions from action space
# (presumably one action per board column - confirm against Connect4)
n_actions = env.board_width

# The agent to be trained; receives the action count, the learning rate and the
# replay buffer capacity
dqn = DQNAgent(n_actions, lr, REPLAY_SIZE)

# Opponent for the "training against the random" run
randomPlayer = RandomAgent()

# Update function

Updates the DQN policy net according to the difference between the predicted q-values and the expected q-values (the target net is only used to compute the expected q-values)

- Double DQN
- Updates the priorities for the batch
- Backpropagates the loss through the network

In [29]:
def optimize_model(optimizer, memory, policy_net, target_net) -> "Optional[torch.Tensor]":
    '''
    Perform one Double-DQN optimization step on a batch from the replay memory.
    
    optimizer: the optimizer of the agent to update the parameters
    memory: the replay memory of the agent with the transitions; it is used through
            len(memory), memory.sample(BATCH_SIZE) -> (transitions, indices) and
            memory.update_priorities(indices, priorities)
    policy_net: the policy net of the agent to calculate the q-values
    target_net: the target net of the agent to get the target q-values (expected q values)
    
    returns: the detached huber loss for the batch, or None if the memory holds
             fewer than BATCH_SIZE transitions (no update is performed then)
    '''
    if len(memory) < BATCH_SIZE:
        return None
    
    # Get the transitions from the batch; every transition is read as
    # (state, action, reward, next_state) and the states get a leading axis
    transitions, indices = memory.sample(BATCH_SIZE)
    state_batch, action_batch, reward_batch, next_state_batch = zip(*[(np.expand_dims(m[0], axis=0), \
                                    [m[1]], m[2], np.expand_dims(m[3], axis=0)) for m in transitions])
    # tensor wrapper
    state_batch = torch.tensor(np.array(state_batch), dtype=torch.float, device=device)
    reward_batch = torch.tensor(np.array(reward_batch), dtype=torch.float, device=device)
    action_batch = torch.tensor(np.asarray(action_batch), dtype=torch.int64, device=device)
    
    # for assigning terminal state value = 0 later
    # (a terminal next state is stored as None, which np.expand_dims wraps into a
    # one-element object array - hence the check on s_[0])
    non_final_mask = torch.tensor([s_[0] is not None for s_ in next_state_batch], device=device)
    non_final_next_states = [torch.tensor(s_, dtype=torch.float, device=device).unsqueeze(0)
                             for s_ in next_state_batch if s_[0] is not None]
    has_non_final = len(non_final_next_states) > 0
    if has_non_final:
        non_final_next_states = torch.cat(non_final_next_states)

    # prediction from policy_net (q-values for the actions)
    state_action_values = policy_net(state_batch).gather(1, action_batch)
    
    # Compute V(s_{t+1}) for all next states (Double DQN):
    # the policy_net selects the best next action, the "older" target_net
    # evaluates it. This is merged based on the mask, such that we'll have either
    # the expected state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        if has_non_final:
            next_state_actions = torch.argmax(policy_net(non_final_next_states), dim=1)
            next_state_values[non_final_mask] = target_net(non_final_next_states).gather(
                1, next_state_actions.unsqueeze(-1)).squeeze(-1)
    
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    
    # Update the priorities according to the difference in the q-values.
    # Computed without gradient tracking so the replay memory never holds on to
    # the autograd graph of this batch.
    with torch.no_grad():
        delta = (expected_state_action_values.unsqueeze(1) - state_action_values).abs() + 1e-5
    memory.update_priorities(indices, delta)
    
    loss = F.huber_loss(state_action_values, expected_state_action_values.unsqueeze(1))
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()
    
    # Callers only record the value, so hand back a tensor without graph history
    return loss.detach()

In [30]:
# avoid resetting
# (module-level training state; presumably kept in its own cell so that re-running
# the training function cell does not clear it - confirm)
steps_done = 0  # NOTE(review): train() assigns its own local steps_done, so this global looks unused here
training_history = []  # train() appends [episode, win_rate, moves_taken] from the "Player 1" evaluation
training_history_player2 = []  # train() appends the same kind of row from the "Player 2" evaluation
WIN_RATE_TEST = 1000  # train() runs the win-rate evaluation every WIN_RATE_TEST episodes
LOSSES = []  # train() appends every value returned by optimize_model
NUM_EPISODES = 40000  # repeats the value already set in the hyperparameter cell

# Training function

The function helps training the DQN network against the opponents.

- MixedTraining is the training against NegaMax opponents of various depths, with the depth chosen at random for each game
- Play games against another player with randomly assigning the player position
- Updates the policy net after each game
- Polyak average for target network

In [31]:
def _soft_update(policy_net, target_net, tau):
    '''
    Polyak-average the policy net weights into the target net (in place).
    
    θ′ ← τ θ + (1 −τ )θ′
    '''
    target_net_state_dict = target_net.state_dict()
    policy_net_state_dict = policy_net.state_dict()
    for key in policy_net_state_dict:
        target_net_state_dict[key] = policy_net_state_dict[key]*tau + target_net_state_dict[key]*(1-tau)
    target_net.load_state_dict(target_net_state_dict)


def train(p1, p2 = None, EPS = 1, num_episodes = 1000):
    '''
    The training function. 
    
    - Agent plays games against an opponent
    - Updates the parameters of the network (one optimization step per game)
    - Saves data and model parameters
    
    p1: The DQN agent to train. Its policy_net/target_net/memory/optimizer are used
        when it moves first, the *2 counterparts when it moves second.
    p2: The opponent. If None, "mixed training" is used: a fresh NegaMaxAgent with
        a random depth from [2, 3, 4] is created for every game.
    EPS: The start epsilon for the exploration (decayed by EPS_DELTA per game,
         never below EPS_MIN)
    num_episodes: How many games will be played
    
    returns: None. The evaluation results are appended to the module-level lists
             training_history / training_history_player2, the losses to LOSSES,
             and checkpoints are written into FOLDER.
    '''
    
    # Define parameters for the training
    steps_done = 0
    depth = 0
    mixedTraining = p2 is None
    
    # Loop for the games to play
    for i in tqdm(range(num_episodes)):
        
        # record every WIN_RATE_TEST episodes the win rate of the agent as player1 and player2 respectively
        evaluate_now = i % WIN_RATE_TEST == WIN_RATE_TEST-1
        if evaluate_now:
            print("Player 1:")
            win_rate, movestaken, _, _ = win_rate_test(p1, p2, 100)
            training_history.append([i + 1, win_rate, movestaken])
            print("Player 2:")
            _, _, win_rate_p2, moves_taken_p2 = win_rate_test(p2, p1, 100)
            training_history_player2.append([i + 1, win_rate_p2, moves_taken_p2])
            
        # Reset the environment
        env.reset()
        state_p1 = env.board_state.copy()
        
        # For the mixed training choose the depth of the negamax
        if mixedTraining:
            if evaluate_now:
                # depth of the opponent that was used in the evaluation above
                print("depth: ", depth)
            depth = np.random.choice([2,3,4])
            p2 = NegaMaxAgent(env, depth)
            
        # Define player order randomly for the dqn to play as both players
        j = np.random.choice([0,1])
        if j == 0:
            player1, player2 = p1, p2
        else:
            player1, player2 = p2, p1
        
        # Loop over one game    
        for t in count():
            # First player select action and make a move in the environment
            available_actions = env.get_available_actions()
            action_p1 = player1.select_action(state_p1, available_actions, EPS, steps_done)
            steps_done += 1
            state_p1_, reward_p1 = env.make_move(action_p1, 'p1', isDqn = True)
            
            # Check if environment is done and push into memory
            if env.isDone:
                # the first player gets its own reward (0 for a draw)
                if player1.memory is not None:
                    player1.memory.push(state_p1.copy(), action_p1, reward_p1, None)
                # the second player gets 0 for a draw and the negated reward for
                # the first player's win; temp/action_p2 stem from the previous
                # round, so this relies on a game never ending with the very first move
                if player2.memory is not None:
                    player2.memory2.push(-temp.copy(), action_p2,
                                         reward_p1 if reward_p1 == 0 else -reward_p1, None)
                break
            
            # Second player select action and make a move in the environment
            available_actions = env.get_available_actions()
            action_p2 = player2.select_action(state_p1_, available_actions, EPS, steps_done)
            state_p2_, reward_p2 = env.make_move(action_p2, "p2", isDqn = True)

            # Check if environment is done and push into memory
            if env.isDone:
                # the first player gets 0 for a draw and is punished for the second player's win
                if player1.memory is not None:
                    player1.memory.push(state_p1.copy(), action_p1,
                                        reward_p2 if reward_p2 == 0 else -reward_p2, None)
                if player2.memory is not None:
                    player2.memory2.push(-state_p1_.copy(), action_p2, reward_p2, None)
                # NOTE(review): the second player's transition of the previous round
                # (temp, temp1) is not stored when the game ends here - confirm this is intended
                break
                
            # Push experience into memory (negative state for player2 network)
            # The second player's transition of the previous round can only be
            # completed now that its next state is known.
            if t != 0:
                if player2.memory is not None:
                    player2.memory2.push(-temp.copy(), temp1, reward_p2, -state_p1_.copy())
            if player1.memory is not None:
                player1.memory.push(state_p1.copy(), action_p1, reward_p1, state_p2_.copy())
                
            # Copy temporal information for next step memory
            state_p1 = state_p2_
            temp = state_p1_.copy()
            temp1 = action_p2
        
        # Perform one step of the optimization (on the policy network)
        if player1.policy_net is not None:
            loss = optimize_model(player1.optimizer, player1.memory, player1.policy_net, player1.target_net)
            LOSSES.append(loss)

        if player2.policy_net is not None:
            loss = optimize_model(player2.optimizer2, player2.memory2, player2.policy_net2, player2.target_net2)
            LOSSES.append(loss)
                
        # Update the epsilon
        EPS = max(EPS_MIN, EPS * EPS_DELTA)
            
        # Soft update of the target network's weights (for both player networks)
        _soft_update(p1.policy_net, p1.target_net, TAU)
        _soft_update(p1.policy_net2, p1.target_net2, TAU)
          
        # Save model every 10000 iterations (this includes i == 0, i.e. a
        # checkpoint taken right after the first game)
        if i % 10000 == 0:
            path = FOLDER + '/DQN_Epochs' + str(i) + '_player1.pth'
            torch.save(p1.policy_net.state_dict(), path)
            path = FOLDER + '/DQN_Epochs' + str(i) + '_player2.pth'
            torch.save(p1.policy_net2.state_dict(), path)

# Training

## Training against the random agent

In [32]:
# Training run: DQN against the random agent. All artifacts are written to FOLDER.
FOLDER = "final_dqn_against_random"

if not os.path.isdir(FOLDER):
    os.mkdir(FOLDER)

# Start with empty replay buffers and empty priority arrays for both networks
dqn.memory.memory.clear()
dqn.memory.priorities = np.array([])
dqn.memory2.memory.clear()
dqn.memory2.priorities = np.array([])

train(dqn, randomPlayer, EPS_START, NUM_EPISODES)

# Store the evaluation histories and the recorded losses
th = np.array(training_history)
th_p2 = np.array(training_history_player2)
write_list(th_p2, f"{FOLDER}/training_history_player2_path")
write_list(th, f"{FOLDER}/training_history_path")
write_list(LOSSES, f"{FOLDER}/losses")

# Store the final weights of both policy networks
path = f"{FOLDER}/FINAL_DQN_Epochs{NUM_EPISODES}_player1.pth"
torch.save(dqn.policy_net.state_dict(), path)
path = f"{FOLDER}/FINAL_DQN_Epochs{NUM_EPISODES}_player2.pth"
torch.save(dqn.policy_net2.state_dict(), path)

  1%|▏                                      | 221/40000 [00:06<19:56, 33.24it/s]


KeyboardInterrupt: 

In [None]:
# th holds the [episode, win_rate, moves_taken] rows from the "Player 1" evaluations
create_plots(th)

In [None]:
# th_p2 holds the [episode, win_rate, moves_taken] rows from the "Player 2" evaluations
create_plots(th_p2)

## Training against the NegaMaxAgent

In [22]:
# Training run: DQN against a NegaMax agent of depth 2. All artifacts are written to FOLDER.
# NOTE(review): training_history, training_history_player2 and LOSSES are
# module-level lists that are not cleared here, so entries of earlier runs are
# written out again - confirm this is intended.
FOLDER = "final_dqn_against_negaMax2"

if not os.path.isdir(FOLDER):
    os.mkdir(FOLDER)

nega_max = NegaMaxAgent(env, 2)

# Start with empty replay buffers and empty priority arrays for both networks
dqn.memory.memory.clear()
dqn.memory.priorities = np.array([])
dqn.memory2.memory.clear()
dqn.memory2.priorities = np.array([])

train(dqn, nega_max, 0.7, NUM_EPISODES)

# Store the evaluation histories and the recorded losses
th = np.array(training_history)
th_p2 = np.array(training_history_player2)
write_list(th_p2, f"{FOLDER}/training_history_player2_path")
write_list(th, f"{FOLDER}/training_history_path")
write_list(LOSSES, f"{FOLDER}/losses")

# Store the final weights of both policy networks
path = f"{FOLDER}/FINAL_DQN_Epochs{NUM_EPISODES}_player1.pth"
torch.save(dqn.policy_net.state_dict(), path)
path = f"{FOLDER}/FINAL_DQN_Epochs{NUM_EPISODES}_player2.pth"
torch.save(dqn.policy_net2.state_dict(), path)

  2%|▉                                      | 997/40000 [00:35<26:07, 24.88it/s]

Player 1:
Absolute wins_p1, wins_p2, draws:  (20, 80, 0)
Relative wins_p1, wins_p2, draws  [0.2, 0.8, 0.0]
Player 2:


  3%|▉                                   | 1004/40000 [00:40<4:08:36,  2.61it/s]

Absolute wins_p1, wins_p2, draws:  (88, 10, 2)
Relative wins_p1, wins_p2, draws  [0.88, 0.1, 0.02]


  5%|█▉                                    | 1999/40000 [01:18<25:08, 25.20it/s]

Player 1:
Absolute wins_p1, wins_p2, draws:  (23, 77, 0)
Relative wins_p1, wins_p2, draws  [0.23, 0.77, 0.0]
Player 2:


  5%|█▊                                  | 2002/40000 [01:24<6:30:12,  1.62it/s]

Absolute wins_p1, wins_p2, draws:  (96, 2, 2)
Relative wins_p1, wins_p2, draws  [0.96, 0.02, 0.02]


  7%|██▊                                   | 2996/40000 [02:04<24:59, 24.68it/s]

Player 1:
Absolute wins_p1, wins_p2, draws:  (21, 79, 0)
Relative wins_p1, wins_p2, draws  [0.21, 0.79, 0.0]
Player 2:


  8%|██▋                                 | 3003/40000 [02:11<4:54:28,  2.09it/s]

Absolute wins_p1, wins_p2, draws:  (98, 1, 1)
Relative wins_p1, wins_p2, draws  [0.98, 0.01, 0.01]


 10%|███▊                                  | 3999/40000 [02:54<27:20, 21.95it/s]

Player 1:
Absolute wins_p1, wins_p2, draws:  (5, 95, 0)
Relative wins_p1, wins_p2, draws  [0.05, 0.95, 0.0]
Player 2:
Absolute wins_p1, wins_p2, draws:  (90, 6, 4)
Relative wins_p1, wins_p2, draws  [0.9, 0.06, 0.04]


 10%|███▉                                  | 4128/40000 [03:08<27:18, 21.89it/s]


KeyboardInterrupt: 

In [None]:
# th holds the [episode, win_rate, moves_taken] rows from the "Player 1" evaluations
create_plots(th)

In [None]:
# th_p2 holds the [episode, win_rate, moves_taken] rows from the "Player 2" evaluations
create_plots(th_p2)

## Mixtraining

- Training against negamax agents of different depths (random choice each game)

In [None]:
# Training run: DQN against NegaMax agents of random depth (p2 is omitted, so
# train() switches to mixed training). All artifacts are written to FOLDER.
# NOTE(review): unlike the other runs the replay buffers are not cleared here, and
# training_history / LOSSES still contain the entries of earlier runs - confirm
# this is intended.
FOLDER = "final_dqn_against_negaMaxMix"

# Number of games of this run; also used for the checkpoint names so that the
# files are labelled with the episodes actually trained (not NUM_EPISODES).
MIX_NUM_EPISODES = 20000

if not os.path.isdir(FOLDER):
    os.mkdir(FOLDER)

train(p1 = dqn, EPS = 0.5, num_episodes = MIX_NUM_EPISODES)

# Store the evaluation histories and the recorded losses
th = np.array(training_history)
th_p2 = np.array(training_history_player2)
write_list(th_p2, FOLDER + "/training_history_player2_path")
write_list(th, FOLDER + "/training_history_path")
write_list(LOSSES, FOLDER + "/losses")

# Store the final weights of both policy networks
path = FOLDER + '/FINAL_DQN_Epochs' + str(MIX_NUM_EPISODES) + '_player1.pth'
torch.save(dqn.policy_net.state_dict(), path)
path = FOLDER + '/FINAL_DQN_Epochs' + str(MIX_NUM_EPISODES) + '_player2.pth'
torch.save(dqn.policy_net2.state_dict(), path)

In [None]:
# th holds the [episode, win_rate, moves_taken] rows from the "Player 1" evaluations
create_plots(th)

In [None]:
# th_p2 holds the [episode, win_rate, moves_taken] rows from the "Player 2" evaluations
create_plots(th_p2)

###### 