## Evaluation DQN against decision transformer

---

> Internship neural networks
>
> Group 4: Reinforcement learning
>
> Deadline 28.02.23 23:59

---

In [1]:
%run "Environment/Connect4.ipynb"
%run "OtherAgents/Agents.ipynb"
%run "DecisionTransformer/DecisionTransformer.ipynb"
%run "DQN/DQN.ipynb"
%run "DQN/utils.ipynb"

In [2]:
env = Connect4()
device_name = 'cuda'
device = torch.device(device_name)

dt = DTAgent(state_dim=42,
            act_dim=1,
            n_blocks=4,
            hidden_dim=128,
            context_len=10,
            n_heads=2,
            drop_p=0.1,
            rtg_target = 10,
            vocab_size = 7)

# example path
path = 'DecisionTransformer/dt_training/dt_Connect4_batch_size=64_context_len=10_n_blocks=4_hidden_dim=128_n_heads=2_model_23-02-28-17-54-34_80000.pt'
dt.model.load_state_dict(torch.load(path, device))

dqn = DQNAgent(n_actions = 7, lr = 1e-4, replay_size = 1)
path = 'DQN/final_dqn_against_negaMaxMix/DQN_AgainstNegaMax_Epochs20000_player1.pth'
dqn.policy_net.load_state_dict(torch.load(path, device))
path = 'DQN/final_dqn_against_negaMaxMix/DQN_AgainstNegaMax_Epochs20000_player2.pth'
dqn.policy_net2.load_state_dict(torch.load(path, device))

<All keys matched successfully>

In [3]:
def evaluate(player1, player2, eval_epsiodes = 100, render = False):
    '''
    evaluate the wins, loses and draws over a number of games
    
    player1: The first player
    player2: The second player
    eval_epsiodes: number of games
    render: if true, render games in output
    
    returns: winrate for each player and draw rate
    '''
    max_test_ep_len = 21
    total_timesteps = 0
    total_reward = 0
    running_reward = 0
    wins_p1 = 0
    wins_p2 = 0
    draws = 0
        
    name_p1 = player1.__class__.__name__
    name_p2 = player2.__class__.__name__
    p1_is_dt = False
    p2_is_dt = False
    
    if name_p1 == "DTAgent":
        player1.model.eval()
        p1_is_dt = True
    if name_p2 == "DTAgent":
        player2.model.eval()
        p2_is_dt = True
        
    for i in range(eval_epsiodes):

        with torch.no_grad():

            running_state = env.reset()
            
            if render:
                env.render()
            if p1_is_dt:
                player1.reset_agent()
            if p2_is_dt:
                player2.reset_agent()


            for t in range(max_test_ep_len):
                
                available_actions = env.get_available_actions()

                if p1_is_dt:
                    action_p1 = player1.select_action(t, running_reward, running_state, available_actions)
                else:
                    action_p1 = player1.select_action(running_state, available_actions, training=False)
                running_state, running_reward = env.make_move(action_p1, "p1")
                
                if p1_is_dt:
                    total_reward += running_reward

                if render:
                    env.render()
                if env.isDone:
                    if running_reward == 10:
                        wins_p1 += 1
                    else:
                        draws += 1
                    break
                    
                available_actions = env.get_available_actions()

                if p2_is_dt:
                    action_p2 = player2.select_action(t, running_reward, running_state, available_actions)
                else:
                    action_p2 = player2.select_action(running_state, available_actions, training=False)

                running_state, running_reward = env.make_move(action_p2, 'p2')
                
                if p2_is_dt:
                    total_reward += running_reward

                if render:
                    env.render()
                if env.isDone:
                    if running_reward == 10:
                        wins_p2 += 1
                    else:
                        draws += 1
                    break
    print("winrate_p1: ", wins_p1/eval_epsiodes)
    print("winrate_p2: ", wins_p2/eval_epsiodes)
    print("drawrate: ", draws/eval_epsiodes)
    
    return wins_p1/eval_epsiodes, wins_p2/eval_epsiodes, draws/eval_epsiodes

In [4]:
evaluate(dt, dqn)

winrate_p1:  0.44
winrate_p2:  0.54
drawrate:  0.02


In [5]:
evaluate(dqn, dt)

winrate_p1:  0.91
winrate_p2:  0.09
drawrate:  0.0


In [16]:
evaluate(dt, dqn, 1, render = True)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  rendered_board_state = self.board_state.copy().astype(np.str)


Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,,,,,,,


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  rendered_board_state = self.board_state.copy().astype(np.str)


Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,,,,O,,,


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  rendered_board_state = self.board_state.copy().astype(np.str)


Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,X,,,O,,,


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  rendered_board_state = self.board_state.copy().astype(np.str)


Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,X,,,O,,O,


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  rendered_board_state = self.board_state.copy().astype(np.str)


Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,X,X,,O,,O,


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  rendered_board_state = self.board_state.copy().astype(np.str)


Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,X,X,,O,O,O,


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  rendered_board_state = self.board_state.copy().astype(np.str)


Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,X,
5,X,X,,O,O,O,


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  rendered_board_state = self.board_state.copy().astype(np.str)


Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,O,
4,,,,,,X,
5,X,X,,O,O,O,


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  rendered_board_state = self.board_state.copy().astype(np.str)


Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,
2,,,,,,X,
3,,,,,,O,
4,,,,,,X,
5,X,X,,O,O,O,


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  rendered_board_state = self.board_state.copy().astype(np.str)


Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,
2,,,,,,X,
3,,,,,,O,
4,,O,,,,X,
5,X,X,,O,O,O,


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  rendered_board_state = self.board_state.copy().astype(np.str)


Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,X,
2,,,,,,X,
3,,,,,,O,
4,,O,,,,X,
5,X,X,,O,O,O,


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  rendered_board_state = self.board_state.copy().astype(np.str)


Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,X,
2,,,,,,X,
3,,,,,,O,
4,,O,,,,X,
5,X,X,,O,O,O,O


winrate_p1:  1.0
winrate_p2:  0.0
drawrate:  0.0
