## DDQN in Hockey Environment - Evaluation

Evaluate a trained DDQN agent against various opponents in the hockey environment

Base DQN implementation adapted from HW7

In [1]:
import copy
import os
import sys
from importlib import reload

import numpy as np
from tqdm.notebook import tqdm

# Make the project packages importable: resolve "../" relative to the current
# working directory, step up one more level, and register that folder on
# sys.path unless it is already listed.
root_dir = os.path.split(os.path.abspath("../"))[0]
if sys.path.count(root_dir) == 0:
    sys.path.append(root_dir)

import DDQN.DDQN as ddqn
from DDQN.dqn_action_space import CustomActionSpace
from DDQN.DQN import TargetDQNAgent, DoubleDQNAgent
from DDQN.DDQN import DuelingDQNAgent, DoubleDuelingDQNAgent
from DDQN.dqn_evaluation import compare_agents, display_stats
from DDQN.dqn_trainer import RandomWeaknessBasicOpponent, SACOpponent
import hockey.hockey_env as h_env

# Re-execute the already-imported project modules in place (importlib.reload).
# NOTE(review): names bound earlier via `from DDQN.DDQN import ...` keep pointing
# at the pre-reload objects; only the `h_env` / `ddqn` module objects are refreshed.
reload(h_env)
reload(ddqn)

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


<module 'DDQN.DDQN' from '/home/kivanc/ders/RL/project/RL-Hockey/DDQN/DDQN.py'>

In [2]:
def running_mean(x, N):
    """Return the simple moving average of ``x`` over a window of ``N`` samples.

    The window sums are obtained from a single cumulative sum: prepending a
    zero lets ``cumsum[i + N] - cumsum[i]`` give the sum of ``x[i:i + N]``.

    Args:
        x: Sequence or array of numbers. Multi-dimensional input is flattened,
            because ``np.insert`` is called without an axis.
        N: Window length; must be a positive integer.

    Returns:
        A float array with ``len(x) - N + 1`` entries (only fully covered
        windows are reported). It is empty when ``N`` exceeds ``len(x)``.

    Raises:
        ValueError: If ``N`` is not positive. Without this guard a negative
            ``N`` would make the slices wrap around and silently produce
            meaningless values.
    """
    if N <= 0:
        raise ValueError(f"Window size N must be a positive integer, got {N}")

    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / float(N)

## Environment & Agent Initialization

In [18]:
# Single hockey environment (NORMAL mode) shared by every evaluation cell below.
env = h_env.HockeyEnv(mode=h_env.Mode.NORMAL)
env.reset()
# Action space handed to the DQN agent and to compare_agents.
act_space = CustomActionSpace()

In [23]:
# Agent under evaluation. The layer sizes are passed through unchanged to the
# DoubleDuelingDQNAgent constructor.
# NOTE(review): they presumably must match the architecture stored in the
# checkpoint loaded later - confirm.
agent_player = DoubleDuelingDQNAgent(
    env.observation_space,
    act_space,
    hidden_sizes=[512],
    hidden_sizes_A=[512, 512],
    hidden_sizes_V=[512, 512],
    use_torch=True,
)

# Choose the function that turns the agent's discrete action into the
# continuous action passed to env.step: the CustomActionSpace converter when the
# agent uses that action space, otherwise the environment's own converter.
discrete_to_continuous = (
    CustomActionSpace.discrete_to_continuous
    if isinstance(agent_player.action_space, CustomActionSpace)
    else env.discrete_to_continous_action
)

In [12]:
# agent_player = DoubleDQNAgent(
#     env.observation_space,
#     env.discrete_action_space,
#     hidden_sizes=[2048],
#     use_torch=False
# )

In [24]:
# Checkpoint file (relative to the working directory) loaded into both
# agent_player and the self-play opponent.
STATE_PATH = "./models/doub-duel-dqn-per-4it-custactspc/Q_model_best_strong_ep58800_wr0.97.ckpt"

In [25]:
# Restore the trained state into the agent under evaluation.
agent_player.load_state(STATE_PATH)

# Built-in BasicOpponent in its weak and strong variants.
agent_opp_weak = h_env.BasicOpponent(weak=True)
agent_opp_strong = h_env.BasicOpponent(weak=False)
# NOTE(review): the meaning of 0.2 is defined in dqn_trainer.py - presumably the
# probability of one of the two weakness modes; confirm there.
agent_opp_random = RandomWeaknessBasicOpponent(0.2)

#agent_opp_self = copy.deepcopy(agent_player)
# FIXME: This is a hack to avoid copying, see FIXME in dqn_trainer.py
# A second agent is built with the same constructor arguments and the same
# checkpoint, so the player faces an independently constructed copy of itself.
agent_opp_self = DoubleDuelingDQNAgent(
    env.observation_space,
    agent_player.action_space,
    hidden_sizes=[512],
    hidden_sizes_A=[512, 512],
    hidden_sizes_V=[512, 512],
    use_torch=True
)
agent_opp_self.load_state(STATE_PATH)

# SAC checkpoint location. The original machine-specific path stays as the
# default; set the SAC_MODEL_PATH environment variable to run the notebook on
# another machine without editing this cell.
SAC_MODEL_PATH = os.environ.get(
    "SAC_MODEL_PATH",
    "/home/kivanc/ders/RL/project/muhteshember-models/karahan-sac/champion2.pth",
)
agent_opp_sac = SACOpponent(env=env, pth_dir=SAC_MODEL_PATH)

Loaded configuration from checkpoint.
[SACAgent] 'buffer_state' missing or buffer has no set_state().


## Evaluation

### Winning Rates Against Opponents & Match Statistics

In [26]:
def _play_series(opponent):
    """Run the 1000-match comparison of agent_player against ``opponent``."""
    return compare_agents(
        agent_player, opponent, env, act_space, num_matches=1000, tqdm=tqdm
    )


# One series per opponent, in the same order as the report printed afterwards.
weak_opp_stats = _play_series(agent_opp_weak)
strong_opp_stats = _play_series(agent_opp_strong)
self_opp_stats = _play_series(agent_opp_self)
sac_stats = _play_series(agent_opp_sac)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [30]:
# Print one report per opponent; reports are separated by a blank line and each
# one is introduced by a 40-character rule.
labelled_results = (
    (weak_opp_stats, "Weak"),
    (strong_opp_stats, "Strong"),
    (self_opp_stats, "Self Copied"),
    (sac_stats, "SAC"),
)
for position, (stats, label) in enumerate(labelled_results):
    if position > 0:
        print()
    print("#" * 40)
    display_stats(stats, opp_name=label)

########################################
Weak Opponent:
Player Win Rate: 0.931
Opponent Win Rate: 0.008
Draw Rate: 0.061

Win Status (1 for win, 0 for draw, -1 for loss):
  Mean: 0.923
  Std: 0.2950779558015136

Returns:
  Player: 8147.188055166428
  Opponent: -12379.878966850789
  Difference: 20527.067022017218

########################################
Strong Opponent:
Player Win Rate: 0.962
Opponent Win Rate: 0.013
Draw Rate: 0.025

Win Status (1 for win, 0 for draw, -1 for loss):
  Mean: 0.949
  Std: 0.2727618008446197

Returns:
  Player: 8097.551757563351
  Opponent: -11890.65735833587
  Difference: 19988.20911589922

########################################
Self Copied Opponent:
Player Win Rate: 0.434
Opponent Win Rate: 0.481
Draw Rate: 0.085

Win Status (1 for win, 0 for draw, -1 for loss):
  Mean: -0.047
  Std: 0.9554009629469712

Returns:
  Player: -2681.9665395363563
  Opponent: -1669.5879489390627
  Difference: 1012.3785905972936

########################################
SAC 

### Rendered Demonstration Against Copied Self

In [34]:
# Play a single rendered match: agent_player against its copy agent_opp_self.
# Observations and rewards seen by the player are recorded per step.
obs_buffer = []
reward_buffer = []
obs, _ = env.reset()
obs_opp = env.obs_agent_two()

done = False
trunc = False
step = 0
# The loop condition alone ends the episode; no in-loop break is needed.
while not (done or trunc):
    step += 1
    env.render()

    # Both agents pick a discrete action that is converted before stepping.
    a1_discr = agent_player.act(obs)
    a1 = discrete_to_continuous(a1_discr)
    a2_discr = agent_opp_self.act(obs_opp)
    a2 = discrete_to_continuous(a2_discr)

    # The environment takes both players' actions as one concatenated vector.
    obs, r, done, trunc, info = env.step(np.hstack([a1, a2]))
    obs_buffer.append(obs)
    reward_buffer.append(r)

    obs_opp = env.obs_agent_two()

# Report the result from the info dict of the final step
# (winner: 0 is reported as a draw, 1 as a player win, anything else as an opponent win).
print(f"Episode done in {step} steps")
winner = info["winner"]
if winner == 0:
    print("Draw")
else:
    print("Winner: " + ("Player" if winner == 1 else "Opponent"))

Episode done in 30 steps
Winner: Player


### Rendered Demonstration Against Strong Basic Opponent

In [37]:
# Play a single rendered match: agent_player against the strong BasicOpponent.
# Observations and rewards seen by the player are recorded per step.
obs_buffer = []
reward_buffer = []
obs, _ = env.reset()
obs_opp = env.obs_agent_two()

done = False
trunc = False
step = 0
# The loop condition alone ends the episode; no in-loop break is needed.
while not (done or trunc):
    step += 1
    env.render()

    # Only the DQN player's discrete action is converted; the opponent's
    # action is passed to the environment as returned by act().
    a1_discr = agent_player.act(obs)
    a1 = discrete_to_continuous(a1_discr)
    a2 = agent_opp_strong.act(obs_opp)

    # The environment takes both players' actions as one concatenated vector.
    obs, r, done, trunc, info = env.step(np.hstack([a1, a2]))
    obs_buffer.append(obs)
    reward_buffer.append(r)

    obs_opp = env.obs_agent_two()

# Report the result from the info dict of the final step
# (winner: 0 is reported as a draw, 1 as a player win, anything else as an opponent win).
print(f"Episode done in {step} steps")
winner = info["winner"]
if winner == 0:
    print("Draw")
else:
    print("Winner: " + ("Player" if winner == 1 else "Opponent"))

Episode done in 163 steps
Winner: Player


### Rendered Demonstration Against SAC Opponent

In [51]:
# Play a single rendered match: agent_player against the SAC opponent.
# Observations and rewards seen by the player are recorded per step.
obs_buffer = []
reward_buffer = []
obs, _ = env.reset()
obs_opp = env.obs_agent_two()

done = False
trunc = False
step = 0
# The loop condition alone ends the episode; no in-loop break is needed.
while not (done or trunc):
    step += 1
    env.render()

    # Only the DQN player's discrete action is converted; the opponent's
    # action is passed to the environment as returned by act().
    a1_discr = agent_player.act(obs)
    a1 = discrete_to_continuous(a1_discr)
    a2 = agent_opp_sac.act(obs_opp)

    # The environment takes both players' actions as one concatenated vector.
    obs, r, done, trunc, info = env.step(np.hstack([a1, a2]))
    obs_buffer.append(obs)
    reward_buffer.append(r)

    obs_opp = env.obs_agent_two()

# Report the result from the info dict of the final step
# (winner: 0 is reported as a draw, 1 as a player win, anything else as an opponent win).
print(f"Episode done in {step} steps")
winner = info["winner"]
if winner == 0:
    print("Draw")
else:
    print("Winner: " + ("Player" if winner == 1 else "Opponent"))

Episode done in 66 steps
Winner: Opponent


In [52]:
# Close the shared environment once all evaluation and demo cells have run.
env.close()