In [1]:
import numpy as np
import torch
import gymnasium as gym
from gymnasium import spaces

import imageio
import os
import random

from Dueling_DQN_Agent.Dueling_DQN_Agent import Dueling_DQN_Agent
import Dueling_DQN_Agent.utils.help_classes as hc
import Dueling_DQN_Agent.utils.stats_functions as sf

import hockey.hockey_env as h_env

from importlib import reload


In [2]:
SEED_TRAIN_1 = 7489
SEED_TRAIN_2 = 1312
SEEDS_TEST = [291, 292, 293, 294, 295]

seed = SEED_TRAIN_1

In [5]:
reload(h_env)
env_name = "Hockey_test_1"
env = h_env.HockeyEnv()
"""if isinstance(env.action_space, spaces.Box):
    env = hc.DiscreteActionWrapper(env,5)"""

# Initialize the agent with the correct state/action space
state_space = env.observation_space
action_space = env.action_space
num_actions = env.num_actions
#agent = Dueling_DQN_Agent(state_space, action_space, num_actions = num_actions, use_eps_decay = True, seed = seed)
agent = Dueling_DQN_Agent(state_space, action_space, num_actions = num_actions, eps= 0, seed = seed)
opponent = h_env.BasicOpponent()

stats = []
losses = []

max_episodes = 1500
max_steps = 1000
train_iterations = 32  # Number of training steps per episode

for episode in range(max_episodes):

    state, _ = env.reset(seed = seed)
    state = state[0] if isinstance(state, tuple) else state  # Handle Gymnasium compatibility

    obs_agent2 = env.obs_agent_two()
    obs_agent2 = obs_agent2[0] if isinstance(obs_agent2, tuple) else obs_agent2

    total_reward = 0
    
    for t in range(max_steps):
        done = False
        a1 = env.discrete_to_continous_action(agent.perform_greedy_action(state))
        a2 = opponent.act(obs_agent2)

        print(f"a1: {a1}, shape: {np.asarray(a1).shape}")
        print(f"a2: {a2}, shape: {np.asarray(a2).shape}")

        full_action = np.hstack([a1, a2])

        print(f"full action: {full_action}, shape: {full_action.shape}")

        next_state, reward, done, truncated, _ = env.step(full_action)
        
        total_reward += reward

        agent.buffer.add_transition((state, a1, reward, next_state, done))      # Store transition in the agent"s memory and then train

        state = next_state
        obs_agent2 = env.obs_agent_two()

        if done: break
        
    losses.extend(agent.train(train_iterations))
    stats.append([episode, total_reward, t + 1])
    
    if agent._config["use_eps_decay"] and episode > int(0.5 * max_episodes):
        agent._perform_epsilon_decay()

    print(f"Episode {episode+1}/{max_episodes}, Total Reward: {total_reward}")
        
    if ((episode) % int(max_episodes/10) == 0) and episode > 0:
        agent.Q.save(env_name, name = f"episode_{episode}")

agent.Q.save(env_name, name = "training_finished")
sf.save_stats(env_name, stats, losses)


action: 2, shape: ()
a1: [1.0, 0.0, 0.0, 0.0], shape: (4,)
a2: [ 0.92988    -0.02738976  0.11309841  0.        ], shape: (4,)
full action: [ 1.          0.          0.          0.          0.92988    -0.02738976
  0.11309841  0.        ], shape: (8,)
action: 2, shape: ()
a1: [1.0, 0.0, 0.0, 0.0], shape: (4,)
a2: [ 0.9125658  -0.02734509  0.08522077  0.        ], shape: (4,)
full action: [ 1.          0.          0.          0.          0.9125658  -0.02734509
  0.08522077  0.        ], shape: (8,)
action: 2, shape: ()
a1: [1.0, 0.0, 0.0, 0.0], shape: (4,)
a2: [ 0.87999401 -0.02724962 -0.16578184  0.        ], shape: (4,)
full action: [ 1.          0.          0.          0.          0.87999401 -0.02724962
 -0.16578184  0.        ], shape: (8,)
action: 2, shape: ()
a1: [1.0, 0.0, 0.0, 0.0], shape: (4,)
a2: [ 0.83429012 -0.02698417  0.01301578  0.        ], shape: (4,)
full action: [ 1.          0.          0.          0.          0.83429012 -0.02698417
  0.01301578  0.        ], shape: (

RuntimeError: gather(): Expected dtype int64 for index