In [1]:
import time
import random
from collections import deque
from pprint import pprint

import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from mlagents_envs.environment import UnityEnvironment, ActionTuple
from mlagents_envs.side_channel.environment_parameters_channel import EnvironmentParametersChannel
import numpy as np
from gymnasium import spaces 
from stable_baselines3.common.buffers import ReplayBuffer

from utils_policy_train import *

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [None]:
CONFIG_PATH = './test_config/debug.yaml'
BASE_TIME = 1763635551

# Args

In [3]:
args = parse_config(CONFIG_PATH)
args.seed = random.randint(0, 2**16)
# args.name = generate_funny_name()

pprint(vars(args))

{'actor_network_layers': [16, 16, 16, 16],
 'alpha': 1.0,
 'alpha_lr': 0.0004,
 'autotune': True,
 'batch_size': 128,
 'bootstrap': True,
 'bootstrap_batch_proportion': 0.85,
 'buffer_size': 10000,
 'cuda': True,
 'env_id': 'std',
 'exp_name': 'base+wp',
 'gamma': 0.995,
 'input_stack': 6,
 'learning_starts': 25,
 'loss_log_interval': 100,
 'metrics_log_interval': 300,
 'metrics_smoothing': 0.985,
 'noise_clip': 0.1,
 'policy_frequency': 2,
 'policy_lr': 0.0004,
 'q_ensemble_n': 6,
 'q_lr': 0.0004,
 'q_network_layers': [16, 16, 16, 16],
 'reward_scale': 0.001,
 'seed': 61537,
 'target_network_frequency': 2,
 'tau': 0.01,
 'torch_deterministic': True,
 'total_timesteps': 100000,
 'update_per_step': 2}


In [4]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

Using device: cpu


# Seeding

In [5]:
# seeding
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic

# Start Environment

In [6]:
# Create the channel
env_info = CustomChannel()
param_channel = EnvironmentParametersChannel()

# env setup
env = UnityEnvironment(None, seed=args.seed, side_channels=[env_info, param_channel])

In [7]:
env.reset()

# Environment Variables and Log

In [None]:
run_name = f"{args.exp_name}_{int(time.time()) - BASE_TIME}"
args.full_name = run_name

In [9]:
env_info.settings

{'agent_settings': {'step_after_goal': 10,
  'max_step': 1000,
  'agent_id': 3,
  'max_movement_speed': 6.0,
  'max_turn_speed': 140.0,
  'move_smooth_time': 0.10000000149011612,
  'goal_reward': 10.0,
  'wall_hit_penalty': 0.0,
  'progress_reward': 0.5,
  'stagnation_penalty': -0.20000000298023224,
  'ema_range_penalty': 5.0,
  'ema_smoothing': 0.10000000149011612},
 'ray_sensor_settings': {'sensor_name': 'RayPerceptionSensor',
  'rays_per_direction': 10,
  'max_ray_degrees': 90.0,
  'sphere_cast_radius': 0.0,
  'ray_length': 10.0,
  'observation_stacks': 1,
  'alternating_ray_order': False,
  'use_batched_raycasts': True,
  'min_observation': -1.0,
  'max_observation': 1.0,
  'ignore_last_ray': False},
 'behaviour_parameters_settings': {'behavior_name': 'NavigationAgent',
  'observation_size': 10,
  'stacked_vector': 1,
  'min_observation': -256.0,
  'max_observation': 256.0,
  'continuous_actions': 2,
  'min_action': -1.0,
  'max_action': 1.0},
 'obstacles_settings': {'obstacles_tot

In [None]:
if env_info.settings['ray_sensor_settings']['rays_per_direction'] > 1:
    print('ERRORE RAYCAST STACK > 1') # ???
if env_info.settings['behaviour_parameters_settings']['stacked_vector'] > 1:
    print('ERRORE STATE STACK > 1')

ERRORE RAYCAST STACK > 1


In [11]:
BEHAVIOUR_NAME = env_info.settings['behaviour_parameters_settings']['behavior_name'] + '?team=0'

RAY_PER_DIRECTION = env_info.settings['ray_sensor_settings']['rays_per_direction']
RAYCAST_MIN = env_info.settings['ray_sensor_settings']['min_observation']
RAYCAST_MAX = env_info.settings['ray_sensor_settings']['max_observation']
RAYCAST_SIZE = 2*RAY_PER_DIRECTION + 1

STATE_SIZE = env_info.settings['behaviour_parameters_settings']['observation_size'] - 1
STATE_MIN = env_info.settings['behaviour_parameters_settings']['min_observation']
STATE_MAX = env_info.settings['behaviour_parameters_settings']['max_observation']

ACTION_SIZE = env_info.settings['behaviour_parameters_settings']['continuous_actions']
ACTION_MIN = env_info.settings['behaviour_parameters_settings']['min_action']
ACTION_MAX = env_info.settings['behaviour_parameters_settings']['max_action']

TOTAL_STATE_SIZE = (STATE_SIZE + RAYCAST_SIZE)*args.input_stack

In [None]:
# creating the training networks
actor = DenseActor(TOTAL_STATE_SIZE, ACTION_SIZE, ACTION_MIN, ACTION_MAX, args.actor_network_layers).to(DEVICE)
actor.load_state_dict(torch.load(args.model_path + f'/actor_{args.model_checkpoint}.pth'))
actor.eval()

# start algorithm

In [None]:
# start training
save_path = './models/' + run_name
os.makedirs(save_path, exist_ok=True)
episodic_stats = None

In [16]:
decision_obs, terminal_obs = observe_batch_stacked(env, BEHAVIOUR_NAME, args.input_stack, TOTAL_STATE_SIZE)

print(f'Start Training - {run_name}')
start_time = time.time()
unity_end_time = -1
unity_start_time = -1
global_step = 0

Start Training - base+wp_974502


In [None]:
save_test = []

In [None]:
while global_step < args.total_timesteps:

    # --- ACTION SELECTION ---
    if global_step < args.learning_starts * 2:
        action = get_initial_action_batch(decision_obs[0])
    else:
        obs_tensor = torch.as_tensor(decision_obs[1], dtype=torch.float32).to(DEVICE)
        
        actor.eval()
        with torch.no_grad():
            action, _, _, _ = actor.get_action(obs_tensor)
        action = action.cpu().numpy()
    
    # Action Taken
    # decision_obs.append(action) 
    if len(action) > 1: 
        a = ActionTuple(continuous=action)
        env.set_actions(BEHAVIOUR_NAME, a)
    
    # --- ENVIRONMENT STEP ---
    unity_start_time = time.time()
    env.step()
    unity_end_time = time.time()
            
    next_decision_obs, next_terminal_obs = observe_batch_stacked(env, BEHAVIOUR_NAME, args.input_stack, TOTAL_STATE_SIZE)
    
    global_step += 1 # Aggiunto incremento step (mancava!)

    # --- STATS UPDATE ---
    while env_info.stop_msg_queue:
        msg = env_info.stop_msg_queue.pop()
        
        if global_step >= args.learning_starts:
            if episodic_stats is None:
                episodic_stats = {}
                for key in msg:
                    if key != 'id': episodic_stats[key] = msg[key]
            else:
                for key in msg:
                    if key != 'id': 
                        episodic_stats[key] = episodic_stats[key]*args.metrics_smoothing + (1 - args.metrics_smoothing)*msg[key]

    # update current obs
    decision_obs = next_decision_obs
    terminal_obs = next_terminal_obs

Start Learning  - base+wp_974502
[300/100000] |success: 0.00000|collisions: 4.00044|reward: -177.55041|length: 1001.00000| SPS: 6
[600/100000] |success: 0.00000|collisions: 4.00175|reward: -176.75864|length: 1001.00000| SPS: 7
[900/100000] |success: 0.00000|collisions: 3.87247|reward: -177.25134|length: 1001.00000| SPS: 8
[1200/100000] |success: 0.01500|collisions: 3.74987|reward: -174.70464|length: 988.23500| SPS: 8
ERRORE NAN IN OSSERVAZIONI NEXT
ERRORE NAN IN OSSERVAZIONI NEXT
ERRORE NAN IN OSSERVAZIONI NEXT
ERRORE NAN IN OSSERVAZIONI NEXT
[1500/100000] |success: 0.01329|collisions: 3.71971|reward: -175.59699|length: 989.68875| SPS: 9
ERRORE NAN IN OSSERVAZIONI NEXT
ERRORE NAN IN OSSERVAZIONI NEXT
[1800/100000] |success: 0.02732|collisions: 3.66799|reward: -173.74098|length: 981.67702| SPS: 9
ERRORE NAN IN OSSERVAZIONI NEXT
ERRORE NAN IN OSSERVAZIONI NEXT
ERRORE NAN IN OSSERVAZIONI NEXT
ERRORE NAN IN OSSERVAZIONI NEXT
[2100/100000] |success: 0.02421|collisions: 3.54758|reward: -173.

UnityCommunicatorStoppedException: Communicator has exited.

# Close Environment

In [None]:
# close environment
env.close()