In [1]:
import argparse
import sys
import time
import random
import traceback
from collections import deque
from pprint import pprint
import wandb
import numpy as np

In [2]:
import torch.optim as optim
from mlagents_envs.environment import UnityEnvironment, ActionTuple
from mlagents_envs.side_channel.environment_parameters_channel import EnvironmentParametersChannel

In [3]:
from gymnasium import spaces 
from stable_baselines3.common.buffers import ReplayBuffer

from training_utils import *

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


# Args

In [4]:
import argparse
import sys

def parse_args(default_config_path="./config/train.yaml"):
    """
    Parse arguments from CLI or notebook.
    - In notebook: usa il default se non passato
    - In CLI: permette override dei parametri nel config
    """
    # --- Gestione notebook: evita crash su ipykernel args ---
    argv = sys.argv[1:]
    # Se siamo in notebook o non è passato il config_path, inseriamo il default
    if len(argv) == 0 or "--f=" in " ".join(argv):
        argv = [default_config_path]

    # --- Pre-parser per leggere il config_path ---
    pre_parser = argparse.ArgumentParser(add_help=False)
    pre_parser.add_argument(
        "config_path",
        type=str,
        nargs="?",
        default=default_config_path,
        help="Main config file path"
    )
    initial_args, remaining_argv = pre_parser.parse_known_args(argv)
    CONFIG_PATH = initial_args.config_path
    print(f"Config path: {CONFIG_PATH}")

    # --- Legge parametri dal file di config ---
    file_config_dict = parse_config_file(CONFIG_PATH)

    # --- Parser principale ---
    parser = argparse.ArgumentParser(description="Training Script")
    parser.add_argument(
        "config_path",
        type=str,
        nargs="?",
        default=CONFIG_PATH,
        help="Main config file path"
    )

    # Aggiunge parametri dal config file, con tipi corretti
    for key, value in file_config_dict.items():
        if isinstance(value, bool):
            parser.add_argument(f"--{key}", type=str2bool, default=value)
        elif value is None:
            parser.add_argument(f"--{key}", type=str, default=value)
        else:
            parser.add_argument(f"--{key}", type=type(value), default=value)

    # --- Parse finale con remaining_argv per ignorare args extra Jupyter ---
    args, unknown = parser.parse_known_args(remaining_argv)
    if unknown:
        print("Ignored unknown args:", unknown)
    return args


In [5]:
args = parse_args()
agent_config = parse_config_file(args.agent_config_path)
obstacles_config = parse_config_file(args.obstacles_config_path)
other_config = parse_config_file(args.other_config_path)

args.seed = random.randint(0, 2**16)
# args.name = generate_funny_name()

print('Training with the following parameters:')
pprint(vars(args))

print('agent_config:')
pprint(agent_config)

print('obstacles_config:')
pprint(obstacles_config)

print('other_config:')
pprint(other_config)

Config path: ./config/train.yaml
Training with the following parameters:
{'actor_network_layers': [128, 128, 128],
 'agent_config_path': './config/agent.yaml',
 'alpha': 0.2,
 'alpha_lr': 0.0001,
 'autotune': True,
 'base_time': 1765457030,
 'batch_size': 256,
 'bootstrap': True,
 'bootstrap_batch_proportion': 0.8,
 'buffer_size': 120000,
 'build_path': './unity_build/3xold_wind/UASRL.exe',
 'config_path': './config/train.yaml',
 'cuda': 0,
 'env_id': '3xold',
 'exp_name': 'old_pers',
 'gamma': 0.995,
 'headless': False,
 'input_stack': 4,
 'learning_starts': 1000,
 'loss_log_interval': 500,
 'machine_name': 'personal',
 'metrics_log_interval': 10,
 'metrics_smoothing': 0.98,
 'n_envs': 3,
 'noise_clip': 0.0,
 'obstacles_config_path': './config/obstacles_simple.yaml',
 'other_config_path': './config/other.yaml',
 'policy_frequency': 4,
 'policy_lr': 0.0001,
 'q_ensemble_n': 5,
 'q_lr': 0.0001,
 'q_network_layers': [128, 128, 128],
 'reward_scale': 1.0,
 'seed': 58984,
 'target_entropy'

In [6]:
if torch.cuda.is_available() and args.cuda >= 0:
    # F-string per inserire l'indice: diventa "cuda:2"
    device_str = f"cuda:{args.cuda}"
else:
    device_str = "cpu"

DEVICE = torch.device(device_str)
print(f"Using device: {DEVICE}")

Using device: cuda:0


# Seeding

In [7]:
# seeding
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
print(f'Seed: {args.seed}')

Seed: 58984


# Start Environment

In [8]:
# Create the channel
env_info = CustomChannel()
param_channel = EnvironmentParametersChannel()

print('Applying Unity settings from config...')
apply_unity_settings(param_channel, agent_config, 'ag_')
apply_unity_settings(param_channel, obstacles_config, 'obs_')

if args.test_lib:
    print('Testing Ended')
    exit(0)

# env setup
print(f'Starting Unity Environment from build: {args.build_path}')
# args.build_path
env = UnityEnvironment(None, 
                       seed=args.seed, 
                       side_channels=[env_info, param_channel], 
                       no_graphics=args.headless,
                       worker_id=args.worker_id)
print('Unity Environment connected.')

Applying Unity settings from config...
Starting Unity Environment from build: ./unity_build/3xold_wind/UASRL.exe
Unity Environment connected.


In [9]:
print('Resetting environment...')
env.reset()

Resetting environment...


# Environment Variables and Log

In [None]:
run_name = f"{args.exp_name}_{int(time.time()) - args.base_time}"
args.run_name = run_name
print(f"Run name: {run_name}")

# wandb to track experiments
# Start a new wandb run to track this script.
if args.wandb:
    print('Setting up wandb experiment tracking.')
    wandb_run = wandb.init(
        # Set the wandb entity where your project will be logged (generally your team name).
        entity="giacomo-aru",
        # Set the wandb project where this run will be logged.
        project="UASRL",
        # force the 
        name=args.run_name,
        # Track hyperparameters and run metadata.
        config={
            "training": vars(args),
            "agent": agent_config,
            "obstacles": obstacles_config,
            "other": other_config
        }

    )

Run name: old_pers_3987336
Setting up wandb experiment tracking.


[34m[1mwandb[0m: Currently logged in as: [33mgiacomoaru[0m ([33mgiacomo-aru[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
BEHAVIOUR_NAME = other_config['behavior_name'] + '?team=' + other_config['team']

RAY_PER_DIRECTION = other_config['rays_per_direction']
RAYCAST_MIN = other_config['rays_min_observation']
RAYCAST_MAX = other_config['rays_max_observation']
RAYCAST_SIZE = 2*RAY_PER_DIRECTION + 1

STATE_SIZE = other_config['state_observation_size'] - 1
STATE_MIN = other_config['state_min_observation']
STATE_MAX = other_config['state_max_observation']

ACTION_SIZE = other_config['action_size']
ACTION_MIN = other_config['min_action']
ACTION_MAX = other_config['max_action']

TOTAL_STATE_SIZE = (STATE_SIZE + RAYCAST_SIZE)*args.input_stack

In [12]:
# creating the training networks
print('Creating actor and critic networks...')
actor = OldDenseActor(TOTAL_STATE_SIZE, ACTION_SIZE, ACTION_MIN, ACTION_MAX, args.actor_network_layers).to(DEVICE)
actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr)

qf_ensemble = [OldDenseSoftQNetwork(TOTAL_STATE_SIZE, ACTION_SIZE, args.q_network_layers).to(DEVICE) for _ in range(args.q_ensemble_n)]
qf_ensemble_target = [OldDenseSoftQNetwork(TOTAL_STATE_SIZE, ACTION_SIZE, args.q_network_layers).to(DEVICE) for _ in range(args.q_ensemble_n)]
for q_t, q in zip(qf_ensemble_target, qf_ensemble):
    q_t.load_state_dict(q.state_dict())

par = []
for q in qf_ensemble:
    par += list(q.parameters())
qf_optimizer = torch.optim.Adam(
    par,
    lr=args.q_lr
)

Creating actor and critic networks...


# Replay Buffer

In [13]:
print('Setting up replay buffer...')
observation_space = spaces.Box(
    low=min(RAYCAST_MIN, STATE_MIN), 
    high=max(RAYCAST_MAX, STATE_MAX), 
    shape=(TOTAL_STATE_SIZE,), 
    dtype=np.float32
)
action_space = spaces.Box(
    low=ACTION_MIN, 
    high=ACTION_MAX, 
    shape=(ACTION_SIZE,), 
    dtype=np.float32
)

rb = ReplayBuffer(
    buffer_size=args.buffer_size,
    observation_space=observation_space,
    action_space=action_space,
    device=DEVICE,                
    handle_timeout_termination=True,
    n_envs=1 # necessario data la natura asincrona del'env   
)

Setting up replay buffer...


# start algorithm

In [14]:
# Automatic entropy tuning
if args.autotune:
    target_entropy = args.target_entropy
    log_alpha = torch.zeros(1, requires_grad=True, device=DEVICE)
    alpha = log_alpha.exp().item()
    a_optimizer = optim.Adam([log_alpha], lr=args.alpha_lr)
    print(f'autotune target_entropy: {target_entropy}')
else:
    alpha = args.alpha

autotune target_entropy: -1.0


In [15]:
# start training
save_path = './models/' + run_name
os.makedirs(save_path, exist_ok=True)
print('saving to path:', save_path)

training_stats = {
    "time/python_time": RunningMean(),
    "time/unity_time": RunningMean(),
    
    "stats/action_saturation": RunningMean(),
    'stats/qf_mean': RunningMean(),
    'stats/qf_std':RunningMean(),
    'stats/actor_entropy': RunningMean(),
    'stats/alpha': RunningMean(),
    'stats/uncertainty': RunningMean(),
    
    'loss/critic_ens': RunningMean(),
    'loss/actor': RunningMean(),
    'loss/alpha': RunningMean(),
}

best_reward = -float('inf')

episodic_stats = {}
success_stats = {}
failure_stats = {}

saving to path: ./models/old_pers_3987336


In [16]:
start_time = time.time()
unity_end_time = -1
unity_start_time = -1

global_step = 0
print(f'[{global_step}/{args.total_timesteps}] Starting Training - run name: {run_name}')

[0/100000] Starting Training - run name: old_pers_3987336


In [None]:
try:
    obs = collect_data_after_step(env, BEHAVIOUR_NAME, STATE_SIZE)
    
    
    while global_step < args.total_timesteps:

        # actions for each agent in the environment
        # dim = (naagents, action_space)
        for id in obs:
            agent_obs = obs[id]
            
            # terminated agents are not considered
            if agent_obs[3]:
                continue
            
            # algo logic
            if global_step < args.learning_starts * 2:
                # change this to use the handcrafted starting policy or a previously trained policy
                
                action = get_initial_action(id)
                # action, _, _ = old_actor.get_action(torch.Tensor([obs[id][0]]), 
                #                                 torch.Tensor([obs[id][1]]),
                #                                 0.5)
                # action = action[0].detach().numpy()
            else:
                # training policy
                action, _, _ = actor.get_action(torch.Tensor([obs[id][0]]).to(DEVICE))
                action = action[0].detach().cpu().numpy()
            
            # memorize the action taken for the next step
            agent_obs[2] = action
            
            # the first dimention of the action is the "number of agent"
            # Always 1 if "set_action_for_agent" is used
            a = ActionTuple(continuous=np.array([action]))
            env.set_action_for_agent(BEHAVIOUR_NAME, id, a)
        
        # --- ENVIRONMENT STEP ---
        unity_start_time = time.time()
        if unity_end_time > 0 and global_step > args.learning_starts:
            training_stats['time/python_time'].update(unity_start_time - unity_end_time)
        
        env.step()
        unity_end_time = time.time()
        if global_step > args.learning_starts:
            training_stats['time/unity_time'].update(unity_end_time - unity_start_time)

        next_obs = collect_data_after_step(env, BEHAVIOUR_NAME, STATE_SIZE)
        
        while env_info.stop_msg_queue:
                msg = env_info.stop_msg_queue.pop()
                
                if global_step >= args.learning_starts:
                    update_stats_from_message(episodic_stats, success_stats, failure_stats, msg, args.metrics_smoothing)        
                    if episodic_stats['ep_count'] % args.metrics_log_interval == 0:
                        print_update(global_step, args.total_timesteps, start_time, episodic_stats)
                        if args.wandb:
                            log_stats_to_wandb(wandb_run, 
                                            [episodic_stats, success_stats, failure_stats],
                                            ['all_ep', 'success_ep', 'failure_ep'],
                                            global_step)
                            print(f"[{global_step}/{args.total_timesteps}] Logged episodic stats to wandb")
                        
        # save data to reply buffer; handle `terminal_observation`
        for id in obs:
            prev_agent_obs = obs[id]
            # consider every agent that in the previous step was not terminated
            # in this way are excluded the agents that are already considered before and don't have a 
            # couple prev_obs - next_obs and a reward
            if prev_agent_obs[3] or id not in next_obs:
                continue
                
            next_agent_obs = next_obs[id]
            
            # add the data to the replay buffer
            rb.add(obs = prev_agent_obs[0], 
                next_obs = next_agent_obs[0],
                action = np.array(prev_agent_obs[2]), 
                reward = next_agent_obs[1], 
                done = next_agent_obs[3],
                infos = [{}])
            
        # crucial step, easy to overlook, update the previous observation
        obs = next_obs
        
        # Save best models based on reward
        if episodic_stats != {} and episodic_stats["reward"] > best_reward:
            best_reward = episodic_stats["reward"]
            save_models(actor, qf_ensemble, qf_ensemble_target, save_path, suffix=f'_best')
            print(f"[{global_step}/{args.total_timesteps}] Models saved, suffix: _best")
                
        # Training loop
        for _ in range(args.update_frequency):

            # Start learning after a warm-up phase
            if global_step > args.learning_starts:

                # Sample a batch from replay buffer
                data = rb.sample(args.batch_size)

                # --- CALCOLO SATURAZIONE ---
                saturation = data.actions.detach().cpu().numpy()
                saturation = (np.abs(saturation) > 0.99).mean()
                training_stats["stats/action_saturation"].update(saturation)
                    
                with torch.no_grad():
                    # Compute target action with exploration noise
                    next_action, next_log_pi, _ = actor.get_action(
                        data.next_observations
                    )

                    if args.noise_clip > 0:
                        noise = torch.randn_like(next_action) * args.noise_clip
                        next_action = torch.clamp(next_action + noise, -1, 1)

                    # Compute target Q-value (min over ensemble)
                    target_q_values = []
                    for q_target in qf_ensemble_target:
                        q_val = q_target(
                            data.next_observations, 
                            next_action
                        )
                        target_q_values.append(q_val)
                    stacked_target_q = torch.stack(target_q_values)
                    min_qf_next_target = stacked_target_q.min(dim=0).values - alpha * next_log_pi
                    next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * args.gamma * min_qf_next_target.view(-1)

                # Q-function updates (with bootstrapping)
                q_losses = []
                q_vals = []
                batch_size = int(data.actions.shape[0] * args.bootstrap_batch_proportion)
                for q in qf_ensemble:
                    # Bootstrap indices
                    indices = torch.randint(0, batch_size, (batch_size,), device=data.actions.device)
                    
                    observation = data.observations[indices]
                    actions = data.actions[indices]
                    target = next_q_value[indices]

                    # Compute Q loss
                    q_val = q(observation, actions).view(-1)
                    loss = F.mse_loss(q_val, target)
                    q_losses.append(loss)
                    q_vals.append(q_val)
                    
                total_q_loss = torch.stack(q_losses).mean()
                qf_optimizer.zero_grad()
                total_q_loss.backward()
                qf_optimizer.step()
                
                # Track Q-value statistics
                all_q_values = torch.cat(q_vals)
                training_stats['stats/qf_mean'].update(all_q_values.mean().item())
                training_stats['stats/qf_std'].update(all_q_values.std().item())
                training_stats['loss/critic_ens'].update(total_q_loss.item())
                
                # Delayed policy (actor) update
                if global_step % args.policy_frequency == 0:
                    for _ in range(args.policy_frequency):
                        pi, log_pi, _ = actor.get_action(data.observations)
                        actor_entropy = - (log_pi.exp() * log_pi).sum(dim=-1).mean()

                        q_pi_vals = [q(data.observations, pi) for q in qf_ensemble]
                        min_qf_pi = torch.min(torch.stack(q_pi_vals), dim=0).values.view(-1)

                        actor_loss = ((alpha * log_pi) - min_qf_pi).mean()

                        actor_optimizer.zero_grad()
                        actor_loss.backward()
                        actor_optimizer.step()

                        # 1. Calcolo Incertezza (Disaccordo tra i critici)
                        q_pi_stack = torch.stack(q_pi_vals) 
                        with torch.no_grad():
                            uncertainty = q_pi_stack.std(dim=0).mean().item()
                        training_stats['stats/uncertainty'].update(uncertainty)

                        # 2. Log Entropia e Loss Attore
                        training_stats['stats/actor_entropy'].update(-log_pi.mean().item())
                        training_stats['loss/actor'].update(actor_loss.item())
                        
                        # Automatic entropy tuning (if enabled)
                        if args.autotune:
                            with torch.no_grad():
                                _, log_pi, _ = actor.get_action(data.observations)
                            alpha_loss = (-log_alpha * (log_pi + target_entropy)).mean()

                            a_optimizer.zero_grad()
                            alpha_loss.backward()
                            a_optimizer.step()
                            alpha = log_alpha.exp().item()
                            
                            training_stats['loss/alpha'].update(alpha_loss.item())

                        training_stats['stats/alpha'].update(alpha)
                        
                # Soft update target Q-networks
                if global_step % args.target_network_update_period == 0:
                    for q, q_t in zip(qf_ensemble, qf_ensemble_target):
                        for param, target_param in zip(q.parameters(), q_t.parameters()):
                            target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)

                # --- 5. LOGGING LOSS (METODO SNAPSHOT/ISTANTANEO) ---
                if global_step % args.loss_log_interval == 0:

                    # COSTRUZIONE DIZIONARIO SNAPSHOT
                    training_stats_divided = {}
                    for key in training_stats:
                        splitted = key.split('/')
                        if splitted[0] not in training_stats_divided:
                            training_stats_divided[splitted[0]] = {}
                        training_stats_divided[splitted[0]][splitted[1]] = training_stats[key].mean
                        
                        # reset
                        training_stats[key].reset()
                        
                    current_time = time.time()
                    training_stats_divided['time']['SPS'] = global_step / (current_time - start_time + 1e-6)
                    
                    # log stats su wandb
                    if args.wandb:
                        log_stats_to_wandb(wandb_run, list(training_stats_divided.values()), list(training_stats_divided.keys()), global_step)
                        print(f"[{global_step}/{args.total_timesteps}] Logged training stats to wandb")  
                            
            elif global_step == args.learning_starts:
                print("Start Learning")

            # Step counter
            global_step += 1
            
except Exception as e:  
    print(f"[{global_step}/{args.total_timesteps}] An error occurred: {e}")
    traceback.print_exc()

Start Learning
[1199/100000] Models saved, suffix: _best
[1399/100000] Models saved, suffix: _best
[1500/100000] Logged training stats to wandb
[1599/100000] Models saved, suffix: _best
[1799/100000] |success: 0.00000|reward: -19.39581|collisions: 3.27195|length: 999.00000|SPL: 0.00000| SPS: 9
[1799/100000] Logged episodic stats to wandb
[1799/100000] Models saved, suffix: _best
[1999/100000] Models saved, suffix: _best
[2000/100000] Logged training stats to wandb


  action, _, _ = actor.get_action(torch.Tensor([obs[id][0]]).to(DEVICE))


[2399/100000] |success: 0.00000|reward: -19.27406|collisions: 3.75876|length: 999.00000|SPL: 0.00000| SPS: 9
[2399/100000] Logged episodic stats to wandb
[2399/100000] Models saved, suffix: _best
[2500/100000] Logged training stats to wandb
[2999/100000] |success: 0.00000|reward: -19.21135|collisions: 3.85709|length: 999.00000|SPL: 0.00000| SPS: 9
[2999/100000] Logged episodic stats to wandb
[2999/100000] Models saved, suffix: _best
[3000/100000] Logged training stats to wandb
[3399/100000] Models saved, suffix: _best
[3500/100000] Logged training stats to wandb
[3599/100000] Models saved, suffix: _best
[3799/100000] |success: 0.00000|reward: -18.68291|collisions: 3.90895|length: 999.00000|SPL: 0.00000| SPS: 9
[3799/100000] Logged episodic stats to wandb
[3999/100000] Models saved, suffix: _best
[4000/100000] Logged training stats to wandb
[4399/100000] |success: 0.00000|reward: -18.49331|collisions: 4.02248|length: 999.00000|SPL: 0.00000| SPS: 9
[4399/100000] Logged episodic stats to 

Traceback (most recent call last):
  File "C:\Users\cicci\AppData\Local\Temp\ipykernel_13896\438019796.py", line 43, in <module>
    env.step()
  File "c:\Users\cicci\Desktop\UASRL\.venv\lib\site-packages\mlagents_envs\timers.py", line 305, in wrapped
    return func(*args, **kwargs)
  File "c:\Users\cicci\Desktop\UASRL\.venv\lib\site-packages\mlagents_envs\environment.py", line 348, in step
    outputs = self._communicator.exchange(step_input, self._poll_process)
  File "c:\Users\cicci\Desktop\UASRL\.venv\lib\site-packages\mlagents_envs\rpc_communicator.py", line 142, in exchange
    self.poll_for_timeout(poll_callback)
  File "c:\Users\cicci\Desktop\UASRL\.venv\lib\site-packages\mlagents_envs\rpc_communicator.py", line 114, in poll_for_timeout
    raise UnityTimeOutException(
mlagents_envs.exception.UnityTimeOutException: The Unity environment took too long to respond. Make sure that :
	 The environment does not need user interaction to launch
	 The Agents' Behavior Parameters > Beha

# Close Environment

In [18]:
print("Closing environment")
env.close()

print("Closing wandb run")
wandb.finish()

Closing environment
Closing wandb run


0,1
all_ep/SPL,▁▁▁▁▁▅█▇▆▅▄▄▃▃▂▂▂▂▂▂▁▁▁
all_ep/average_speed,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
all_ep/collisions,▁▃▃▃▄▅▅▄▅▅▅▄▆▅█▇▆▆▆▆▇▇▆
all_ep/distance_traveled,▁▁▂▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇██▇██
all_ep/ep_count,▁▁▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇██
all_ep/global_avg_dispersion,▃▅▇▇▅▂▁▁▃▂▅▄▆█▇▆▅▆▃▃▂▃▁
all_ep/global_avg_dist_obstacle,███▆▅▄▄▄▄▃▃▃▄▅▄▄▃▃▂▂▂▂▁
all_ep/global_avg_visibility,▁▃▆▅▄▂▂▃▄▂▅▄▆█▇▇▅▅▄▃▃▄▃
all_ep/global_characteristic_dimension,▁▃▅▅▅▄▅▅▆▅▆▅▇██▇▆▆▅▅▅▅▅
all_ep/length,█████▅▁▂▃▄▅▅▆▆▇▇▇▇▇▇███

0,1
all_ep/SPL,0.00137
all_ep/average_speed,0
all_ep/collisions,4.58799
all_ep/distance_traveled,2.73625
all_ep/ep_count,230
all_ep/global_avg_dispersion,7.3958
all_ep/global_avg_dist_obstacle,3.26906
all_ep/global_avg_visibility,10.15503
all_ep/global_characteristic_dimension,10.28331
all_ep/length,997.7999


In [19]:
# save trained networks, actor and critics
save_models(actor, qf_ensemble, qf_ensemble_target, save_path, suffix='_final')
print(f"[{global_step}/{args.total_timesteps}] Models saved, suffix: _final")

[16982/100000] Models saved, suffix: _final
