# Regular RL Training Pipeline for a PPO Agent to Play Levels from the Doom Game

In [1]:
from datetime import datetime

# Moment the run began; its formatted form tags the model and log directories,
# and its epoch value is the reference point for throughput measurements.
start_datetime = datetime.now()
start_datetime_timestamp_str = f"{start_datetime:%Y_%m_%d_%H_%M_%S}"
start_time = start_datetime.timestamp()

# Rollout geometry: every update gathers `num_steps` transitions from each
# of the `num_envs` environments.
num_envs = 8
num_steps = 256
batch_size = num_envs * num_steps

# Optimisation schedule: each rollout is split into `num_mini_batches` chunks
# and replayed for `num_training_epochs` passes.
num_mini_batches = 32
mini_batch_size = batch_size // num_mini_batches
num_training_epochs = 10

# Overall budget, expressed in environment steps and in whole updates.
total_timesteps = 900000
num_updates = total_timesteps // batch_size

# Running count of environment steps taken so far.
global_step = 0


In [2]:
from torch.utils.tensorboard import SummaryWriter
from agents.doom_ppo_agent import DoomPpoAgent
from utils.replay_buffer import ReplayBuffer
from utils.env import make_vizdoom_env
import gymnasium as gym

# One entry per parallel environment, all built from the same "basic" scenario config.
# NOTE(review): make_vizdoom_env is presumably a factory returning a callable, as
# SyncVectorEnv expects - confirm against utils.env.
env_factories = [
    make_vizdoom_env('envs/vizdoom/scenarios/basic.cfg')
    for _ in range(num_envs)
]
envs = gym.vector.SyncVectorEnv(env_factories)

# PPO agent sized from the per-environment observation and action spaces
agent = DoomPpoAgent(
    envs.single_observation_space,
    envs.single_action_space,
    learning_rate=1e-4,
    use_gpu=True,
)

# Rollout storage: `num_steps` slots for each of the `num_envs` environments.
# The raw observation space is read from the first sub-environment.
raw_observation_space = envs.envs[0].raw_observation_space
replay_buffer = ReplayBuffer(
    num_steps,
    num_envs,
    raw_observation_space,
    envs.single_observation_space,
    envs.single_action_space,
)

# TensorBoard writer; the run's start timestamp keeps each run in its own directory
tensorboard_log_dir = f"logs/ppo_agent/doom_basic_level/training_{start_datetime_timestamp_str}"
tensorboard_writer = SummaryWriter(tensorboard_log_dir)

  logger.warn(
  logger.warn(


In [3]:
import numpy as np
import time

# Rollout + update loop.
#
# Every update collects `num_steps` transitions from each environment into
# `replay_buffer`, optionally checkpoints the agent, and then calls `agent.train`
# once on the collected rollout. Progress is printed and logged to TensorBoard.
observations, infos = envs.reset()

# Done flag belonging to the current `observations` (i.e. produced by the previous
# step). All zeros at the start because nothing has been stepped yet.
terminations = [0 for _ in range(num_envs)]

best_average_return = float('-inf')
# Episodic returns recorded since the last checkpoint was saved
returns = []

for update in range(1, num_updates + 1):
    # Linear learning rate annealing coefficient: 1.0 on the first update,
    # 1 / num_updates on the last one.
    learning_rate_anneal_coef = 1.0 - (update - 1.0) / num_updates

    for step in range(num_steps):
        global_step += num_envs

        # Getting the next action and its value
        actions, log_probs, probs, values = agent.forward(observations)
        values = values.flatten()

        observations_, rewards, terminations_, truncations, infos = envs.step(actions)

        # Saving transitions in replay buffer.
        # NOTE(review): `infos` here is the post-step info, so the raw observations
        # stored below presumably correspond to `observations_` rather than to
        # `observations` - confirm this is the intended pairing.
        replay_buffer[step] = (
            np.stack(infos["raw_observations"]),
            observations,
            actions,
            log_probs,
            rewards,
            values,
            terminations
        )

        # Saving new observation and done status for next step.
        # An episode that was truncated has ended just like a terminated one, so both
        # signals are folded into the stored flag; otherwise an episode boundary
        # caused by truncation would go unmarked in the rollout.
        observations = observations_
        terminations = np.logical_or(terminations_, truncations)

        if 'final_info' in infos:
            for env_info in infos['final_info']:
                if env_info is not None and "episode" in env_info.keys():
                    print(f"global_step={global_step}, episodic_return={env_info['episode']['r']}")

                    # Recording returns
                    returns.append(env_info['episode']['r'])

                    # Writing step debug info to TensorBoard
                    tensorboard_writer.add_scalar("charts/episodic_return", env_info["episode"]["r"], global_step)
                    tensorboard_writer.add_scalar("charts/episodic_length", env_info["episode"]["l"], global_step)

                    # Only the first finished episode of a step is recorded
                    break

    # Checking if the current mean is higher than previous highest mean and saving the model.
    # Skipped when no episode has been recorded since the last checkpoint: the mean of an
    # empty list is NaN (with a RuntimeWarning) and can never beat the best value anyway.
    if returns:
        current_mean_episodic_return = np.mean(returns)
        print(f"Current Mean Episodic Return = {current_mean_episodic_return}")
        if current_mean_episodic_return > best_average_return:
            # Saving the model
            agent.save_models(f"./models/doom_ppo_agent/rl_pipeline/training_run_{start_datetime_timestamp_str}/checkpoint_step_{global_step}")

            # Saving new best average return and clearing returns arrays
            best_average_return = current_mean_episodic_return
            returns.clear()

    # Training the agent
    training_stats = agent.train(
        replay_buffer=replay_buffer,
        learning_rate_anneal_coef=learning_rate_anneal_coef,
        mini_batch_size=mini_batch_size,
        num_training_epochs=num_training_epochs
    )

    # Steps per second since `start_time`; computed once so the printed and the
    # logged value agree.
    steps_per_second = int(global_step / (time.time() - start_time))
    print("SPS:", steps_per_second)

    tensorboard_writer.add_scalar("charts/learning_rate", training_stats.learning_rate, global_step)
    tensorboard_writer.add_scalar("losses/value_loss", training_stats.value_loss, global_step)
    tensorboard_writer.add_scalar("losses/policy_loss", training_stats.policy_loss, global_step)
    tensorboard_writer.add_scalar("losses/entropy_loss", training_stats.entropy_loss, global_step)
    tensorboard_writer.add_scalar("charts/old_approx_kl", training_stats.old_approx_kl, global_step)
    tensorboard_writer.add_scalar("charts/approx_kl", training_stats.approx_kl, global_step)
    tensorboard_writer.add_scalar("charts/clip_fraction", training_stats.clip_fraction, global_step)
    tensorboard_writer.add_scalar("charts/explained_variance", training_stats.explained_variance, global_step)
    tensorboard_writer.add_scalar("charts/SPS", steps_per_second, global_step)


global_step=48, episodic_return=[95.]
global_step=64, episodic_return=[93.]
global_step=72, episodic_return=[92.]
global_step=112, episodic_return=[93.]
global_step=128, episodic_return=[94.]
global_step=272, episodic_return=[70.]
global_step=280, episodic_return=[67.]
global_step=376, episodic_return=[89.]
global_step=432, episodic_return=[94.]
Current Mean Episodic Return = 87.44444274902344
Saving models...
Directory './models/doom_ppo_agent/rl_pipeline/training_run_2023_07_31_15_45_39/checkpoint_step_2048' created!
Successfully saved models!
SPS: 27
global_step=2400, episodic_return=[-375.]
global_step=2448, episodic_return=[95.]
global_step=2512, episodic_return=[-375.]
global_step=2528, episodic_return=[-380.]
global_step=2672, episodic_return=[-380.]
global_step=2720, episodic_return=[95.]
global_step=2728, episodic_return=[69.]
global_step=2760, episodic_return=[67.]
global_step=2784, episodic_return=[94.]
global_step=2808, episodic_return=[90.]
global_step=2832, episodic_retur

KeyboardInterrupt: 

In [None]:
# Releasing resources once training has finished or been interrupted.
# try/finally guarantees the TensorBoard writer is closed even if closing the
# environments raises.
try:
    # Closing environments
    envs.close()
finally:
    # Closing tensorboard writer
    tensorboard_writer.close()