# Regular RL Trainining Pipeline for PPO Agent to Play Levels from the Doom Game

In [1]:
from utils.env import make_vizdoom_env
import gymnasium as gym

# Initializing environments
num_envs = 1
envs = gym.vector.SyncVectorEnv([ make_vizdoom_env('envs/vizdoom/scenarios/basic.cfg', render_mode='rgb_array', record_episodes=True) for i in range(num_envs)])


  logger.warn(
  logger.warn(


In [2]:
envs.single_action_space

Discrete(4)

In [2]:
from torch.utils.tensorboard import SummaryWriter
from agents.doom_ppo_agent import DoomPpoAgent
from utils.memory import Memory
from utils.time import current_timestamp_ms
from datetime import datetime
import torch

# Prefering GPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.has_mps:
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Setting up agent
agent = DoomPpoAgent(envs.single_observation_space, 
                     envs.single_action_space,
                     learning_rate=0.0001)
agent.to(device)

# Setting up agent training config
global_step = 0
start_datetime = datetime.now()
start_time = start_datetime.timestamp()
num_steps = 256
num_mini_batches = 32
num_training_epochs=10
batch_size = int(num_envs * num_steps)
mini_batch_size = batch_size // num_mini_batches
total_timesteps = 300000
num_updates = total_timesteps // batch_size
memory = Memory(device, num_steps, num_envs, envs.single_observation_space.shape, envs.single_action_space.shape)

# Setting up debugging for Tensorboard
tensorboard_writer = SummaryWriter(f"logs/ppo_agent/doom_basic_level/training_{current_timestamp_ms()}")

In [3]:
import torch
import numpy as np
import time

observations, info = envs.reset()
observations = torch.Tensor(observations).to(device)
done = torch.zeros(num_envs).to(device)
best_average_return = float('-inf')
returns = []

for update in range(1, num_updates + 1):
    # Calculating learning rate annealing coefficient
    learning_rate_anneal_coef = 1.0 - (update - 1.0) / num_updates

    for step in range(0, num_steps):
        global_step += num_envs

        # Getting next action and it's value
        with torch.no_grad():
            action, log_prob, _, value = agent.forward(observations)
            value = value.flatten()

        observations_, rewards, dones_, truncation_statuses, info = envs.step(action.cpu().numpy())
        # print(info)

        # Saving experience in memory
        memory.remember(
            step=step, 
            observation= observations,
            action=action,
            value=value,
            log_prob=log_prob,
            reward=torch.tensor(np.array(rewards, dtype=np.float32)).to(device).view(-1),
            done=done
        )

        # Saving new observation and done status for next step
        observations = torch.Tensor(observations_).to(device) 
        done =  torch.Tensor(dones_).to(device)
        
        if 'final_info' in info:
            for env_info in info['final_info']:
                if env_info is not None and "episode" in env_info.keys():
                    print(f"global_step={global_step}, episodic_return={env_info['episode']['r']}")

                    # Recording returns
                    returns.append(env_info['episode']['r'])

                    # Writing step debug info to TensorBoard
                    tensorboard_writer.add_scalar("charts/episodic_return", env_info["episode"]["r"], global_step)
                    tensorboard_writer.add_scalar("charts/episodic_length", env_info["episode"]["l"], global_step)
                    break

    # Checking if the current mean is higher than previous highest mean and saving the model
    current_mean_episodic_return = np.mean(returns)
    if current_mean_episodic_return > best_average_return:
        # Saving the model
        agent.save_models(f"./models/doom_ppo_agent/training_run_{start_datetime.strftime('%Y_%m_%d_%H_%M_%S')}/checkpoint_step_{global_step}")
        
        # Saving new best average return and clearing returns arrays
        best_average_return = current_mean_episodic_return
        returns.clear()

    # Training the agent
    training_stats = agent.train(
        next_observation=observations,
        next_done=done,
        observations=memory.observations,
        actions=memory.actions,
        log_probs=memory.log_probs,
        rewards=memory.rewards,
        values=memory.values,
        dones=memory.dones,
        num_steps=num_steps,
        batch_size=batch_size,
        learning_rate_anneal_coef=learning_rate_anneal_coef,
        mini_batch_size=mini_batch_size,
        num_training_epochs=num_training_epochs
    )

    print("SPS:", int(global_step / (time.time() - start_time)))

    tensorboard_writer.add_scalar("charts/learning_rate", training_stats.learning_rate, global_step)
    tensorboard_writer.add_scalar("losses/value_loss", training_stats.value_loss, global_step)
    tensorboard_writer.add_scalar("losses/policy_loss", training_stats.policy_loss, global_step)
    tensorboard_writer.add_scalar("losses/entropy_loss", training_stats.entropy_loss, global_step)
    tensorboard_writer.add_scalar("charts/old_approx_kl", training_stats.old_approx_kl, global_step)
    tensorboard_writer.add_scalar("charts/approx_kl", training_stats.approx_kl, global_step)
    tensorboard_writer.add_scalar("charts/clip_fraction", training_stats.clip_fraction, global_step)
    tensorboard_writer.add_scalar("charts/explained_variance", training_stats.explained_variance, global_step)
    tensorboard_writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


SPS: 24
Moviepy - Building video e:\Study\University of London\Semester 6\Final Project\doom-rlhf\videos\1689682808327\rl-video-episode-0.mp4.
Moviepy - Writing video e:\Study\University of London\Semester 6\Final Project\doom-rlhf\videos\1689682808327\rl-video-episode-0.mp4



                                                                

Moviepy - Done !
Moviepy - video ready e:\Study\University of London\Semester 6\Final Project\doom-rlhf\videos\1689682808327\rl-video-episode-0.mp4
global_step=300, episodic_return=[-400.]
Saving models...
Directory './models/training_run_2023_07_18_17_50_12/checkpoint_step_512' created!
Successfully saved models!
SPS: 33
Moviepy - Building video e:\Study\University of London\Semester 6\Final Project\doom-rlhf\videos\1689682808327\rl-video-episode-1.mp4.
Moviepy - Writing video e:\Study\University of London\Semester 6\Final Project\doom-rlhf\videos\1689682808327\rl-video-episode-1.mp4



                                                                

Moviepy - Done !
Moviepy - video ready e:\Study\University of London\Semester 6\Final Project\doom-rlhf\videos\1689682808327\rl-video-episode-1.mp4
global_step=600, episodic_return=[-390.]
global_step=606, episodic_return=[95.]
Saving models...
Directory './models/training_run_2023_07_18_17_50_12/checkpoint_step_768' created!
Successfully saved models!
SPS: 38
global_step=906, episodic_return=[-400.]
SPS: 42
global_step=1206, episodic_return=[-395.]
SPS: 45
global_step=1506, episodic_return=[-395.]


KeyboardInterrupt: 

In [None]:
# Saving models
agent.save_models()

In [None]:
# Closing environments
envs.close()
# Closing tensorboard writer
tensorboard_writer.close()