# RLHF Training Pipeline for a PPO Agent to Play Levels from the Doom Game

In [1]:
from utils.env import make_vizdoom_env
import gymnasium as gym
from datetime import datetime

# Wall-clock moment the notebook run started, plus a filesystem-friendly
# string form of it (year_month_day_hour_minute_second).
start_datetime = datetime.now()
start_datetime_timestamp_str = start_datetime.strftime("%Y_%m_%d_%H_%M_%S")

# Number of environment copies wrapped by the synchronous vector env.
num_envs = 8

# One `make_vizdoom_env(...)` result per environment copy, all using the
# same scenario config. Optional recording arguments that are currently
# disabled:
#   render_mode='rgb_array',
#   record_episodes=True,
#   recording_save_path=f"./temp",
#   recording_file_prefix=f"rl-pipeline-env-{env_index}"
env_factories = [
    make_vizdoom_env("envs/vizdoom/scenarios/basic.cfg")
    for env_index in range(num_envs)
]

# Initializing environments
envs = gym.vector.SyncVectorEnv(env_factories)

  logger.warn(


In [2]:
from torch.utils.tensorboard import SummaryWriter
from agents.doom_ppo_agent import DoomPpoAgent
from reward_predictors.cnn_reward_predictor import CnnRewardPredictor
from utils.replay_buffer import ReplayBuffer
from utils.time import current_timestamp_ms

# --- Agent -----------------------------------------------------------------
# PPO agent built from the per-environment observation and action spaces.
agent = DoomPpoAgent(
    envs.single_observation_space,
    envs.single_action_space,
    learning_rate=0.0001,
    use_gpu=True,
)

# --- Reward predictor ------------------------------------------------------
# The second argument is the number of actions for a Discrete action space
# and the action space's shape for any other space type.
single_action_space = envs.single_action_space
if isinstance(single_action_space, gym.spaces.Discrete):
    action_dimensions = single_action_space.n
else:
    action_dimensions = single_action_space.shape

reward_predictor = CnnRewardPredictor(
    envs.single_observation_space.shape,
    action_dimensions,
    use_gpu=True,
)

# --- Agent training config -------------------------------------------------
global_step = 0                           # total environment steps taken so far
start_time = start_datetime.timestamp()   # run start, in seconds since the epoch
num_steps = 256                           # rollout length per environment
num_mini_batches = 32
num_training_epochs = 10
batch_size = num_envs * num_steps         # transitions collected per rollout
mini_batch_size = batch_size // num_mini_batches

num_batches_before_collecting_feedback = 1

# --- Replay buffer ---------------------------------------------------------
# Holds one rollout: `num_steps` entries for each of the `num_envs` environments.
replay_buffer = ReplayBuffer(
    num_steps,
    num_envs,
    envs.single_observation_space,
    envs.single_action_space,
)

# --- TensorBoard -----------------------------------------------------------
# Every run writes to its own directory, suffixed with the current timestamp.
tensorboard_writer = SummaryWriter(
    f"logs/ppo_agent/doom_basic_level/rlhf_training_{current_timestamp_ms()}"
)

In [3]:
import numpy as np
import time

# Rollout state carried across iterations of the (endless) training loop.
observations, info = envs.reset()
dones = [0 for _ in range(num_envs)]
best_average_return = float("-inf")
returns = []  # episodic returns gathered since the last saved checkpoint
reward_sums = np.zeros(num_envs, dtype=np.float32)  # running return per env

while True:
    # NOTE: learning-rate annealing is currently disabled:
    # learning_rate_anneal_coef = 1.0 - (update - 1.0) / num_updates

    # ---- Collect one rollout of `num_steps` steps from every environment ----
    for step in range(num_steps):
        global_step += num_envs

        # Getting next action and its value
        actions, log_probs, probs, values = agent.forward(observations)
        values = values.flatten()

        # Performing actions in the environments. The environment's own reward
        # is deliberately discarded: the learned reward predictor supplies it.
        observations_, _, terminations, truncations, info = envs.step(actions)

        # An episode is over when it is terminated OR truncated. Looking only
        # at the termination flags would miss truncated episodes, so their
        # return would never be recorded and their running reward sum would
        # leak into whatever steps follow.
        dones_ = np.logical_or(terminations, truncations)

        # Predicting reward for the observations and the corresponding actions
        rewards = reward_predictor.forward(observations, actions)
        reward_sums = reward_sums + rewards

        # Saving transitions in replay buffer. `dones` is the flag from the
        # previous step, i.e. the one that belongs to `observations`.
        replay_buffer[step] = (observations, actions, log_probs, rewards, values, dones)

        # Saving new observation and done status for next step
        observations = observations_
        dones = dones_

        # Record episodic returns
        for index, done in enumerate(dones):
            if done:
                episodic_return = reward_sums[index]
                returns.append(episodic_return)
                print(f"global_step={global_step}, episodic_return={episodic_return}")

                # Resetting rewards sum
                reward_sums[index] = 0

    # ---- Checkpoint when the mean episodic return reaches a new best ----
    # Skipped while `returns` is empty: np.mean([]) is nan (and emits a
    # RuntimeWarning), which can never beat the current best anyway.
    if returns:
        current_mean_episodic_return = np.mean(returns)
        print(f"Current Mean Episodic Return = {current_mean_episodic_return}")
        if current_mean_episodic_return > best_average_return:
            # Saving the model
            agent.save_models(
                f"./models/doom_ppo_agent/rl_pipeline/training_run_{start_datetime_timestamp_str}/checkpoint_step_{global_step}"
            )

            # Saving new best average return and clearing returns arrays
            best_average_return = current_mean_episodic_return
            returns.clear()

    # ---- Training the agent on the collected rollout ----
    training_stats = agent.train(
        replay_buffer=replay_buffer,
        # learning_rate_anneal_coef=learning_rate_anneal_coef,
        mini_batch_size=mini_batch_size,
        num_training_epochs=num_training_epochs,
    )

    # Environment steps per second since the run started; computed once so
    # the printed and the logged value are the same number.
    steps_per_second = int(global_step / (time.time() - start_time))
    print("SPS:", steps_per_second)

    # ---- Logging training statistics to TensorBoard ----
    # tensorboard_writer.add_scalar("charts/learning_rate", training_stats.learning_rate, global_step)
    scalars_to_log = (
        ("losses/value_loss", training_stats.value_loss),
        ("losses/policy_loss", training_stats.policy_loss),
        ("losses/entropy_loss", training_stats.entropy_loss),
        ("charts/old_approx_kl", training_stats.old_approx_kl),
        ("charts/approx_kl", training_stats.approx_kl),
        ("charts/clip_fraction", training_stats.clip_fraction),
        ("charts/explained_variance", training_stats.explained_variance),
        ("charts/SPS", steps_per_second),
    )
    for tag, value in scalars_to_log:
        tensorboard_writer.add_scalar(tag, value, global_step)

observations.shape = (8, 4, 120, 160)
observations.dtype = uint8
observations.shape = torch.Size([8, 4, 120, 160])
observations.dtype = torch.uint8


: 

: 