# RLHF Training Pipeline for PPO Agent to Play Levels from the Doom Game

## Creating UI Elements

In [1]:
from ipywidgets import widgets
from typing import Callable
from datetime import datetime

# Function for rendering widgets
def create_html_heading(text, level=1, centered=True):
    """Build an HTML heading widget.

    Args:
        text: Text placed inside the heading tag (inserted as-is, not escaped).
        level: Heading level used for the ``<hN>`` tag.
        centered: When true, the widget gets a flex layout that centres its
            content horizontally; otherwise ``None`` is passed as the layout.

    Returns:
        The ``widgets.HTML`` instance wrapping the heading markup.
    """
    if centered:
        heading_layout = widgets.Layout(display='flex', justify_content='center')
    else:
        heading_layout = None

    markup = '<h{0}>{1}</h{0}>'.format(level, text)
    return widgets.HTML(markup, layout=heading_layout)

def create_video_player(video_path: str):
    """Return a ``widgets.Video`` built from the file at ``video_path``."""
    video_player = widgets.Video.from_file(video_path)
    return video_player

def create_button(description, tooltip, button_style=''):
    """Build an enabled button with a centred flex layout and a 4px margin.

    Args:
        description: Label shown on the button.
        tooltip: Text shown when hovering over the button.
        button_style: Style name forwarded to ``widgets.Button``.

    Returns:
        The configured ``widgets.Button``.
    """
    button_layout = widgets.Layout(display='flex', justify_content='center', margin='4px')
    return widgets.Button(
        description=description,
        tooltip=tooltip,
        button_style=button_style,
        disabled=False,
        layout=button_layout,
    )

def create_loading_spinner():
    """Build the widgets for a CSS-animated loading spinner.

    Returns:
        A ``(style_widget, spinner_widget)`` tuple. The first element is an
        HTML widget holding the ``<style>`` block that defines the ``.loader``
        class; the second is the centred ``<div class="loader">`` itself.
        Both must be displayed for the spinner to be styled.
    """
    spinner_css = """
    .loader {
        border: 8px solid #f3f3f3;
        border-top: 8px solid #3498db;
        border-radius: 50%;
        width: 100px;
        height: 100px;
        animation: spin 2s linear infinite;
    }

    @keyframes spin {
        0% { transform: rotate(0deg); }
        100% { transform: rotate(360deg); }
    }
    """
    # Widget that injects the stylesheet defining the spinner animation
    style_widget = widgets.HTML('<style>' + spinner_css + '</style>')

    # Widget holding the element the stylesheet animates
    centered_layout = widgets.Layout(display='flex', justify_content='center')
    spinner_widget = widgets.HTML('<div class="loader"></div>', layout=centered_layout)

    return style_widget, spinner_widget

def create_preference_selection_component(trajectory_1_video_path: str, trajectory_2_video_path: str, on_trajectory_1_chosen: Callable, on_trajectory_2_chosen: Callable, on_both_trajectories_chosen: Callable):
    """Build the screen on which the user compares two trajectory videos.

    The screen shows a title, two side-by-side columns (heading, video player
    and a "Select" button each) and a "Both" button underneath.

    Args:
        trajectory_1_video_path: Video file shown in the left column.
        trajectory_2_video_path: Video file shown in the right column.
        on_trajectory_1_chosen: Click handler for the "Select 1" button.
        on_trajectory_2_chosen: Click handler for the "Select 2" button.
        on_both_trajectories_chosen: Click handler for the "Both" button.

    Returns:
        The ``widgets.VBox`` containing the whole screen.
    """
    def build_trajectory_column(heading_text, video_path, select_button):
        # One column: heading on top, video in the middle, button at the bottom
        column_layout = widgets.Layout(align_items='center', justify_content='space-between', width='40%', height='100%')
        return widgets.VBox(
            [
                create_html_heading(heading_text, level=2, centered=True),
                create_video_player(video_path),
                select_button,
            ],
            layout=column_layout,
        )

    # Title spanning the top of the screen
    title = create_html_heading('Which trajectory do you prefer?', centered=True)
    title.layout.flex = '1'

    # Buttons and their click handlers
    select_1_button = create_button('Select 1', 'You prefer Trajectory 1')
    select_2_button = create_button('Select 2', 'You prefer Trajectory 2')
    select_both_button = create_button('Both', 'You prefer both Trajectories')
    select_1_button.on_click(on_trajectory_1_chosen)
    select_2_button.on_click(on_trajectory_2_chosen)
    select_both_button.on_click(on_both_trajectories_chosen)

    # Assembling the rows of the screen
    videos_row = widgets.HBox(
        [
            build_trajectory_column('Trajectory 1', trajectory_1_video_path, select_1_button),
            build_trajectory_column('Trajectory 2', trajectory_2_video_path, select_2_button),
        ],
        layout=widgets.Layout(justify_content='space-between', flex='4', width='100%'),
    )
    both_row = widgets.HBox(
        [select_both_button],
        layout=widgets.Layout(justify_content='space-between', flex='1'),
    )

    return widgets.VBox(
        [title, videos_row, both_row],
        layout=widgets.Layout(align_items='center', justify_content='space-between'),
    )

def create_loading_component(message):
    """Build a loading screen: ``message`` as a heading above a spinner.

    Args:
        message: Text shown in the heading.

    Returns:
        A ``widgets.VBox`` holding the spinner stylesheet, the heading and
        the spinner, centred on both axes.
    """
    spinner_style, spinner = create_loading_spinner()
    message_heading = create_html_heading(message, centered=True)

    # The stylesheet widget goes first so the spinner below it is styled
    return widgets.VBox(
        [spinner_style, message_heading, spinner],
        layout=widgets.Layout(align_items='center', justify_content='center'),
    )

def create_logs_component():
    """Build the logs panel.

    Returns:
        A ``(panel, output)`` tuple: the ``widgets.VBox`` to place in the
        layout, and the ``widgets.Output`` inside it to which log lines are
        appended.
    """
    output_area = widgets.Output(description="Output")

    # Scrollable container so long logs do not stretch the panel
    scroll_box = widgets.Box(
        [output_area],
        layout=widgets.Layout(flex='1', width='98%', overflow_y='scroll'),
    )
    panel = widgets.VBox([widgets.HTML('<h1>Logs</h1>'), scroll_box])

    return panel, output_area

# Module-level UI singletons shared by the logging and screen-switching helpers:
# - logs_component: the logs panel placed next to whichever screen is shown
# - logs_output: the Output widget that log_info()/log_error() append to
# - layout: the root container; its children are replaced to switch screens
logs_component, logs_output = create_logs_component()
layout = widgets.HBox([])

def log_info(message: str):
    """Append a timestamped ``[INFO]`` line to the logs panel (stdout stream)."""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logs_output.append_stdout(f"[{timestamp}][INFO] {message}\n")

def log_error(message: str):
    """Append a timestamped ``[ERROR]`` line to the logs panel (stderr stream)."""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logs_output.append_stderr(f"[{timestamp}][ERROR] {message}\n")

def hide_all_screens():
    """Remove every child from the root ``layout`` container, leaving it empty."""
    layout.children = []

def show_preference_selection_screen(trajectory_1_video_path: str, trajectory_2_video_path: str, on_trajectory_1_chosen: Callable, on_trajectory_2_chosen: Callable, on_both_trajectories_chosen: Callable):
    """Show the preference-selection screen next to the logs panel.

    Builds a fresh preference-selection component from the given video paths
    and click handlers, gives it and the shared logs panel the same framing,
    and makes the two of them the only children of the root ``layout``.
    """
    preference_selection_component = create_preference_selection_component(trajectory_1_video_path, trajectory_2_video_path, on_trajectory_1_chosen, on_trajectory_2_chosen, on_both_trajectories_chosen)

    # Framing shared by both panels so they sit side by side at equal size
    panel_style = {
        'padding': '15px',
        'margin': '7.5px',
        'border': '3px dashed cornflowerblue',
        'flex': '1',
        'height': '600px',
    }
    for panel in (preference_selection_component, logs_component):
        for attribute, value in panel_style.items():
            setattr(panel.layout, attribute, value)

    layout.children = [ preference_selection_component, logs_component ]

def show_loading_screen(message: str):
    """Show a loading screen with ``message`` next to the logs panel.

    Builds a fresh loading component, gives it and the shared logs panel the
    same framing, and makes the two of them the only children of the root
    ``layout``.
    """
    loading_component = create_loading_component(message)

    # Framing shared by both panels so they sit side by side at equal size
    panel_style = {
        'padding': '15px',
        'margin': '7.5px',
        'border': '3px dashed cornflowerblue',
        'flex': '1',
        'height': '600px',
    }
    for panel in (loading_component, logs_component):
        for attribute, value in panel_style.items():
            setattr(panel.layout, attribute, value)

    layout.children = [ loading_component, logs_component ]

## Training Code

In [2]:
import numpy as np
import time
from utils.video import generate_video_from_doom_play_segments
import random
import time
from IPython.display import display, clear_output
import asyncio
import traceback
from datetime import datetime
from utils.replay_buffer import ReplayBuffer
from torch.utils.tensorboard import SummaryWriter
from agents.doom_ppo_agent import DoomPpoAgent
from reward_predictors.doom_human_preference_reward_predictor import DoomHumanPreferenceRewardPredictor
import gymnasium as gym

# Recording start time
# Recording the moment this cell was executed:
# - start_datetime_timestamp_str names the TensorBoard run and checkpoint directories
# - start_time (epoch seconds) is the reference point for the SPS figure logged in train()
start_datetime = datetime.now()
start_datetime_timestamp_str = start_datetime.strftime("%Y_%m_%d_%H_%M_%S")
start_time = start_datetime.timestamp()

def ask_user_for_preference(trajectory_1_video_path: str, trajectory_2_video_path: str):
    """Show the preference screen and return a future for the user's answer.

    Args:
        trajectory_1_video_path: Video file for the first trajectory.
        trajectory_2_video_path: Video file for the second trajectory.

    Returns:
        An ``asyncio.Future`` that resolves to ``0`` when trajectory 1 is
        chosen, ``1`` when trajectory 2 is chosen, and ``0.5`` when "Both"
        is chosen. Await it to suspend until the user clicks a button.
    """
    future = asyncio.Future()

    def resolve_with(preference):
        # Only the first click counts: the buttons stay on screen after the
        # future resolves, and calling set_result() on a finished future
        # raises InvalidStateError.
        def on_click(_):
            if not future.done():
                future.set_result(preference)
        return on_click

    show_preference_selection_screen(
        trajectory_1_video_path,
        trajectory_2_video_path,
        resolve_with(0),
        resolve_with(1),
        resolve_with(0.5),
    )

    return future

def create_random_pairs(elements):
    """Randomly group ``elements`` into disjoint pairs.

    The input is copied before shuffling, so the caller's sequence is left
    untouched. When the number of elements is odd, the single element left
    over after shuffling is dropped instead of raising ``IndexError``.

    Args:
        elements: Iterable of items to pair up.

    Returns:
        A list of 2-tuples; its length is ``len(elements) // 2``.
    """
    shuffled_elements = list(elements)
    random.shuffle(shuffled_elements)

    # Pairing even-indexed items with their odd-indexed successors; zip stops
    # at the shorter slice, which discards an unpaired trailing element.
    return list(zip(shuffled_elements[0::2], shuffled_elements[1::2]))

async def pre_train(envs: gym.vector.SyncVectorEnv, 
                    agent: DoomPpoAgent, 
                    reward_predictor: DoomHumanPreferenceRewardPredictor,
                    pipeline_args: dict,
                    reward_predictor_args: dict) -> None:
    """Pre-train the reward predictor from human preferences.

    Repeats two phases until ``pipeline_args['num_pre_train_requests']``
    preferences have been collected:

    1. Roll the agent out for ``replay_buffer.num_steps_per_env`` steps and
       store the transitions in a replay buffer. The agent itself is not
       trained here, and the environment's own rewards are discarded.
    2. Cut the buffer into segments, pair them at random, render each pair
       to ``./temp/trajectory_1.mp4`` / ``./temp/trajectory_2.mp4``, wait
       for the user's choice and train the reward predictor on it.

    Args:
        envs: Vectorised environments to explore.
        agent: Agent whose ``forward`` picks the actions.
        reward_predictor: Model trained on the collected preferences.
        pipeline_args: Reads ``env_replay_buffer_size``,
            ``num_frames_in_trajectory_video`` and ``num_pre_train_requests``.
        reward_predictor_args: Reads ``num_training_epochs``.
    """
    # Creating replay buffer for storing transition for training
    replay_buffer = ReplayBuffer(pipeline_args.get('env_replay_buffer_size'), 
                                 envs.num_envs, 
                                 envs.envs[0].raw_observation_space, 
                                 envs.single_observation_space, 
                                 envs.single_action_space)

    # Initializing pre-training variables 
    # NOTE(review): global_step is incremented below but never read in this function
    global_step = 0
    observations, infos = envs.reset()
    terminations = [0 for _ in range(envs.num_envs)]
    current_num_preference_requests = 0
    all_preferences_collected = False

    while not all_preferences_collected:
        show_loading_screen(f"Exploring Enviroment to Pre-Train Reward Predictor...")
        
        for step in range(0, replay_buffer.num_steps_per_env):
            global_step += envs.num_envs

            # Getting next action and its value
            actions, log_probs, probs, values = agent.forward(observations)
            values = values.flatten()
            
            # Performing actions in the environments (environment rewards and truncation flags are ignored)
            observations_, _, terminations_, _, infos = envs.step(actions)

            # Predicting rewards from the pre-step observations (actions are not passed to the predictor)
            rewards = reward_predictor.forward(observations)

            # Saving transitions in replay buffer; `terminations` still holds the flags
            # from before this step (all zeros on the very first step)
            replay_buffer[step] = (
                np.stack(infos["raw_observations"]),
                observations,
                actions,
                log_probs,
                rewards,
                values,
                terminations
            )

            # Saving new observation and termination status for next step
            observations = observations_
            terminations = terminations_

        # Preparing trajectories and videos for asking user for preference
        show_loading_screen("Preparing trajectories for asking user for preference...")
        segments = replay_buffer.get_segments(segment_length=pipeline_args.get('num_frames_in_trajectory_video'))
        trajectory_pairs = create_random_pairs(segments)

        # Collecting human preferences and training
        for trajectory_1, trajectory_2 in trajectory_pairs:
            # Both videos are written to fixed paths, so each pair overwrites the previous one
            generate_video_from_doom_play_segments(trajectory_1, './temp/trajectory_1.mp4')
            generate_video_from_doom_play_segments(trajectory_2, './temp/trajectory_2.mp4')
            
            # Asking user for their preference (suspends until a button is clicked)
            user_preference = await ask_user_for_preference('./temp/trajectory_1.mp4', './temp/trajectory_2.mp4')

            # Training reward predictor based on the user preference
            log_info(f"user_preference = {user_preference}")
            show_loading_screen("Pre-training reward predictor...")
            reward_predictor_training_stats = reward_predictor.train(trajectory_1, trajectory_2, user_preference, reward_predictor_args.get('num_training_epochs'))
            log_info(f"reward_predictor_training_loss={reward_predictor_training_stats['loss']}")
            current_num_preference_requests += 1

            log_info(f"Recieved {current_num_preference_requests} preferences!")

            # Exiting the preference collection loop if the target number of requests is met;
            # the flag also ends the outer exploration loop
            if current_num_preference_requests >= pipeline_args.get('num_pre_train_requests'):
                all_preferences_collected = True
                break

async def train(envs: gym.vector.SyncVectorEnv, 
                agent: DoomPpoAgent, 
                reward_predictor: DoomHumanPreferenceRewardPredictor,
                pipeline_args: dict,
                agent_args: dict,
                reward_predictor_args: dict):
    """Train the PPO agent on predicted rewards, with periodic human feedback.

    Each update collects one rollout of ``replay_buffer.num_steps_per_env``
    steps using rewards from ``reward_predictor`` (environment rewards are
    only tracked for reporting), saves a checkpoint, and runs one
    ``agent.train`` pass with a linearly annealed learning-rate coefficient.
    Every ``training_phase_human_feedback_interval`` agent updates, segments
    of the latest rollout are shown to the user in random pairs and the
    reward predictor is trained on each answer.

    Args:
        envs: Vectorised environments to act in.
        agent: PPO agent being trained.
        reward_predictor: Model supplying the training reward.
        pipeline_args: Reads ``env_replay_buffer_size``, ``total_timesteps``,
            ``track_stats``, ``num_frames_in_trajectory_video`` and
            ``training_phase_human_feedback_interval`` (a falsy interval
            disables human feedback during training).
        agent_args: Reads ``num_mini_batches``, ``num_training_epochs`` and
            the PPO hyper-parameters forwarded to ``agent.train``.
        reward_predictor_args: Reads ``num_training_epochs``.
    """
    # Setting up other training config 
    batch_size = int(envs.num_envs * pipeline_args.get('env_replay_buffer_size'))
    mini_batch_size = batch_size // agent_args.get('num_mini_batches')
    num_updates = pipeline_args.get('total_timesteps') // batch_size
    track_stats = bool(pipeline_args.get('track_stats'))

    print(f"num_training_epochs={agent_args.get('num_training_epochs')}")

    # Creating replay buffer for storing transition for training
    replay_buffer = ReplayBuffer(pipeline_args.get('env_replay_buffer_size'), 
                                 envs.num_envs, 
                                 envs.envs[0].raw_observation_space, 
                                 envs.single_observation_space, 
                                 envs.single_action_space)

    # Setting up Tensorboard for saving training stats if requested
    tensorboard_writer = None
    if track_stats:
        tensorboard_writer = SummaryWriter(f"logs/doom_basic_level/rlhf_training_{start_datetime_timestamp_str}")

    # Initializing variables for tracking the training process 
    global_step = 0
    agent_training_iteration = 0
    observations, _ = envs.reset()
    terminations = [0 for _ in range(envs.num_envs)]
    reward_predictor_reward_sums = np.zeros(envs.num_envs, dtype=np.float32)
    env_reward_sums = np.zeros(envs.num_envs, dtype=np.float32)
    reward_predictor_episodic_returns = []

    try:
        for update in range(1, num_updates + 1):
            show_loading_screen("Training PPO Agent...")

            for step in range(replay_buffer.num_steps_per_env):
                global_step += envs.num_envs

                # Getting next action and its value
                actions, log_probs, _, values = agent.forward(observations)
                values = values.flatten()

                # Performing actions in the environments
                # NOTE(review): truncation flags are ignored, so an episode that ends by
                # truncation is not reported below -- confirm this is intended
                observations_, env_rewards, terminations_, _, infos = envs.step(actions)

                # Predicting rewards from the pre-step observations; the environment
                # rewards are accumulated for reporting only
                rewards = reward_predictor.forward(observations)
                reward_predictor_reward_sums = reward_predictor_reward_sums + rewards
                env_reward_sums = env_reward_sums + env_rewards

                # Saving transitions in replay buffer; `terminations` still holds the
                # flags from before this step
                replay_buffer[step] = (
                    np.stack(infos["raw_observations"]),
                    observations,
                    actions,
                    log_probs,
                    rewards,
                    values,
                    terminations
                )

                # Saving new observation and termination status for next step
                observations = observations_
                terminations = terminations_

                # Record episodic returns based on reward predictor rewards and environment rewards
                for index, termination in enumerate(terminations):
                    if termination == 1:
                        reward_predictor_reward_sum = reward_predictor_reward_sums[index]
                        env_reward_sum = env_reward_sums[index]

                        reward_predictor_episodic_returns.append(reward_predictor_reward_sum)

                        log_info(f"global_step={global_step}, episodic_reward_predictor_return={reward_predictor_reward_sum}")
                        log_info(f"global_step={global_step}, episodic_env_return={env_reward_sum}")

                        # Writing reward stats to TensorBoard
                        if tensorboard_writer is not None:
                            tensorboard_writer.add_scalar("ppo_agent/charts/episodic_reward_predictor_return", reward_predictor_reward_sum, global_step)
                            tensorboard_writer.add_scalar("ppo_agent/charts/episodic_env_return", env_reward_sum, global_step)

                        # Resetting rewards sum
                        reward_predictor_reward_sums[index] = 0
                        env_reward_sums[index] = 0

            # Calculating current mean episodic return. A rollout in which no episode
            # finished has no returns to average; np.mean of an empty list would
            # yield nan and emit a RuntimeWarning.
            if reward_predictor_episodic_returns:
                current_mean_episodic_return = np.mean(reward_predictor_episodic_returns)
                reward_predictor_episodic_returns.clear()
                log_info(f"Current Mean Episodic Return = {current_mean_episodic_return}")
            else:
                log_info("Current Mean Episodic Return = n/a (no episode finished during this rollout)")

            # Saving a checkpoint on every update (written before this update's agent.train call)
            model_save_path = f"./models/rlhf_pipeline/training_run_{start_datetime_timestamp_str}/doom_ppo_agent/checkpoint_step_{global_step}"
            log_info(f"Saving models to `{model_save_path}`...")
            agent.save_models(model_save_path)
            log_info(f"Successfully saved models to `{model_save_path}`!")

            # Calculating learning rate annealing coefficient for the agent
            # (1.0 on the first update, decreasing linearly towards 0)
            lr_anneal_coef = 1.0 - (update - 1.0) / num_updates

            # Training the agent
            agent_training_stats = agent.train(
                replay_buffer=replay_buffer,
                gamma=agent_args.get('gamma'),
                enable_gae=agent_args.get('enable_gae'),
                gae_lambda=agent_args.get('gae_lambda'),
                clip_value_loss=agent_args.get('clip_value_loss'),
                clip_coef=agent_args.get('clip_coef'),
                max_norm_grad=agent_args.get('max_norm_grad'),
                value_coef=agent_args.get('value_coef'),
                entropy_coef=agent_args.get('entropy_coef'),
                lr_anneal_coef=lr_anneal_coef,
                target_kl=agent_args.get('target_kl'),
                norm_adv=agent_args.get('norm_adv'),
                mini_batch_size=mini_batch_size,
                num_training_epochs=agent_args.get('num_training_epochs'),
            )
            agent_training_iteration += 1

            # Steps per second, measured from the module-level start time
            steps_per_second = int(global_step / (time.time() - start_time))
            log_info(f"SPS: {steps_per_second}")

            if tensorboard_writer is not None:
                tensorboard_writer.add_scalar("charts/learning_rate", agent_training_stats.learning_rate, global_step)
                tensorboard_writer.add_scalar("ppo_agent/losses/value_loss", agent_training_stats.value_loss, global_step)
                tensorboard_writer.add_scalar("ppo_agent/losses/policy_loss", agent_training_stats.policy_loss, global_step)
                tensorboard_writer.add_scalar("ppo_agent/losses/entropy_loss", agent_training_stats.entropy_loss, global_step)
                tensorboard_writer.add_scalar("ppo_agent/charts/old_approx_kl", agent_training_stats.old_approx_kl, global_step)
                tensorboard_writer.add_scalar("ppo_agent/charts/approx_kl", agent_training_stats.approx_kl, global_step)
                tensorboard_writer.add_scalar("ppo_agent/charts/clip_fraction", agent_training_stats.clip_fraction, global_step)
                tensorboard_writer.add_scalar("ppo_agent/charts/explained_variance", agent_training_stats.explained_variance, global_step)
                tensorboard_writer.add_scalar("ppo_agent/charts/SPS", steps_per_second, global_step)

            # Asking for human feedback every `feedback_interval` agent updates;
            # a falsy interval (0 / None) turns feedback off instead of failing on the modulo
            feedback_interval = pipeline_args.get('training_phase_human_feedback_interval')
            if feedback_interval and agent_training_iteration % feedback_interval == 0:
                # Preparing trajectories and videos for asking user for preference
                show_loading_screen("Preparing trajectories for asking user for preference...")
                segments = replay_buffer.get_segments(segment_length=pipeline_args.get('num_frames_in_trajectory_video'))

                trajectory_pairs = create_random_pairs(segments)

                for trajectory_1, trajectory_2 in trajectory_pairs:
                    # Both videos are written to fixed paths, so each pair overwrites the previous one
                    generate_video_from_doom_play_segments(trajectory_1, './temp/trajectory_1.mp4')
                    generate_video_from_doom_play_segments(trajectory_2, './temp/trajectory_2.mp4')

                    # Asking user for their preference (suspends until a button is clicked)
                    user_preference = await ask_user_for_preference('./temp/trajectory_1.mp4', './temp/trajectory_2.mp4')
                    log_info(f"user_preference = {user_preference}")

                    # Training reward predictor based on the user preference
                    show_loading_screen("Training reward predictor...")
                    reward_predictor_training_stats = reward_predictor.train(trajectory_1, trajectory_2, user_preference, reward_predictor_args.get('num_training_epochs'))
                    log_info(f"reward_predictor_training_loss={reward_predictor_training_stats['loss']}")

                    if tensorboard_writer is not None:
                        # Writing loss value to tensorboard
                        # NOTE(review): this is the reward predictor's loss but it is filed
                        # under the "ppo_agent/" tag; kept as-is so existing runs stay comparable
                        tensorboard_writer.add_scalar("ppo_agent/losses/loss", reward_predictor_training_stats["loss"], global_step)
    finally:
        # Flushing and releasing the TensorBoard event file even if training fails
        if tensorboard_writer is not None:
            tensorboard_writer.close()

async def run_training_pipeline(envs: gym.vector.SyncVectorEnv, 
                                agent: DoomPpoAgent, 
                                reward_predictor: DoomHumanPreferenceRewardPredictor,
                                pipeline_args: dict,
                                agent_args: dict,
                                reward_predictor_args: dict):
    """Run the optional pre-training phase followed by the training phase.

    Pre-training runs only when ``pipeline_args['enable_pre_training']`` is
    truthy. Any ``Exception`` raised by either phase is caught here and its
    stack trace printed, so the coroutine itself never raises one.
    """
    try:
        if pipeline_args.get('enable_pre_training'):
            await pre_train(
                envs=envs,
                agent=agent,
                reward_predictor=reward_predictor,
                pipeline_args=pipeline_args,
                reward_predictor_args=reward_predictor_args,
            )

        await train(
            envs=envs,
            agent=agent,
            reward_predictor=reward_predictor,
            pipeline_args=pipeline_args,
            agent_args=agent_args,
            reward_predictor_args=reward_predictor_args,
        )
    except Exception:
        # Printing the entire stack trace instead of letting the task fail silently
        traceback.print_exc()


## Running Training Pipeline

### Arguments

In [3]:
# Settings for the pipeline as a whole (environment set-up, rollout size, human feedback cadence)
pipeline_args = {
    # Scenario config path (not read inside pre_train()/train())
    'env_cfg': 'envs/vizdoom/scenarios/basic.cfg',
    # Divided by the rollout batch size in train() to get the number of PPO updates
    'total_timesteps': 300000,
    # Not read inside pre_train()/train()
    'render_env': True,
    # NOTE(review): not referenced by pre_train()/train(); train() saves a checkpoint on every update
    'model_save_threshold': 4000,
    # Passed as `use_gpu` to both the agent and the reward predictor
    'enable_gpu': True,
    # Enables TensorBoard logging in train()
    'track_stats': True,
    # Number of environments created for the vectorised env
    'num_envs': 8,
    # First argument of ReplayBuffer; multiplied by the env count to get the PPO batch size
    'env_replay_buffer_size': 256,
    # When truthy, pre_train() runs before train()
    'enable_pre_training': False,
    # Number of preferences pre_train() collects before it stops
    'num_pre_train_requests': 250,
    # Passed as `segment_length` to replay_buffer.get_segments()
    'num_frames_in_trajectory_video': 64,
    # train() asks for human feedback every this many agent updates
    'training_phase_human_feedback_interval': 100
}

# Settings for the PPO agent
agent_args = {
    # Passed to the DoomPpoAgent constructor
    'learning_rate': 0.0001,
    # NOTE(review): not read by train(), which always passes an annealing coefficient to agent.train()
    'anneal_lr': True,
    # The following three are forwarded unchanged to agent.train()
    'enable_gae': True,
    'gamma': 0.99,
    'gae_lambda': 0.95,
    # train() divides the batch size by this to get `mini_batch_size`
    'num_mini_batches': 32,
    # Forwarded to agent.train() as `num_training_epochs`
    'num_training_epochs': 10,
    # The remaining entries are forwarded unchanged to agent.train()
    'norm_adv': True,
    'clip_coef': 0.1,
    'clip_value_loss': True,
    'entropy_coef': 0.01,
    'value_coef': 0.5,
    'max_norm_grad': 0.5,
    'target_kl': None
}

# Settings for the human-preference reward predictor
reward_predictor_args = {
    # The following two are passed to the DoomHumanPreferenceRewardPredictor constructor
    'learning_rate': 1e-4,
    'hidden_layer_size': 64,
    # Epoch count passed to reward_predictor.train() for every collected preference
    'num_training_epochs': 1,
    # When truthy, models are loaded from this path before training starts
    'model_path': './models/rlhf_pipeline/training_run_2023_08_31_14_33_02/doom_reward_predictor/pre_training'
}

In [4]:
%gui asyncio

from utils.env import make_vizdoom_env

# Displaying UI and showing only the loading screen
clear_output()
display(layout)
hide_all_screens()

# Initializing environments with only the first enviroment rendering
num_envs = pipeline_args.get('num_envs')
envs = gym.vector.SyncVectorEnv([ make_vizdoom_env('envs/vizdoom/scenarios/basic.cfg', render_mode="human" if i == 0 else None) for i in range(num_envs)])

# Setting up agent and reward predictor
agent = DoomPpoAgent(envs.single_observation_space,
                     envs.single_action_space,
                     learning_rate=agent_args.get('learning_rate'),
                     use_gpu=pipeline_args.get('enable_gpu'))
reward_predictor = DoomHumanPreferenceRewardPredictor(envs.single_observation_space,
                                                      hidden_layer_size=reward_predictor_args.get('hidden_layer_size'), 
                                                      learning_rate=reward_predictor_args.get('learning_rate'),
                                                      use_gpu=pipeline_args.get('enable_gpu'))

if reward_predictor_args.get('model_path'):
    reward_predictor.load_models(reward_predictor_args.get('model_path'))

# Starting training process
event_loop = asyncio.get_event_loop()
training_task = event_loop.create_task(run_training_pipeline(envs, 
                                                             agent, 
                                                             reward_predictor,
                                                             pipeline_args=pipeline_args,
                                                             agent_args=agent_args,
                                                             reward_predictor_args=reward_predictor_args))

HBox()

  logger.warn(


num_training_epochs=10


  logger.warn(
