# RLHF Training Pipeline for PPO Agent to Play Levels from the Doom Game

In [1]:
from datetime import datetime

# Moment this run started; the formatted stamp is reused below for log and checkpoint paths
start_datetime = datetime.now()
start_datetime_timestamp_str = f"{start_datetime:%Y_%m_%d_%H_%M_%S}"

# Agent training config
start_time = start_datetime.timestamp()  # same moment as a POSIX timestamp (float seconds)
num_envs = 8
num_steps = 256
num_mini_batches = 32
num_training_epochs = 10

# One batch holds num_steps transitions from each of the num_envs environments
batch_size = int(num_envs * num_steps)
mini_batch_size = batch_size // num_mini_batches

# Reward predictor training config
num_batches_before_collecting_feedback = 1

In [2]:
from torch.utils.tensorboard import SummaryWriter
from agents.doom_ppo_agent import DoomPpoAgent
from reward_predictors.doom_human_preference_reward_predictor import DoomHumanPreferenceRewardPredictor
from utils.time import current_timestamp_ms
import gymnasium as gym
from utils.replay_buffer import ReplayBuffer
from utils.env import make_vizdoom_env

# Initializing environments: one entry per environment, all using the "basic" scenario config.
# NOTE(review): SyncVectorEnv expects a list of callables, so make_vizdoom_env presumably
# returns an environment factory rather than an environment instance - confirm in utils.env.
scenario_config_path = 'envs/vizdoom/scenarios/basic.cfg'
env_factories = [make_vizdoom_env(scenario_config_path) for _ in range(num_envs)]
envs = gym.vector.SyncVectorEnv(env_factories)

# The agent, the reward predictor and the replay buffer all share the per-environment spaces
observation_space = envs.single_observation_space
action_space = envs.single_action_space

# Setting up agent and reward predictor
agent = DoomPpoAgent(
    observation_space,
    action_space,
    learning_rate=0.0001,
    use_gpu=True,
)
reward_predictor = DoomHumanPreferenceRewardPredictor(
    observation_space,
    action_space,
    use_gpu=True,
)

# Creating replay buffer for storing steps; the raw observation space is read from the
# first underlying environment
replay_buffer = ReplayBuffer(
    num_steps,
    num_envs,
    envs.envs[0].raw_observation_space,
    observation_space,
    action_space,
)

# Setting up debugging for Tensorboard (one log directory per run, keyed by start time)
tensorboard_log_dir = f"logs/doom_basic_level/rlhf_training_{start_datetime_timestamp_str}"
tensorboard_writer = SummaryWriter(tensorboard_log_dir)

  logger.warn(
  logger.warn(


In [3]:
from ipywidgets import widgets
from IPython.display import display, clear_output
from typing import Callable

# Function for rendering widgets
def create_html_heading(text, level=1, centered=True):
    """Build an HTML widget that renders ``text`` inside an ``<h{level}>`` tag.

    Args:
        text: Content placed between the opening and closing heading tags.
        level: Number used in the heading tag name (``1`` gives ``<h1>``).
        centered: When true, the widget gets a flex layout with centered content;
            otherwise ``layout=None`` is passed to the widget.

    Returns:
        The ``widgets.HTML`` instance.
    """
    heading_layout = None
    if centered:
        heading_layout = widgets.Layout(display='flex', justify_content='center')

    markup = f'<h{level}>{text}</h{level}>'
    return widgets.HTML(markup, layout=heading_layout)

def create_video_player(video_path: str):
    """Create a video widget for the clip stored at ``video_path``.

    Previously an unconditional early return made the ``from_file`` call unreachable,
    so the path was ignored and every player was empty.

    Args:
        video_path: Path of the video file to load. An empty string yields an empty
            placeholder player, which lets a screen be built before any clip exists.

    Returns:
        An empty ``widgets.Video`` when ``video_path`` is empty, otherwise the widget
        produced by ``widgets.Video.from_file(video_path)``.
    """
    if not video_path:
        # No clip yet: return a placeholder instead of trying to open a file
        return widgets.Video()
    return widgets.Video.from_file(video_path)

def create_button(description, tooltip):
    """Create an enabled, default-styled button with a centered flex layout.

    Args:
        description: Passed to the button as its ``description``.
        tooltip: Passed to the button as its ``tooltip``.

    Returns:
        The ``widgets.Button`` instance.
    """
    centered_layout = widgets.Layout(display='flex', justify_content='center')
    return widgets.Button(
        description=description,
        tooltip=tooltip,
        disabled=False,
        button_style='',
        layout=centered_layout,
    )

def create_loading_spinner():
    """Create the two widgets that together render a rotating loading spinner.

    Returns:
        A ``(style_widget, spinner_widget)`` tuple. The first is an HTML widget holding
        a ``<style>`` element that defines the ``.loader`` class and its ``spin``
        keyframes; the second is an HTML widget holding a ``<div class="loader">``
        inside a centered flex layout.
    """
    spinner_css = """
    .loader {
        border: 8px solid #f3f3f3;
        border-top: 8px solid #3498db;
        border-radius: 50%;
        width: 100px;
        height: 100px;
        animation: spin 2s linear infinite;
    }

    @keyframes spin {
        0% { transform: rotate(0deg); }
        100% { transform: rotate(360deg); }
    }
    """
    # The stylesheet is shipped as its own HTML widget
    style_widget = widgets.HTML('<style>' + spinner_css + '</style>')

    # The spinner itself is an empty div styled by the .loader class
    centered_layout = widgets.Layout(display='flex', justify_content='center')
    spinner_widget = widgets.HTML('<div class="loader"></div>', layout=centered_layout)

    return style_widget, spinner_widget

def create_preference_selection_screen(
    trajectory_1_video_path: str,
    trajectory_2_video_path: str,
    on_trajectory_1_chosen: Callable,
    on_trajectory_2_chosen: Callable,
    on_both_trajectories_chosen: Callable,
):
    """Assemble the screen on which a user picks between two trajectory videos.

    The screen is a vertical stack: a title, a row with one column per trajectory
    (label, video player, select button), and a final row with the "Both" button.

    Args:
        trajectory_1_video_path: Path handed to the video player of the first column.
        trajectory_2_video_path: Path handed to the video player of the second column.
        on_trajectory_1_chosen: Click handler registered on the "Select 1" button.
        on_trajectory_2_chosen: Click handler registered on the "Select 2" button.
        on_both_trajectories_chosen: Click handler registered on the "Both" button.

    Returns:
        The outer ``widgets.VBox`` holding the whole screen.
    """
    # Building the individual widgets
    title = create_html_heading('Which trajectory do you prefer?', centered=True)
    first_label = create_html_heading('Trajectory 1', level=2, centered=True)
    second_label = create_html_heading('Trajectory 2', level=2, centered=True)
    first_player = create_video_player(trajectory_1_video_path)
    second_player = create_video_player(trajectory_2_video_path)
    first_button = create_button('Select 1', 'Choose Trajectory 1')
    second_button = create_button('Select 2', 'Choose Trajectory 2')
    both_button = create_button('Both', 'Choose both Trajectories')

    # Wiring each button to its callback
    for button, handler in (
        (first_button, on_trajectory_1_chosen),
        (second_button, on_trajectory_2_chosen),
        (both_button, on_both_trajectories_chosen),
    ):
        button.on_click(handler)

    # One column per trajectory; each column gets its own Layout instance
    first_column = widgets.VBox(
        [first_label, first_player, first_button],
        layout=widgets.Layout(align_items='center', justify_content='space-between'),
    )
    second_column = widgets.VBox(
        [second_label, second_player, second_button],
        layout=widgets.Layout(align_items='center', justify_content='space-between'),
    )

    # Rows of the screen
    trajectories_row = widgets.HBox(
        [first_column, second_column],
        layout=widgets.Layout(justify_content='space-between', width='700px', height='300px'),
    )
    both_row = widgets.HBox([both_button])

    return widgets.VBox(
        [title, trajectories_row, both_row],
        layout=widgets.Layout(align_items='center', padding='15px'),
    )

def create_loading_screen(message):
    """Build a screen that shows ``message`` as a heading above a loading spinner.

    Args:
        message: Text rendered as the centered heading.

    Returns:
        A ``widgets.VBox`` containing, in order, the spinner stylesheet widget,
        the heading and the spinner, centered in a box 435px high with 15px padding.
    """
    title = create_html_heading(message, centered=True)
    spinner_style, spinner = create_loading_spinner()

    screen_layout = widgets.Layout(
        align_items='center',
        justify_content='center',
        padding='15px',
        height='435px',
    )
    return widgets.VBox([spinner_style, title, spinner], layout=screen_layout)

# Screens are built once up front; `layout` is the (initially empty) container whose
# children are swapped to switch between them.
loading_screen = create_loading_screen('Training agent...')
preference_selection_screen = create_preference_selection_screen(
    '',
    '',
    lambda _: print("Trajectory 1 chosen"),
    lambda _: print("Trajectory 2 chosen"),
    lambda _: print("Both trajectories chosen"),
)
layout = widgets.VBox([])

def hide_all_screens():
    """Empty the shared ``layout`` container so that no screen is shown."""
    no_screens = []
    layout.children = no_screens

def show_preference_selection_screen():
    """Make the preference selection screen the only child of the shared ``layout``."""
    visible_screens = [preference_selection_screen]
    layout.children = visible_screens

def show_loading_screen():
    """Make the loading screen the only child of the shared ``layout``."""
    visible_screens = [loading_screen]
    layout.children = visible_screens

# display(layout)
# hide_all_screens()
# show_loading_screen()
# time.sleep(3)
# show_preference_selection_screen()

In [3]:
import numpy as np
import time
from utils.video import generate_video_from_doom_play_segments
import random

# Rollout state shared across iterations of the training loop
global_step = 0
observations, infos = envs.reset()
terminations = [0 for _ in range(num_envs)]
best_average_return = float("-inf")
reward_sums = np.zeros(num_envs, dtype=np.float32)  # running predicted return per environment
returns = []  # episodic returns collected since the last checkpoint
segments = []

# The loop has no stop condition and is ended by interrupting the kernel; the
# try/finally makes sure buffered TensorBoard events are flushed when that happens.
try:
    while True:
        # Calculating learning rate annealing coefficient (currently disabled)
        # learning_rate_anneal_coef = 1.0 - (update - 1.0) / num_updates

        for step in range(0, num_steps):
            # Every step advances all environments by one transition
            global_step += num_envs

            # Getting next action and its value
            actions, log_probs, probs, values = agent.forward(observations)
            values = values.flatten()

            # Performing actions in the environments (environment rewards and
            # truncation flags are discarded)
            observations_, _, terminations_, _, infos = envs.step(actions)

            # Predicting reward for the observations and the corresponding actions
            rewards = reward_predictor.forward(observations, actions)
            reward_sums = reward_sums + rewards

            # Saving transitions in replay buffer
            # NOTE(review): `infos` was just overwritten by envs.step(), so the raw
            # observations stored here come from the step result while `observations`
            # is still the pre-step observation - confirm this pairing is intended.
            replay_buffer[step] = (
                np.stack(infos["raw_observations"]),
                observations,
                actions,
                log_probs,
                rewards,
                values,
                terminations
            )

            # Saving new observation and done status for next step
            observations = observations_
            terminations = terminations_

            # Record episodic returns
            for index, done in enumerate(terminations):
                if done == 1:
                    returns.append(reward_sums[index])
                    print(f"global_step={global_step}, episodic_return={reward_sums[index]}")

                    # Resetting rewards sum
                    reward_sums[index] = 0

        # Saving a random segment (skipped when there are none, because
        # random.choice raises IndexError on an empty sequence)
        segments = replay_buffer.get_episodic_segments(300)
        if len(segments) > 0:
            generate_video_from_doom_play_segments(random.choice(segments), './temp/trajectory-1.mp4', 30)

        # Checking if the current mean is higher than previous highest mean and saving the model.
        # Skipped while no episode has finished: np.mean of an empty list is NaN and
        # emits a RuntimeWarning, and a NaN mean can never beat the best return anyway.
        if returns:
            current_mean_episodic_return = np.mean(returns)
            print(f"Current Mean Episodic Return = {current_mean_episodic_return}")
            if current_mean_episodic_return > best_average_return:
                # Saving the model
                agent.save_models(
                    f"./models/rlhf_pipeline/training_run_{start_datetime_timestamp_str}/doom_ppo_agent/checkpoint_step_{global_step}"
                )

                # Saving new best average return and clearing returns arrays
                best_average_return = current_mean_episodic_return
                returns.clear()

        # Training the agent
        training_stats = agent.train(
            replay_buffer=replay_buffer,
            # learning_rate_anneal_coef=learning_rate_anneal_coef,
            mini_batch_size=mini_batch_size,
            num_training_epochs=num_training_epochs,
        )

        # Steps per second since start_time; computed once so the printed and logged values agree
        steps_per_second = int(global_step / (time.time() - start_time))
        print("SPS:", steps_per_second)

        # tensorboard_writer.add_scalar("charts/learning_rate", training_stats.learning_rate, global_step)
        tensorboard_writer.add_scalar("ppo_agent/losses/value_loss", training_stats.value_loss, global_step)
        tensorboard_writer.add_scalar("ppo_agent/losses/policy_loss", training_stats.policy_loss, global_step)
        tensorboard_writer.add_scalar("ppo_agent/losses/entropy_loss", training_stats.entropy_loss, global_step)
        tensorboard_writer.add_scalar("ppo_agent/charts/old_approx_kl", training_stats.old_approx_kl, global_step)
        tensorboard_writer.add_scalar("ppo_agent/charts/approx_kl", training_stats.approx_kl, global_step)
        tensorboard_writer.add_scalar("ppo_agent/charts/clip_fraction", training_stats.clip_fraction, global_step)
        tensorboard_writer.add_scalar("ppo_agent/charts/explained_variance", training_stats.explained_variance, global_step)
        tensorboard_writer.add_scalar("ppo_agent/charts/SPS", steps_per_second, global_step)
finally:
    # Runs on KeyboardInterrupt or any error; the exception itself still propagates
    tensorboard_writer.flush()

global_step=64, episodic_return=1.3516228199005127
global_step=80, episodic_return=2.1228954792022705
global_step=88, episodic_return=2.26633620262146
global_step=96, episodic_return=2.9798216819763184
global_step=144, episodic_return=0.7902706265449524
global_step=152, episodic_return=2.3844170570373535
global_step=160, episodic_return=1.7644915580749512
global_step=320, episodic_return=4.686269283294678
global_step=376, episodic_return=1.3535767793655396
global_step=400, episodic_return=4.815674304962158
global_step=480, episodic_return=2.5093533992767334
global_step=504, episodic_return=3.564544916152954
global_step=544, episodic_return=1.4694359302520752
global_step=568, episodic_return=1.7664660215377808
global_step=1224, episodic_return=16.17155647277832
global_step=1272, episodic_return=1.324235439300537
global_step=2032, episodic_return=50.41633224487305
Current Mean Episodic Return = 5.984547138214111
Saving models...
Directory './models/rlhf_pipeline/training_run_2023_07_31_1

KeyboardInterrupt: 

In [5]:
import cv2
from matplotlib import pyplot as plt

# observations = random.choice(segments).observations
# plt.imshow(cv2.cvtColor(observations[0][0], cv2.COLOR_GRAY2RGB))
# Re-render the preview clip from a randomly chosen segment of the latest rollout.
# `segments` is initialised to an empty list by the training cell and only filled after a
# full rollout, so guard against random.choice raising IndexError on an empty sequence.
if len(segments) > 0:
    generate_video_from_doom_play_segments(random.choice(segments), './temp/trajectory-1.mp4', 30)
else:
    print("No segments available yet - run the training cell for at least one rollout first.")