# Train PPO Agent to Play the Doom Game using a Trained Reward Predictor

In [1]:
from utils.env import make_doom_env
import gym

# Number of Doom environments that are stepped together
num_envs = 4

# Building one entry per environment slot and wrapping them in a synchronous vector env.
# SyncVectorEnv takes a list of callables, so make_doom_env presumably returns an
# environment factory rather than an environment instance — TODO confirm.
env_factories = [
    make_doom_env(level_config_path='vizdoom/scenarios/basic.cfg', render=False)
    for _ in range(num_envs)
]
envs = gym.vector.SyncVectorEnv(env_factories)


In [2]:
from torch.utils.tensorboard import SummaryWriter
from agents.ppo_agent import PpoAgent
from agents.doom_ppo_agent import DoomPpoAgent
from reward_predictors.doom_reward_predictor import DoomRewardPredictor
from utils.memory import Memory
from utils.time import current_timestamp_ms
from datetime import datetime

# Building the PPO agent and the reward predictor (constructed from a saved checkpoint path)
observation_space = envs.single_observation_space
action_space = envs.single_action_space

agent = DoomPpoAgent(observation_space, action_space, learning_rate=0.0001)
reward_predictor = DoomRewardPredictor(
    observation_space.shape,
    1,  # second positional argument; presumably the action dimensionality — TODO confirm
    model_path='./models/doom_reward_predictor/training_run_2023_07_09_11_59_50/checkpoint_step_30720',
    hidden_size=512,
    learning_rate=0.001,
)

# Run bookkeeping: step counter and wall-clock start (used later for checkpoint names and SPS)
global_step = 0
start_datetime = datetime.now()
start_time = start_datetime.timestamp()

# PPO rollout / optimisation hyperparameters
num_steps = 256  # environment steps collected per env before each update
num_mini_batches = 32
num_training_epochs = 10
total_timesteps = 1_000_000
batch_size = num_envs * num_steps
mini_batch_size = batch_size // num_mini_batches
num_updates = total_timesteps // batch_size

# Rollout storage created with the rollout length, env count and space shapes
memory = Memory(
    agent.device,
    num_steps,
    num_envs,
    observation_space.shape,
    action_space.shape,
)

# TensorBoard writer pointing at a log directory suffixed with the current timestamp
log_dir = f"logs/doom_basic_level/ppo_agent_training_with_trained_reward_predictor_{current_timestamp_ms()}"
tensorboard_writer = SummaryWriter(log_dir)

Loading models...
Successfully loaded models!
Updating networks with weights from loaded models...
Successfully updated networks!


In [3]:
import torch
import numpy as np
import time

# PPO training loop driven by the trained reward predictor.
#
# Each update collects `num_steps` transitions from every environment, stores them in
# `memory`, optionally checkpoints the agent when the mean episodic return improves,
# and then runs one PPO training pass over the collected batch.

# Resetting the environments and initialising rollout state
observation = torch.Tensor(envs.reset()).to(agent.device)
done = torch.zeros(num_envs).to(agent.device)
best_average_return = float('-inf')
returns = []

for update in range(1, num_updates + 1):
    # Calculating learning rate annealing coefficient (1.0 on the first update, decreasing linearly)
    learning_rate_anneal_coef = 1.0 - (update - 1.0) / num_updates

    for step in range(num_steps):
        global_step += num_envs

        # Getting next action and its value; the reward is taken from the
        # reward predictor rather than from the environment
        with torch.no_grad():
            action, log_prob, _, value = agent.get_optimal_action_and_value(observation)
            reward = reward_predictor.forward(observation, action)
            value = value.flatten()

        # env_reward is intentionally not used for training. To train on the true
        # environment reward instead, build `reward` from it, e.g.
        # torch.tensor(np.array(env_reward, dtype=np.float32)).to(agent.device)
        observation_, env_reward, done_, info = envs.step(action.cpu().numpy())

        # Saving experience in memory (observation/done are the values *before* this step)
        memory.remember(
            step=step,
            observation=observation,
            action=action,
            value=value,
            log_prob=log_prob,
            reward=reward.view(-1),
            done=done
        )

        # Saving new observation and done status for next step
        observation = torch.Tensor(observation_).to(agent.device)
        done = torch.Tensor(done_).to(agent.device)

        for item in info:
            if "episode" in item:
                episode_stats = item["episode"]
                print(f"global_step={global_step}, episodic_return={episode_stats['r']}")

                # Recording returns
                returns.append(episode_stats["r"])

                # Writing step debug info to TensorBoard
                tensorboard_writer.add_scalar("charts/episodic_return", episode_stats["r"], global_step)
                tensorboard_writer.add_scalar("charts/episodic_length", episode_stats["l"], global_step)
                # Only the first finished episode found at this step is recorded, so each
                # global_step gets at most one point per chart
                break

    # Checking if the current mean is higher than previous highest mean and saving the model.
    # Skipped when no episode has been recorded since the last clear: np.mean([]) would
    # return nan (with a RuntimeWarning) and could never beat the best return anyway.
    if returns:
        current_mean_episodic_return = np.mean(returns)
        if current_mean_episodic_return > best_average_return:
            # Saving the model
            agent.save_models(f"./models/doom_ppo_agent/training_run_{start_datetime.strftime('%Y_%m_%d_%H_%M_%S')}/checkpoint_step_{global_step}")

            # Saving new best average return and clearing returns arrays
            best_average_return = current_mean_episodic_return
            returns.clear()

    # Training the agent
    training_stats = agent.train(
        next_observation=observation,
        next_done=done,
        observations=memory.observations,
        actions=memory.actions,
        log_probs=memory.log_probs,
        rewards=memory.rewards,
        values=memory.values,
        dones=memory.dones,
        num_steps=num_steps,
        batch_size=batch_size,
        learning_rate_anneal_coef=learning_rate_anneal_coef,
        mini_batch_size=mini_batch_size,
        num_training_epochs=num_training_epochs
    )

    # Steps per second, computed once so the printed and logged values agree.
    # NOTE(review): start_time is set in the configuration cell, so any delay between
    # running that cell and this one lowers the reported SPS.
    steps_per_second = int(global_step / (time.time() - start_time))
    print("SPS:", steps_per_second)

    # Writing update-level training statistics to TensorBoard
    for tag, stat_value in (
        ("charts/learning_rate", training_stats.learning_rate),
        ("losses/value_loss", training_stats.value_loss),
        ("losses/policy_loss", training_stats.policy_loss),
        ("losses/entropy_loss", training_stats.entropy_loss),
        ("charts/old_approx_kl", training_stats.old_approx_kl),
        ("charts/approx_kl", training_stats.approx_kl),
        ("charts/clip_fraction", training_stats.clip_fraction),
        ("charts/explained_variance", training_stats.explained_variance),
        ("charts/SPS", steps_per_second),
    ):
        tensorboard_writer.add_scalar(tag, stat_value, global_step)


global_step=8, episodic_return=95.0
global_step=24, episodic_return=79.0
global_step=48, episodic_return=72.0
global_step=124, episodic_return=-51.0
global_step=176, episodic_return=-102.0
global_step=272, episodic_return=-161.0
global_step=280, episodic_return=95.0
global_step=300, episodic_return=-355.0
global_step=308, episodic_return=95.0
global_step=316, episodic_return=95.0
global_step=328, episodic_return=91.0
global_step=396, episodic_return=20.0
global_step=424, episodic_return=-375.0
global_step=464, episodic_return=-120.0
global_step=472, episodic_return=45.0
global_step=476, episodic_return=-365.0
global_step=552, episodic_return=6.0
global_step=564, episodic_return=-9.0
global_step=568, episodic_return=87.0
global_step=576, episodic_return=91.0
global_step=644, episodic_return=-106.0
global_step=656, episodic_return=-169.0
global_step=664, episodic_return=95.0
global_step=668, episodic_return=79.0
global_step=684, episodic_return=87.0
global_step=696, episodic_return=91.0


KeyboardInterrupt: 

In [None]:
# Saving models
# NOTE(review): unlike the checkpoint call in the training loop, no path is passed here —
# confirm that save_models() has a default destination and that saving there does not
# overwrite a checkpoint you want to keep.
agent.save_models()

In [None]:
# Releasing resources in order: the vectorized environments first, then the TensorBoard writer
for resource in (envs, tensorboard_writer):
    resource.close()