In [1]:
from utils.env import make_doom_env
import gym

# Number of environment copies stepped together by the vector wrapper
num_envs = 8

# Initializing environment: one make_doom_env(...) result per copy, all using the
# basic scenario config with rendering switched off.
# NOTE(review): make_doom_env presumably returns an env-creating callable, which is
# what SyncVectorEnv expects — confirm in utils/env.
env_factories = [
    make_doom_env(level_config_path='vizdoom/scenarios/basic.cfg', render=False)
    for _ in range(num_envs)
]
envs = gym.vector.SyncVectorEnv(env_factories)


In [2]:
# Testing observations (the rendering code below is commented out; only the batched observation shape is printed)
# from matplotlib import pyplot as plt
# import cv2

# frame = envs.reset();
# plt.imshow(cv2.cvtColor(frame[0], cv2.COLOR_BGR2RGB))

from gym.spaces import Discrete

# print(isinstance(envs.single_action_space, Discrete))
# Reset every environment once and show the shape of the stacked observations
initial_observation = envs.reset()
print(initial_observation.shape)

(8, 4, 120, 160)


In [2]:
from torch.utils.tensorboard import SummaryWriter
from agents.ppo_agent import PpoAgent
from utils.memory import Memory
from utils.time import current_timestamp_ms
import time

# Setting up agent from the per-environment observation and action spaces
agent = PpoAgent(
    envs.single_observation_space,
    envs.single_action_space,
    learning_rate=0.0001,
)

# Step counter and wall-clock reference read by the training loop for logging
global_step = 0
start_time = time.time()

# Rollout sizing: every update collects num_steps steps from each of the num_envs environments
num_steps = 256
batch_size = num_envs * num_steps
total_timesteps = 10_000_000
num_updates = total_timesteps // batch_size

# Optimisation sizing: each batch is divided into num_mini_batches equal parts
num_mini_batches = 32
mini_batch_size = batch_size // num_mini_batches
num_training_epochs = 10

# Buffer the training loop fills through memory.remember(...) and reads back when training
memory = Memory(
    agent.device,
    num_steps,
    num_envs,
    envs.single_observation_space.shape,
    envs.single_action_space.shape,
)

# Setting up debugging for Tensorboard; the log directory name carries a millisecond
# timestamp so each run of this cell writes to its own directory
tensorboard_writer = SummaryWriter(f"logs/doom_basic_level_training_log_{current_timestamp_ms()}")

In [3]:
import torch
import numpy as np

# PPO training loop: alternate between collecting num_steps steps from every
# environment and running one agent.train(...) call on the collected data.
# Uses names created by the earlier cells (envs, agent, memory, num_envs, num_steps,
# num_updates, batch_size, mini_batch_size, num_training_epochs, global_step,
# start_time, tensorboard_writer).
observation = torch.Tensor(envs.reset()).to(agent.device)
done = torch.zeros(num_envs).to(agent.device)

for update in range(1, num_updates + 1):
    # Calculating learning rate annealing coefficient: 1.0 on the first update,
    # decreasing linearly by 1 / num_updates with every following update
    learning_rate_anneal_coef = 1.0 - (update - 1.0) / num_updates

    for step in range(num_steps):
        # Every environment advances by one step, so the counter grows by num_envs
        global_step += num_envs

        # Getting next action and its value; no gradients are needed while collecting
        with torch.no_grad():
            action, log_prob, _, value = agent.get_optimal_action_and_value(observation)
            value = value.flatten()

        observation_, reward, done_, info = envs.step(action.cpu().numpy())

        # Saving experience in memory. `observation` and `done` are the values held
        # before this environment step; the new ones are stored on the next iteration.
        memory.remember(
            step=step,
            observation=observation,
            action=action,
            value=value,
            log_prob=log_prob,
            reward=torch.tensor(np.array(reward, dtype=np.float32)).to(agent.device).view(-1),
            done=done
        )

        # Saving new observation and done status for next step
        observation = torch.Tensor(observation_).to(agent.device)
        done = torch.Tensor(done_).to(agent.device)

        # Writing step debug info to TensorBoard.
        # Only the first entry carrying an "episode" record is reported for this
        # step; the `break` skips any further entries.
        for item in info:
            if "episode" in item:
                print(f"global_step={global_step}, episodic_return={item['episode']['r']}")
                tensorboard_writer.add_scalar("charts/episodic_return", item["episode"]["r"], global_step)
                tensorboard_writer.add_scalar("charts/episodic_length", item["episode"]["l"], global_step)
                break

    # The latest observation/done pair is passed separately from the stored rollout
    training_stats = agent.train(
        next_observation=observation,
        next_done=done,
        observations=memory.observations,
        actions=memory.actions,
        log_probs=memory.log_probs,
        rewards=memory.rewards,
        values=memory.values,
        dones=memory.dones,
        num_steps=num_steps,
        batch_size=batch_size,
        learning_rate_anneal_coef=learning_rate_anneal_coef,
        mini_batch_size=mini_batch_size,
        num_training_epochs=num_training_epochs
    )

    # Steps per second since start_time, computed once per update so that the
    # printed value and the value logged to TensorBoard are the same number
    steps_per_second = int(global_step / (time.time() - start_time))
    print("SPS:", steps_per_second)

    tensorboard_writer.add_scalar("charts/learning_rate", training_stats.learning_rate, global_step)
    tensorboard_writer.add_scalar("losses/value_loss", training_stats.value_loss, global_step)
    tensorboard_writer.add_scalar("losses/policy_loss", training_stats.policy_loss, global_step)
    tensorboard_writer.add_scalar("losses/entropy_loss", training_stats.entropy_loss, global_step)
    tensorboard_writer.add_scalar("charts/old_approx_kl", training_stats.old_approx_kl, global_step)
    tensorboard_writer.add_scalar("charts/approx_kl", training_stats.approx_kl, global_step)
    tensorboard_writer.add_scalar("charts/clip_fraction", training_stats.clip_fraction, global_step)
    tensorboard_writer.add_scalar("charts/explained_variance", training_stats.explained_variance, global_step)
    tensorboard_writer.add_scalar("charts/SPS", steps_per_second, global_step)


global_step=16, episodic_return=95.0
global_step=32, episodic_return=87.0
global_step=64, episodic_return=66.0
global_step=240, episodic_return=-23.0
global_step=256, episodic_return=95.0
global_step=288, episodic_return=-62.0
global_step=304, episodic_return=95.0
global_step=528, episodic_return=-226.0
global_step=600, episodic_return=-370.0
global_step=616, episodic_return=95.0
global_step=632, episodic_return=95.0
global_step=664, episodic_return=-355.0
global_step=696, episodic_return=71.0
global_step=704, episodic_return=54.0
global_step=752, episodic_return=47.0
global_step=768, episodic_return=12.0
global_step=856, episodic_return=-365.0
global_step=888, episodic_return=-10.0
global_step=904, episodic_return=-375.0
global_step=912, episodic_return=6.0
global_step=936, episodic_return=87.0
global_step=984, episodic_return=46.0
global_step=1000, episodic_return=95.0
global_step=1032, episodic_return=87.0
global_step=1064, episodic_return=87.0
global_step=1096, episodic_return=-9.0

In [None]:
# Saving models
# Delegates to the agent's own save_models(); what is written and where is decided
# inside PpoAgent and is not visible from this notebook.
agent.save_models()

In [4]:
# Closing environments
# Releases the vectorised environment wrapper created in the first cell.
envs.close()
# Closing tensorboard writer
# Run this after training so the writer is shut down once no more scalars are added.
tensorboard_writer.close()