In [16]:
from utils.env import make_doom_env
import gym

# Building the vectorized Doom environment for the "basic" scenario.
# The list holds one entry, so a single environment is wrapped.
# NOTE(review): make_doom_env presumably returns an environment factory (callable),
# which is what AsyncVectorEnv expects for each list entry — confirm in utils.env.
doom_env_factories = [make_doom_env(level_config_path='vizdoom/scenarios/basic.cfg')]
envs = gym.vector.AsyncVectorEnv(doom_env_factories)


In [9]:
# Testing observations by rendering  
from matplotlib import pyplot as plt
import cv2

# Resetting the environments and displaying the observation at index 0 as a sanity check
frame = envs.reset()
# The observation is converted from BGR to RGB channel order before plotting
rgb_frame = cv2.cvtColor(frame[0], cv2.COLOR_BGR2RGB)
plt.imshow(rgb_frame)

In [3]:
from torch.utils.tensorboard import SummaryWriter
from agents.ppo_agent import PpoAgent
from utils.memory import Memory
from utils.time import current_timestamp_ms
import time

# Setting up agent
agent = PpoAgent(envs.observation_space, envs.action_space)

# Setting up agent training config
global_step = 0                 # running count of environment steps, advanced by the training loop
start_time = time.time()        # reference point for the steps-per-second statistic
num_envs = envs.num_envs
num_steps = 128                 # steps taken in every environment per update
num_mini_batches = 4
batch_size = int(num_envs * num_steps)
# Floor division keeps the size an exact integer without a float round trip
mini_batch_size = batch_size // num_mini_batches
total_timesteps = 10000000
# Floor division is required here: num_updates is used as a range() bound by the
# training loop, and range() raises TypeError when given a float
num_updates = total_timesteps // batch_size
memory = Memory(agent.device, num_steps, num_envs, envs.observation_space.shape, envs.action_space.shape)

# Setting up debugging for Tensorboard (each run gets its own timestamped log directory)
tensorboard_writer = SummaryWriter(f"runs/run_{current_timestamp_ms()}")

TypeError: unsupported operand type(s) for /: 'tuple' and 'int'

In [4]:
import torch

# Resetting the environments to obtain the first observation; `done` starts as all zeros
observation = torch.Tensor(envs.reset()).to(agent.device)
done = torch.zeros(num_envs).to(agent.device)

# range() only accepts integers; int() makes this cell work even if num_updates
# was produced by true division (and is therefore a float)
total_updates = int(num_updates)

for update in range(1, total_updates + 1):
    # Calculating learning rate annealing coefficient (1.0 on the first update, decreasing linearly)
    # NOTE(review): this coefficient is not passed to the agent anywhere in this cell — confirm
    # whether agent.train is supposed to receive it.
    learning_rate_anneal_coef = 1.0 - (update - 1.0) / total_updates

    # Collecting num_steps transitions from every environment
    for step in range(0, num_steps):
        global_step += num_envs

        # Getting next action and its value
        with torch.no_grad():
            action, log_prob, _, value = agent.get_optimal_action_and_value(observation)
            value = value.flatten()

        observation_, reward, done_, info = envs.step(action.cpu().numpy())

        # Saving experience in memory
        # NOTE(review): `value` is computed above but never handed to memory.remember, although
        # memory.values is read by agent.train below — verify against Memory.remember's signature.
        memory.remember(
            step=step,
            observation=observation,
            action=action,
            log_prob=log_prob,
            reward=torch.tensor(reward).to(agent.device).view(-1),
            done=done
        )

        # Saving new observation and done status for next step
        observation = torch.Tensor(observation_).to(agent.device)
        done = torch.Tensor(done_).to(agent.device)

        # Writing step debug info to TensorBoard (only the first finished episode found is reported)
        for item in info:
            if "episode" in item:
                print(f"global_step={global_step}, episodic_return={item['episode']['r']}")
                tensorboard_writer.add_scalar("charts/episodic_return", item["episode"]["r"], global_step)
                tensorboard_writer.add_scalar("charts/episodic_length", item["episode"]["l"], global_step)
                break

    # Training once per update, after all num_steps steps have been stored.
    # batch_size is num_envs * num_steps and num_updates is total_timesteps / batch_size,
    # so one training call per outer iteration matches the configured schedule.
    training_stats = agent.train(
        memory.observations,
        memory.actions,
        memory.log_probs,
        memory.rewards,
        memory.values,
        memory.dones,
        batch_size
    )

    # Computing throughput once so the printed and the logged value agree
    steps_per_second = int(global_step / (time.time() - start_time))
    print("SPS:", steps_per_second)

    tensorboard_writer.add_scalar("charts/learning_rate", training_stats.learning_rate, global_step)
    tensorboard_writer.add_scalar("losses/value_loss", training_stats.value_loss, global_step)
    tensorboard_writer.add_scalar("losses/policy_loss", training_stats.policy_loss, global_step)
    tensorboard_writer.add_scalar("losses/entropy_loss", training_stats.entropy_loss, global_step)
    tensorboard_writer.add_scalar("charts/old_approx_kl", training_stats.old_approx_kl, global_step)
    tensorboard_writer.add_scalar("charts/approx_kl", training_stats.approx_kl, global_step)
    tensorboard_writer.add_scalar("charts/clip_fraction", training_stats.clip_fraction, global_step)
    tensorboard_writer.add_scalar("charts/explained_variance", training_stats.explained_variance, global_step)
    tensorboard_writer.add_scalar("charts/SPS", steps_per_second, global_step)


creating actor network...
Metal device set to: Apple M1 Max

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

successfully created actor network!
creating critic network...




successfully created critic network!
creating optimizer for actor network...
successfully created optimizer for actor network!
creating optimizer for critic network...
successfully created optimizer for critic network!


In [19]:
# Releasing resources in order: the environments first, then the TensorBoard writer
for closable in (envs, tensorboard_writer):
    closable.close()