In [None]:
import numpy as np
import torch
from tqdm import tqdm
from procgen import ProcgenEnv

from utils.envs import make_ProcgenEnvs

from agent.discrete_ppo import PPO
from agent.models import ImpalaModel, CNNBase

In [None]:
# Restrict PyTorch's CPU intra-op parallelism to a single thread.
torch.set_num_threads(1)

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
print('making envs...')

# Level-range configuration. The training envs use `num_levels` levels starting
# at `start_level`; the test envs start right after that range, so the two
# level ranges do not overlap.
start_level = 0
num_levels = 200
num_test_levels = 200
num_envs=32

# Keyword arguments that are identical for the training and the test envs.
common_env_kwargs = dict(
    env_name='coinrun',
    distribution_mode='easy',
    use_generated_assets=False,
    use_backgrounds=True,
    restrict_themes=False,
    use_monochrome_assets=False,
    rand_seed=0,
    device=device,
)

# Vectorised training environments (`num_envs` copies stepped together).
train_envs = make_ProcgenEnvs(
    num_envs=num_envs,
    start_level=start_level,
    num_levels=num_levels,
    **common_env_kwargs,
)

# A single evaluation environment over the held-out level range.
test_envs = make_ProcgenEnvs(
    num_envs=1,
    start_level=start_level + num_levels,
    num_levels=num_test_levels,
    **common_env_kwargs,
)

In [None]:
# obs = train_envs.reset()
# print(train_envs.observation_space.shape)

In [None]:
def evaluate_agent(env, agent, num_levels=200, max_ep_len=1000):
    """Roll out ``agent`` in ``env`` and return the mean reward per episode.

    Runs ``num_levels`` episodes. Each episode starts with ``env.reset()`` and
    ends when the environment reports ``done`` or after ``max_ep_len`` steps,
    whichever comes first.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns ``(state, reward, done, info)``. ``done`` is used
            directly in an ``if``, so it must be truth-testable (e.g. a
            single-environment vector env).
        agent: Object exposing ``select_action(state)`` that returns a 3-tuple
            whose first element is a torch tensor holding the action.
        num_levels: Number of evaluation episodes to run.
        max_ep_len: Upper bound on the number of steps per episode.

    Returns:
        The accumulated reward divided by ``num_levels``. The result has the
        same array/tensor type as the rewards returned by ``env.step``.
    """
    sum_reward = 0

    # Evaluation never backpropagates, so autograd is disabled for the whole
    # rollout (matching how the training loop calls select_action). This avoids
    # building computation graphs for every evaluation step.
    with torch.no_grad():
        for _ in range(num_levels):
            state = env.reset()

            for _step in range(max_ep_len):
                action, _, _ = agent.select_action(state)
                state, reward, done, _ = env.step(action.detach().cpu().numpy())

                sum_reward += reward

                if done:
                    break

    return sum_reward / num_levels

In [None]:
# Build the PPO agent used for both training and evaluation.
# NOTE(review): the literal 512 in `mini_batch_size` presumably mirrors
# `update_timestep` in the training cell (rollout length per update), which
# would give 8 mini-batches per rollout -- confirm against the PPO implementation.
ppo_agent = PPO(
    state_dim=(num_envs, 3, 64, 64),
    action_dim=15,
    actor_critic_model=ImpalaModel,
    lr=5e-4,
    gamma=0.99,
    K_epochs=3,
    eps_clip=0.2,
    use_gae=True,
    gae_lambda=0.95,
    mini_batch_size=512 * num_envs // 8,
    use_clipped_value_loss=True,
    device=device,
)

In [None]:
# --- Training schedule -------------------------------------------------------
# `time_step` counts vectorised steps; each one advances all `num_envs`
# environments, so `time_step * num_envs` is the number of transitions collected.
num_steps = 25_000_000   # total transitions to collect across all training envs
update_timestep = 512    # vectorised steps between PPO updates
summary_freq = 10_000    # vectorised steps between evaluation / logging

time_step = 0
running_reward = 0
# NOTE(review): starts at `num_envs` but is reset to 0 after each summary --
# presumably to avoid an empty first window; confirm this is intended.
running_episodes = num_envs

state = train_envs.reset()
while time_step * num_envs < num_steps:

    # select action with policy (no gradients needed while collecting rollouts)
    with torch.no_grad():
        action, action_logprob, state_val = ppo_agent.select_action(state)
    next_state, reward, done, info = train_envs.step(action.detach().cpu().numpy())

    # store the transition in the agent's rollout buffer
    ppo_agent.buffer.insert(state, action, action_logprob, reward, next_state, state_val, done)

    state = next_state
    time_step += 1
    # accumulate statistics for the "Average Train Reward" summary below
    running_reward += reward.sum()
    running_episodes += done.sum()

    # update PPO agent
    if time_step % update_timestep == 0:
        ppo_agent.update()

    # periodically evaluate on the test envs and report train/test reward
    if time_step % summary_freq == 0:
        test_ave_reward = evaluate_agent(test_envs, ppo_agent, num_levels=num_test_levels)
        print(f"Timestep: {time_step * num_envs}, \t\tAverage Train Reward: {running_reward / running_episodes: .2f}, \t\tAverage Test Reward: {test_ave_reward.item(): .2f}")

        # start a fresh statistics window
        running_reward = 0
        running_episodes = 0

    # save model weights
    # if time_step % save_model_freq == 0:
    #     print("--------------------------------------------------------------------------------------------")
    #     print("saving model at : " + checkpoint_path)
    #     ppo_agent.save(checkpoint_path)
    #     print("model saved")
    #     print("Elapsed Time  : ", datetime.now().replace(microsecond=0) - start_time)
    #     print("--------------------------------------------------------------------------------------------")

# Release both environment sets; previously only the training envs were closed,
# leaving the evaluation env open.
train_envs.close()
test_envs.close()