In [1]:
import copy
import glob
import os
import time
from collections import deque

import numpy as np
import torch

import algo

#from arguments import get_args
from envs import make_vec_envs
from models import create_policy
from rollout_storage import RolloutStorage


In [2]:
# ---------------------------------------------------------------------------
# Run configuration shared by train() and render().
# ---------------------------------------------------------------------------
arg_algo = 'a2c'  # train() accepts 'a2c' or 'ppo'
load = True       # when True, train() starts from a previously saved checkpoint

arg_num_steps = 10      # environment steps collected per process before each update
arg_num_processes = 1   # number of parallel training environments
arg_num_frames = 5e7    # total frame budget; train() derives the update count from it
arg_lr_schedule = 500000  # divided by (num_steps * num_processes) and passed on as lr_schedule
arg_seed = 1
arg_cuda = True         # use "cuda:0" when True, otherwise "cpu"

arg_log_dir = '/tmp/gym/'
# Relative, so checkpoints written by train() land under the same
# ./trained_models/<algo>/ directory that arg_load_path reads from.
arg_save_dir = './trained_models/'

arg_load_path = './trained_models/a2c/PommeFFACompetitionFast-v0.pt'

arg_env_name = 'PommeFFACompetitionFast-v0'

arg_lr = 2.5e-4
arg_eps = 1e-5
arg_alpha = 0.99
arg_gamma = 0.9
arg_tau = 0.95
arg_no_norm = True
arg_num_stack = 1
arg_clip_param = 0.2

# Only read by the 'ppo' branch of train(), which passes them to algo.OUR_PPO.
# NOTE(review): values are placeholders chosen so the mini-batch count does not
# exceed arg_num_steps * arg_num_processes - TODO confirm against OUR_PPO and tune.
arg_ppo_epoch = 4
arg_num_mini_batch = 2

arg_value_loss_coef = 0.5
arg_loss_coef = 0.5
arg_entropy_coef = 0.01

arg_use_gae = True

arg_max_grad_norm = 0.5
arg_log_interval = 10     # print/log training stats every N updates
arg_save_interval = 100   # save a checkpoint every N updates
arg_eval_interval = 1000  # run evaluation every N updates (falsy disables evaluation)

In [3]:
def _reset_monitor_dir(log_dir):
    """Create ``log_dir``; if that fails, delete any ``*.monitor.csv`` files inside it.

    Deletion is best-effort: a file that cannot be removed is skipped so the
    remaining ones are still cleaned up.
    """
    try:
        os.makedirs(log_dir)
    except OSError:
        # Typically the directory already exists from an earlier run.
        for stale_file in glob.glob(os.path.join(log_dir, '*.monitor.csv')):
            try:
                os.remove(stale_file)
            except OSError:
                pass


def train():
    """Train the actor-critic policy selected by ``arg_algo`` using the ``arg_*`` settings.

    The function seeds torch/numpy, prepares the log directories, builds the
    training (and optionally evaluation) environments, optionally restores a
    checkpoint, and then alternates between collecting ``arg_num_steps`` steps
    of experience and calling ``agent.update``. Along the way it periodically
    saves checkpoints, prints/logs training statistics, and evaluates the policy.

    Side effects:
        * writes checkpoints to ``<arg_save_dir>/<arg_algo>/<arg_env_name>.pt``
        * appends to ``train_results_<algo>.txt`` and ``eval_results_<algo>.txt``
          in the current working directory

    Raises:
        AssertionError: if ``arg_algo`` is not ``'a2c'`` or ``'ppo'``.
        RuntimeError: if ``load`` is set and the checkpoint cannot be read or
            does not fit the freshly created policy.
    """
    assert arg_algo in ['a2c', 'ppo']

    # Number of environment frames consumed by a single update.
    update_factor = arg_num_steps * arg_num_processes
    num_updates = int(arg_num_frames) // update_factor
    lr_update_schedule = None if arg_lr_schedule is None else arg_lr_schedule // update_factor

    torch.manual_seed(arg_seed)
    if arg_cuda:
        torch.cuda.manual_seed(arg_seed)
    np.random.seed(arg_seed)

    eval_log_dir = arg_log_dir + "_eval"
    _reset_monitor_dir(arg_log_dir)
    _reset_monitor_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if arg_cuda else "cpu")

    train_envs = make_vec_envs(
        arg_env_name, arg_seed, arg_num_processes, arg_gamma, arg_no_norm, arg_num_stack,
        arg_log_dir, device, allow_early_resets=False)

    if arg_eval_interval:
        eval_envs = make_vec_envs(
            arg_env_name, arg_seed + arg_num_processes, arg_num_processes, arg_gamma,
            arg_no_norm, arg_num_stack, eval_log_dir, device,
            allow_early_resets=True, eval=True)

        # Share the training observation statistics with the evaluation envs.
        if eval_envs.venv.__class__.__name__ == "VecNormalize":
            eval_envs.venv.ob_rms = train_envs.venv.ob_rms
    else:
        eval_envs = None

    actor_critic = create_policy(
        train_envs.observation_space,
        nn_kwargs={
            'batch_norm': True,
            'hidden_size': 512,
        },
        train=True)
    if arg_load_path and load:
        print("Loading in previous model")
        # NOTE(review): the checkpoint location is hard-coded per algorithm and
        # does not follow arg_load_path - confirm whether that is intended.
        path_ = "./trained_models/a2c/PommeFFACompetitionFast-v0.pt"
        if arg_algo.startswith('ppo'):
            path_ = "./trained_models/ppo/PommeFFACompetitionFast-v0.pt"
        try:
            state_dict, _ob_rms = torch.load(path_)
            actor_critic.load_state_dict(state_dict)
        except Exception as err:
            # Surface the real cause (missing file, incompatible weights, ...)
            # instead of hiding it and terminating the interpreter.
            raise RuntimeError(
                "Could not load checkpoint {!r}".format(path_)) from err
    actor_critic.to(device)

    if arg_algo.startswith('a2c'):
        agent = algo.A2C(
            actor_critic, arg_value_loss_coef,
            arg_entropy_coef,
            lr=arg_lr, lr_schedule=lr_update_schedule,
            eps=arg_eps, alpha=arg_alpha,
            max_grad_norm=arg_max_grad_norm)
    elif arg_algo.startswith('ppo'):
        agent = algo.OUR_PPO(
            actor_critic, arg_clip_param, arg_ppo_epoch, arg_num_mini_batch,
            arg_value_loss_coef, arg_entropy_coef,
            lr=arg_lr, lr_schedule=lr_update_schedule,
            eps=arg_eps,
            max_grad_norm=arg_max_grad_norm)

    rollouts = RolloutStorage(
        arg_num_steps, arg_num_processes,
        train_envs.observation_space.shape,
        train_envs.action_space)

    obs = train_envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Rolling window over the most recently finished training episodes.
    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(arg_num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = train_envs.step(action)

            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])

            # A 0.0 mask marks an environment whose episode just ended.
            masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done], device=device)
            rollouts.insert(obs, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, arg_use_gae, arg_gamma, arg_tau)

        value_loss, action_loss, dist_entropy, other_metrics = agent.update(rollouts, j)

        rollouts.after_update()

        if j % arg_save_interval == 0 and arg_save_dir != "":
            save_path = os.path.join(arg_save_dir, arg_algo)
            os.makedirs(save_path, exist_ok=True)

            # Save a CPU copy so the live (possibly CUDA) model is left untouched.
            save_model = actor_critic
            if arg_cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model.state_dict(),
                          getattr(train_envs.venv, 'ob_rms', None) or None]

            torch.save(save_model, os.path.join(save_path, arg_env_name + ".pt"))

        total_num_steps = (j + 1) * update_factor

        if j % arg_log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            # One placeholder per argument: entropy, value loss and action loss
            # each get their own correctly labelled slot.
            print("Updates {}, num timesteps {}, FPS {}, last {} mean/median reward {:.1f}/{:.1f}, "
                  "min / max reward {:.1f}/{:.1f}, entropy {:.5f}, value/action loss {:.5f}/{:.5f}".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         len(episode_rewards),
                         np.mean(episode_rewards),
                         np.median(episode_rewards),
                         np.min(episode_rewards),
                         np.max(episode_rewards), dist_entropy,
                         value_loss, action_loss), end=', ' if other_metrics else '\n')
            if other_metrics:
                # Finish the line started above with the algorithm's extra metrics.
                print(other_metrics)
            with open("train_results_" + arg_algo + ".txt", "a+") as res_file:
                to_print = "{},{}\n".format(total_num_steps, np.mean(episode_rewards))
                res_file.write(to_print)

        if arg_eval_interval and len(episode_rewards) > 1 and j > 0 and j % arg_eval_interval == 0:
            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_masks = torch.zeros(arg_num_processes, 1, device=device)

            # Keep stepping until at least 50 evaluation episodes have finished.
            while len(eval_episode_rewards) < 50:
                with torch.no_grad():
                    _, action, _ = actor_critic.act(
                        obs, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done], device=device)
                for info in infos:
                    if 'episode' in info:
                        eval_episode_rewards.append(info['episode']['r'])
            with open("eval_results_" + arg_algo + ".txt", "a+") as res_file:
                to_print = "{},{}\n".format(total_num_steps, np.mean(eval_episode_rewards))
                res_file.write(to_print)
            print("Evaluation using {} episodes: mean reward {:.5f}\n".format(len(eval_episode_rewards),
                                                                              np.mean(eval_episode_rewards)))



In [4]:
# To train the model, uncomment the call below:
# train()

In [5]:
import argparse
import torch
from models.factory import create_policy
from envs import make_vec_envs

import SearchAgent_2 as SearchAgent
import pommerman
from pommerman import agents

In [6]:
# When True, render() discards its render function and plays games without drawing them.
arg_hide = False

In [7]:
def render(num_episodes=None):
    """Play games with the policy stored at ``arg_load_path`` and print running statistics.

    A single evaluation environment is created, the checkpoint is loaded into a
    freshly built policy, and the policy is stepped deterministically. After every
    finished game the win rate and survival rate so far are printed. Frames are
    drawn with the environment's ``render`` function unless ``arg_hide`` is set
    or no render function can be found.

    Args:
        num_episodes: number of finished games after which to stop. ``None``
            (the default) keeps playing until the caller interrupts execution.

    Returns:
        dict with keys ``'rewards'`` (final reward of each game), ``'wins'`` and
        ``'deaths'``; only reached when ``num_episodes`` is given.
    """
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if arg_cuda else "cpu")

    num_env = 1
    env = make_vec_envs(arg_env_name, arg_seed + 1000,
                        num_env, gamma=None, no_norm=arg_no_norm,
                        num_stack=arg_num_stack, log_dir=None,
                        device=device, eval=True, allow_early_resets=False)

    # Walk down the wrapper chain until something exposes the raw env list.
    render_func = None
    tmp_env = env
    while True:
        if hasattr(tmp_env, 'envs'):
            render_func = tmp_env.envs[0].render
            break
        elif hasattr(tmp_env, 'venv'):
            tmp_env = tmp_env.venv
        elif hasattr(tmp_env, 'env'):
            tmp_env = tmp_env.env
        else:
            break

    # We need to use the same statistics for normalization as used in training,
    # so hand the saved ones to the normalizing wrapper (same check train() uses
    # for its evaluation environments).
    state_dict, ob_rms = torch.load(arg_load_path)
    wrapped_env = getattr(env, 'venv', None)
    if ob_rms is not None and wrapped_env.__class__.__name__ == "VecNormalize":
        wrapped_env.ob_rms = ob_rms

    actor_critic = create_policy(
        env.observation_space,
        nn_kwargs={
            'batch_norm': True,
            'hidden_size': 512,
        },
        train=False)

    actor_critic.load_state_dict(state_dict)
    actor_critic.to(device)

    masks = torch.zeros(num_env, 1).to(device)

    obs = env.reset()

    if arg_hide:
        render_func = None

    if render_func is not None:
        render_func('human')

    is_bullet_env = 'Bullet' in arg_env_name
    if is_bullet_env:
        import pybullet as p

        # Remember the id of the last body named "torso" so the camera can follow it.
        torsoId = -1
        for i in range(p.getNumBodies()):
            if p.getBodyInfo(i)[0].decode() == "torso":
                torsoId = i

    rewards = []
    wins = 0
    deaths = 0

    # NOTE(review): 800 is presumably the environment's step limit, so a negative
    # reward at or before it is counted as a death rather than a timeout - confirm.
    step_limit = 800
    step = 0

    while num_episodes is None or len(rewards) < num_episodes:
        step = step + 1
        with torch.no_grad():
            _, action, _ = actor_critic.act(
                obs, masks, deterministic=True)

        obs, reward, done, _ = env.step(action)

        masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done], device=device)

        if is_bullet_env:
            if torsoId > -1:
                distance = 5
                yaw = 0
                humanPos, humanOrn = p.getBasePositionAndOrientation(torsoId)
                p.resetDebugVisualizerCamera(distance, yaw, -20, humanPos)

        for i, d in enumerate(done):
            if d:
                final_reward = reward[i].item()
                rewards.append(final_reward)
                if final_reward > 0:
                    wins = wins + 1
                if final_reward < 0 and step <= step_limit:
                    deaths = deaths + 1
                print("Game ended in {} steps, total games played: {}. Win rate: {}. Survival rate {}".format(step-1, len(rewards), float(wins) / len(rewards), 1.0-float(deaths)/len(rewards)))
                step = 0

        if render_func is not None:
            render_func('human')

    return {'rewards': rewards, 'wins': wins, 'deaths': deaths}


In [8]:
# Play games with the saved policy; this call loops until the kernel is interrupted.
render()

Create env, our agent starts at:  1
Game ended in 245 steps, total games played: 1. Win rate: 1.0. Survival rate 1.0


KeyboardInterrupt: 