#### PPO Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from gym.spaces import Box, Discrete

##### Model construction #####
def mlp(odim=24, hdims=[256,256], actv=nn.ReLU(), output_actv=None):
    """
    Build a fully connected network as an ``nn.Sequential``.

    Args:
        odim: Size of the input features.
        hdims: Layer widths; every entry but the last is followed by ``actv``,
            and the last entry is the output width.
        actv: Activation module inserted after each non-final linear layer
            (the same module instance is reused for every layer).
        output_actv: Optional module appended after the final linear layer.

    Returns:
        nn.Sequential made of the linear layers and activations, in order.
    """
    # widths[i] -> widths[i + 1] describes the i-th linear layer.
    widths = [odim] + list(hdims)
    modules = []
    for in_dim, out_dim in zip(widths[:-2], widths[1:-1]):
        modules += [nn.Linear(in_dim, out_dim, bias=True), actv]
    # Final layer: no hidden activation, only the optional output activation.
    modules.append(nn.Linear(widths[-2], widths[-1]))
    if output_actv is not None:
        modules.append(output_actv)
    return nn.Sequential(*modules)

class CategoricalPolicy(nn.Module):
    """
    Policy over discrete actions.

    A feature network built by ``mlp`` feeds a linear layer that produces one
    logit per action; actions are sampled from the resulting categorical
    distribution.
    """
    def __init__(self, odim, adim, hdims=[64,64], actv=nn.ReLU(), output_actv=None):
        """
        Args:
            odim: Size of the observation vector.
            adim: Number of discrete actions (width of the logits layer).
            hdims: Layer widths of the feature network.
            actv: Activation used between feature layers.
            output_actv: Optional activation applied to the feature network
                output and, again, to the logits.
        """
        super(CategoricalPolicy, self).__init__()
        self.output_actv = output_actv
        self.net = mlp(odim, hdims=hdims, actv=actv, output_actv=output_actv)
        self.logits = nn.Linear(in_features=hdims[-1], out_features=adim)
    def forward(self, x, a=None):
        """
        Sample an action and evaluate log-probabilities.

        Args:
            x: Batch of observations.
            a: Optional batch of actions whose log-probability is wanted.

        Returns:
            Tuple ``(pi, logp, logp_pi, pi)``: the sampled action, the
            log-probability of ``a`` (``None`` when ``a`` is not given), the
            log-probability of the sampled action, and the sampled action
            again in the slot where GaussianPolicy returns its mean.
        """
        output = self.net(x)
        logits = self.logits(output)
        if self.output_actv is not None:
            logits = self.output_actv(logits)
        prob = F.softmax(logits, dim=-1)
        dist = Categorical(probs=prob)
        pi = dist.sample()
        logp_pi = dist.log_prob(pi)
        # `a` is optional (rollouts only sample); evaluating log_prob(None)
        # would raise, so mirror GaussianPolicy and return None instead.
        logp = dist.log_prob(a) if a is not None else None
        return pi, logp, logp_pi, pi

class GaussianPolicy(nn.Module):    # counterpart of mlp_gaussian_policy
    """
    Diagonal Gaussian policy for continuous actions.

    The mean comes from an MLP; the log standard deviation is a learnable
    parameter vector that does not depend on the observation.
    """
    def __init__(self, odim, adim, hdims=[64,64], actv=nn.ReLU(), output_actv=None):
        """
        Args:
            odim: Size of the observation vector.
            adim: Size of the action vector.
            hdims: Hidden layer widths of the mean network.
            actv: Activation used between hidden layers.
            output_actv: Optional activation applied to the mean output.
        """
        super().__init__()
        self.output_actv = output_actv
        # The action dimension is appended as the final layer width.
        mean_layer_sizes = hdims + [adim]
        self.mu = mlp(odim, hdims=mean_layer_sizes, actv=actv, output_actv=output_actv)
        # Initial log-std of -0.5 for every action dimension.
        self.log_std = nn.Parameter(-0.5*torch.ones(adim))
    def forward(self, x, a=None):
        """
        Sample an action and evaluate log-likelihoods.

        Args:
            x: Batch of observations (log-probs are summed over dim 1, so the
                input is expected to be 2-D).
            a: Optional batch of actions to score under the current policy.

        Returns:
            Tuple ``(pi, logp, logp_pi, mu)``; ``logp`` is ``None`` when ``a``
            is not supplied.
        """
        mean = self.mu(x)
        gaussian = Normal(mean, self.log_std.exp())
        sampled = gaussian.sample()
        # Gaussian likelihood: sum the per-dimension log-probs of each sample.
        logp_sampled = gaussian.log_prob(sampled).sum(dim=1)
        logp_given = None if a is None else gaussian.log_prob(a).sum(dim=1)
        # Return order is kept aligned with what ActorCritic unpacks.
        return sampled, logp_given, logp_sampled, mean

class ActorCritic(nn.Module):   # counterpart of mlp_actor_critic
    """
    Bundles a policy network and a value network that share the observation
    input.
    """
    def __init__(self, odim, adim, hdims=[64,64], actv=nn.ReLU(),
                 output_actv=None, policy=None, action_space=None):
        """
        Args:
            odim: Size of the observation vector.
            adim: Size of the action vector (or number of discrete actions).
            hdims: Hidden layer widths for both policy and value networks.
            actv: Activation used between hidden layers.
            output_actv: Optional activation applied to network outputs.
            policy: Optional ready-made policy module. When given it is used
                as-is; otherwise the policy is chosen from ``action_space``.
            action_space: ``Box`` selects GaussianPolicy, ``Discrete`` selects
                CategoricalPolicy.

        Raises:
            ValueError: If no policy is given and ``action_space`` is neither
                ``Box`` nor ``Discrete``.
        """
        super(ActorCritic,self).__init__()
        if policy is not None:
            # Previously a supplied policy was silently dropped, leaving
            # self.policy undefined.
            self.policy = policy
        elif isinstance(action_space, Box):
            self.policy = GaussianPolicy(odim, adim, hdims, actv, output_actv)
        elif isinstance(action_space, Discrete):
            self.policy = CategoricalPolicy(odim, adim, hdims, actv, output_actv)
        else:
            # Fail at construction time instead of with an AttributeError
            # on the first forward pass.
            raise ValueError(
                "ActorCritic needs either a policy or a Box/Discrete "
                "action_space, got action_space=%r" % (action_space,))
        # Value network: same hidden widths, single scalar output.
        self.vf_mlp = mlp(odim, hdims=hdims+[1],
                          actv=actv, output_actv=output_actv)
    def forward(self, x, a=None):
        """
        Run policy and value networks on ``x``.

        Returns:
            Tuple ``(pi, logp, logp_pi, v, mu)`` where the policy outputs come
            from ``self.policy(x, a)`` and ``v`` from the value network.
        """
        pi, logp, logp_pi, mu = self.policy(x, a)
        v = self.vf_mlp(x)
        return pi, logp, logp_pi, v, mu

#### PPO Class

In [None]:
import gym,pybullet_envs,time, psutil, torch

# Report the installed PyTorch version when this cell is executed.
print("Pytorch version:[%s]."%(torch.__version__))
# Device selection is currently disabled; nothing in this cell moves tensors to a device.
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#print("device:[%s]."%(device))

class PPOAgent():
    """
    Single-process PPO trainer.

    Collects ``steps_per_epoch`` transitions per epoch from the training
    environment, runs the clipped-objective policy update and the value
    regression, and periodically evaluates on a separate rendered environment.

    NOTE(review): ``Config`` is not defined in the visible part of this file;
    it must provide hdims, steps_per_epoch, gamma, lam, pi_lr, vf_lr,
    train_pi_iters, train_v_iters, clip_ratio, target_kl, epochs, max_ep_len,
    print_every and evaluate_every. The agent reads ``action_space.shape[0]``,
    so it assumes a Box (continuous) action space.
    """
    def __init__(self):
        self.config = Config()
        self.env, self.eval_env = get_envs()
        odim = self.env.observation_space.shape[0]
        adim = self.env.action_space.shape[0]

        # Actor-critic model
        ac_kwargs = dict()
        ac_kwargs['action_space'] = self.env.action_space
        self.actor_critic = ActorCritic(odim, adim, self.config.hdims,**ac_kwargs)
        self.buf = PPOBuffer(odim=odim,adim=adim,size=self.config.steps_per_epoch,
                             gamma=self.config.gamma,lam=self.config.lam)

        # Separate optimizers for the policy and the value function
        self.train_pi = torch.optim.Adam(self.actor_critic.policy.parameters(), lr=self.config.pi_lr)
        self.train_v = torch.optim.Adam(self.actor_critic.vf_mlp.parameters(), lr=self.config.vf_lr)

    def update_ppo(self):
        """Run one PPO update (policy, then value) on the full buffer."""
        self.actor_critic.train()

        # One conversion is enough; the buffer arrays are copied into tensors here.
        obs, act, adv, ret, logp_a_old = [torch.Tensor(x) for x in self.buf.get()]

        # Policy gradient step
        for i in range(self.config.train_pi_iters):
            # policy returns (pi, logp, logp_pi, mu)
            _, logp_a, _, _ = self.actor_critic.policy(obs, act)

            # PPO clipped surrogate objective
            ratio = (logp_a - logp_a_old).exp()
            min_adv = torch.where(adv > 0, (1 + self.config.clip_ratio) * adv,
                                  (1 - self.config.clip_ratio) * adv)
            pi_loss = -(torch.min(ratio * adv, min_adv)).mean()

            self.train_pi.zero_grad()
            pi_loss.backward()
            self.train_pi.step()

            # Sample estimate of the KL divergence; stop early when the
            # policy has moved too far from the data-collecting policy.
            kl = torch.mean(logp_a_old - logp_a)
            if kl > 1.5 * self.config.target_kl:
                break

        # Value gradient step
        for _ in range(self.config.train_v_iters):
            v = self.actor_critic.vf_mlp(obs).squeeze()
            v_loss = F.mse_loss(v, ret)

            self.train_v.zero_grad()
            v_loss.backward()
            self.train_v.step()

    def main(self):
        """Train for ``config.epochs`` epochs, evaluating periodically."""
        start_time = time.time()
        o, r, d, ep_ret, ep_len, n_env_step = self.env.reset(), 0, False, 0, 0, 0

        self.actor_critic.eval()

        # Main loop: collect experience in env and update/log each epoch
        for epoch in range(self.config.epochs):
            if (epoch == 0) or (((epoch + 1) % self.config.print_every) == 0):
                print("[%d/%d]" % (epoch + 1, self.config.epochs))
            # Rollouts need no gradients; the update recomputes log-probs and
            # values from the stored observations.
            with torch.no_grad():
                for t in range(self.config.steps_per_epoch):
                    # actor_critic returns (pi, logp, logp_pi, v, mu)
                    a, _, logp_t, v_t, _ = self.actor_critic(
                        torch.Tensor(o.reshape(1, -1)))
                    act = a.numpy()[0]

                    o2, r, d, _ = self.env.step(act)
                    ep_ret += r
                    ep_len += 1
                    n_env_step += 1

                    # Store plain NumPy/float values, not tensors, in the
                    # NumPy-backed buffer.
                    self.buf.store(o, act, r, v_t.item(), logp_t.item())

                    # Update obs (critical!)
                    o = o2

                    terminal = d or (ep_len == self.config.max_ep_len)
                    if terminal or (t == (self.config.steps_per_epoch - 1)):
                        # if trajectory didn't reach terminal state, bootstrap value target
                        last_val = 0 if d else self.actor_critic.vf_mlp(
                            torch.Tensor(o.reshape(1, -1))).item()
                        self.buf.finish_path(last_val)
                        o, ep_ret, ep_len = self.env.reset(), 0, 0

            # Perform PPO update!
            self.update_ppo()

            # Evaluate
            self.actor_critic.eval()
            if (epoch == 0) or (((epoch + 1) % self.config.evaluate_every) == 0):
                ram_percent = psutil.virtual_memory().percent  # memory usage
                print("[Eval. start] step:[%d/%d][%.1f%%] #step:[%.1e] time:[%s] ram:[%.1f%%]." %
                      (epoch + 1, self.config.epochs, epoch / self.config.epochs * 100,
                       n_env_step,
                       time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)),
                       ram_percent)
                      )
                # Evaluation keeps its own variables: reusing o/d/ep_ret/ep_len
                # here would clobber the training state, so the next epoch
                # would start from an eval_env observation.
                eval_o, eval_d, eval_ret, eval_len = self.eval_env.reset(), False, 0, 0
                _ = self.eval_env.render(mode='human')
                with torch.no_grad():
                    while not (eval_d or (eval_len == self.config.max_ep_len)):
                        a, _, _, _ = self.actor_critic.policy(
                            torch.Tensor(eval_o.reshape(1, -1)))
                        eval_o, eval_r, eval_d, _ = self.eval_env.step(a.numpy()[0])
                        _ = self.eval_env.render(mode='human')
                        eval_ret += eval_r  # compute return
                        eval_len += 1
                print("[Evaluate] ep_ret:[%.4f] ep_len:[%d]" % (eval_ret, eval_len))

        print("Done.")

        self.env.close()
        self.eval_env.close()

    def test(self):
        """Run and render a single evaluation episode on a fresh environment."""
        gym.logger.set_level(40)
        _, eval_env = get_envs()
        o, d, ep_ret, ep_len = eval_env.reset(), False, 0, 0
        _ = eval_env.render(mode='human')
        with torch.no_grad():
            while not (d or (ep_len == self.config.max_ep_len)):
                a, _, _, _ = self.actor_critic.policy(torch.Tensor(o.reshape(1, -1)))
                o, r, d, _ = eval_env.step(a.numpy()[0])
                _ = eval_env.render(mode='human')
                ep_ret += r  # compute return
                ep_len += 1
        print("[Evaluate] ep_ret:[%.4f] ep_len:[%d]"
              % (ep_ret, ep_len))
        eval_env.close()  # close env


def get_envs():
    """
    Create the training and evaluation AntBullet environments.

    The evaluation environment has human rendering enabled and is stepped a
    few times with random actions so the renderer is set up properly.

    Returns:
        Tuple ``(env, eval_env)``.
    """
    env_name = 'AntBulletEnv-v0'
    env = gym.make(env_name)
    eval_env = gym.make(env_name)
    eval_env.render(mode='human')  # enable rendering on the evaluation env
    eval_env.reset()
    warmup_steps = 3  # dummy run for proper rendering
    for _ in range(warmup_steps):
        random_action = eval_env.action_space.sample()
        eval_env.step(random_action)
        time.sleep(0.01)
    return env, eval_env

#### ReplayBuffer

In [None]:
import numpy as np
import scipy.signal

def combined_shape(length, shape=None):
    """
    Prepend ``length`` to ``shape`` to form an array shape tuple.

    ``shape`` may be omitted (result is ``(length,)``), a scalar (result is
    ``(length, shape)``) or an iterable of dimensions (result is
    ``(length, *shape)``).
    """
    if shape is None:
        return (length,)
    if np.isscalar(shape):
        return (length, shape)
    return (length, *shape)

def statistics_scalar(x, with_min_and_max=False):
    """
    Compute the mean and (population) standard deviation of the samples in x.

    Args:
        x: An array containing samples of the scalar to produce statistics for.
        with_min_and_max (bool): If true, also return the min and max of x
            (``inf`` / ``-inf`` when x is empty).

    Returns:
        ``(mean, std)`` or ``(mean, std, min, max)``.
    """
    samples = np.array(x, dtype=np.float32)
    count = len(samples)
    total = np.sum(samples)
    mean = total / count
    squared_dev_sum = np.sum((samples - mean)**2)
    std = np.sqrt(squared_dev_sum / count)  # divides by N, not N - 1
    if not with_min_and_max:
        return mean, std
    smallest = np.min(samples) if count > 0 else np.inf
    largest = np.max(samples) if count > 0 else -np.inf
    return mean, std, smallest, largest

def discount_cumsum(x, discount):
    """
    Discounted cumulative sum along axis 0.

    For input ``[x0, x1, x2]`` the output is::

        [x0 + discount * x1 + discount^2 * x2,
         x1 + discount * x2,
         x2]
    """
    # Filtering the reversed sequence with y[n] = x[n] + discount * y[n-1]
    # accumulates from the end; reversing again restores the original order.
    reversed_input = x[::-1]
    denominator = [1, float(-discount)]
    accumulated = scipy.signal.lfilter([1], denominator, reversed_input, axis=0)
    return accumulated[::-1]


class PPOBuffer:
    """
    A buffer for storing trajectories experienced by a PPO agent interacting
    with the environment, and using Generalized Advantage Estimation (GAE-Lambda)
    for calculating the advantages of state-action pairs.
    """

    def __init__(self, odim, adim, size=5000, gamma=0.99, lam=0.95):
        """
        Args:
            odim: Observation dimension (scalar or shape).
            adim: Action dimension (scalar or shape).
            size: Number of timesteps the buffer holds; ``get`` requires it
                to be completely filled.
            gamma: Discount factor.
            lam: GAE-Lambda parameter.
        """
        self.obs_buf = np.zeros(combined_shape(size, odim), dtype=np.float32)
        self.act_buf = np.zeros(combined_shape(size, adim), dtype=np.float32)
        # Allocated but not read or written by any method of this class.
        self.act_old_buf = np.zeros(combined_shape(size, adim), dtype=np.float32)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        # ptr: next write index; path_start_idx: first index of the
        # trajectory currently being recorded.
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp):
        """
        Append one timestep of agent-environment interaction to the buffer.
        """
        assert self.ptr < self.max_size  # buffer has to have room so you can store
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew

        self.val_buf[self.ptr] = val
        self.logp_buf[self.ptr] = logp
        self.ptr += 1

    def finish_path(self, last_val=0):
        """
        Close the current trajectory and fill in its advantages and returns.

        Args:
            last_val: Value used to bootstrap beyond the last stored step;
                0 for a terminal state, otherwise the value estimate of the
                state the trajectory was cut off at.
        """
        if self.path_start_idx == self.ptr:
            # Nothing was stored since the last finished path, e.g. the
            # caller finishes again right after an episode ended.
            return
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)

        # the next two lines implement GAE-Lambda advantage calculation
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam)

        # the next line computes rewards-to-go, to be targets for the value function
        self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1]
        self.path_start_idx = self.ptr

    def get(self):
        """
        Return all stored data with normalized advantages and reset the buffer.

        Returns:
            ``[obs_buf, act_buf, adv_buf, ret_buf, logp_buf]``. The arrays
            other than the advantages are the buffer's own storage and are
            overwritten by later ``store`` calls.
        """
        assert self.ptr == self.max_size  # buffer has to be full before you can get
        self.ptr, self.path_start_idx = 0, 0
        # advantage normalization trick: zero mean, unit std
        adv_mean, adv_std = statistics_scalar(self.adv_buf)
        centered = self.adv_buf - adv_mean
        # With identical advantages the std is 0; dividing would turn every
        # advantage into NaN, so only center in that case.
        self.adv_buf = centered / adv_std if adv_std > 0 else centered
        return [self.obs_buf, self.act_buf, self.adv_buf,
                self.ret_buf, self.logp_buf]

#### Ray Class

In [None]:

import datetime,gym,os,pybullet_envs,time,os,psutil,ray
import random
import torch

# Rollout Worker
def get_env():
    """Create a fresh AntBullet environment for collecting rollouts."""
    rollout_env = gym.make('AntBulletEnv-v0')
    return rollout_env

def get_eval_env():
    """
    Create the evaluation AntBullet environment.

    Rendering is enabled only when the module-level ``RENDER_ON_EVAL`` flag is
    set. The environment is reset and stepped a few times with random actions
    before being returned.
    """
    eval_env = gym.make('AntBulletEnv-v0')
    if RENDER_ON_EVAL:
        eval_env.render(mode='human')  # enable rendering
    eval_env.reset()
    warmup_steps = 3  # dummy run for proper rendering
    for _ in range(warmup_steps):
        random_action = eval_env.action_space.sample()
        eval_env.step(random_action)
        time.sleep(0.01)
    return eval_env

class RolloutWorkerClass(object):
    """
    Central (non-remote) worker that owns the model being trained and its
    optimizers; its weights are broadcast to the Ray rollout workers.
    """
    def __init__(self, seed=1):
        """
        Args:
            seed: Seed applied to torch, NumPy and ``random`` before the
                model is built.
        """
        self.seed = seed
        self.env = get_env()
        odim, adim = self.env.observation_space.shape[0], self.env.action_space.shape[0]
        self.odim = odim
        self.adim = adim

        # Seed first: the model weights are drawn at construction time, so
        # seeding afterwards (as before) left initialisation unreproducible.
        torch.manual_seed(self.seed)
        np.random.seed(self.seed)
        random.seed(self.seed)

        # Initialize PPO
        # Actor-critic model
        ac_kwargs = dict()
        ac_kwargs['action_space'] = self.env.action_space
        self.model = ActorCritic(odim, adim, hdims, **ac_kwargs)

        # Separate optimizers for the policy and the value function
        self.train_pi = torch.optim.Adam(self.model.policy.parameters(), lr=pi_lr)
        self.train_v = torch.optim.Adam(self.model.vf_mlp.parameters(), lr=vf_lr)

    def get_weights(self):
        """Return the model's ``state_dict``."""
        weight_vals = self.model.state_dict()
        return weight_vals

    def set_weights(self, weight_vals):
        """Load ``weight_vals`` into the model; returns ``load_state_dict``'s result."""
        return self.model.load_state_dict(weight_vals)

@ray.remote
class RayRolloutWorkerClass(object):
    """
    Rollout Worker with RAY.

    Each actor owns its own environment, a copy of the actor-critic model and
    a PPOBuffer sized for exactly one rollout.
    """
    def __init__(self, worker_id=0, ep_len_rollout=1000):
        """
        Args:
            worker_id: Identifier stored on the worker.
            ep_len_rollout: Number of environment steps per ``rollout`` call
                (also the buffer size).
        """
        # Parse
        self.worker_id = worker_id
        self.ep_len_rollout = ep_len_rollout

        # Each worker should maintain its own environment
        import pybullet_envs, gym
        gym.logger.set_level(40)  # gym logger
        self.env = get_env()
        odim, adim = self.env.observation_space.shape[0], self.env.action_space.shape[0]
        self.odim = odim
        self.adim = adim

        # Per-rollout scratch arrays; not used by the methods of this class,
        # kept as attributes for compatibility.
        self.o_buffer = np.zeros((self.ep_len_rollout, self.odim))
        self.a_buffer = np.zeros((self.ep_len_rollout, self.adim))
        self.r_buffer = np.zeros((self.ep_len_rollout))
        self.v_t_buffer = np.zeros((self.ep_len_rollout))
        self.logp_t_buffer = np.zeros((self.ep_len_rollout))

        # Create PPO model
        # Actor-critic model
        ac_kwargs = dict()
        ac_kwargs['action_space'] = self.env.action_space
        self.model = ActorCritic(odim, adim, hdims, **ac_kwargs)
        # Buffer
        self.buf = PPOBuffer(odim=self.odim, adim=self.adim,
                             size=self.ep_len_rollout, gamma=gamma, lam=lam)
        # Flag to initialize rollout
        self.FIRST_ROLLOUT_FLAG = True

    def set_weights(self, weight_vals):
        """Load ``weight_vals`` into the local model copy."""
        return self.model.load_state_dict(weight_vals)

    def rollout(self):
        """
        Collect ``ep_len_rollout`` steps and return the filled buffer.

        The environment state carries over between calls; it is reset only on
        the first call and whenever an episode ends.

        Returns:
            The list returned by ``PPOBuffer.get()``.
        """
        if self.FIRST_ROLLOUT_FLAG:
            self.FIRST_ROLLOUT_FLAG = False
            self.o = self.env.reset()  # reset environment
        d = False
        # Rollouts only need forward passes; skipping autograd avoids building
        # graphs and keeps grad-tracking tensors out of the NumPy buffers.
        with torch.no_grad():
            for t in range(self.ep_len_rollout):
                # model returns (pi, logp, logp_pi, v, mu)
                a, _, logp_t, v_t, _ = self.model(torch.Tensor(self.o.reshape(1, -1)))
                act = a.numpy()[0]
                o2, r, d, _ = self.env.step(act)
                # save and log (plain NumPy/float values)
                self.buf.store(self.o, act, r, v_t.item(), logp_t.item())
                # Update obs (critical!)
                self.o = o2
                if d:
                    self.buf.finish_path(last_val=0.0)
                    self.o = self.env.reset()  # reset when done
            if not d:
                # The rollout was cut off mid-episode: bootstrap from the value
                # of the current observation. When the last step ended an
                # episode the path is already closed, so nothing is left to do.
                last_val = self.model.vf_mlp(torch.Tensor(self.o.reshape(1, -1))).item()
                self.buf.finish_path(last_val)
        return self.buf.get()

#### Params

In [None]:
# Module-level hyper-parameters read by the Ray workers, main() and the plot helpers.

# Model
# Hidden-layer widths passed to ActorCritic by both worker classes.
hdims = [32, 32]
# Graph (optimisation objective)
# PPO clipping range used when building the surrogate loss in main().
clip_ratio = 0.2
# Adam learning rates for the policy and the value function.
pi_lr = 3e-4
vf_lr = 1e-3
# Only referenced in the plot titles in the visible code.
epsilon = 1e-2
# Buffer
# Discount factor and GAE-Lambda passed to PPOBuffer.
gamma = 0.99
lam = 0.95
# Update
# Maximum number of policy / value gradient steps per outer iteration.
train_pi_iters = 100
train_v_iters = 100
# Policy updates stop early once the KL estimate exceeds 1.5 * target_kl.
target_kl = 0.01
# NOTE(review): `epochs` is not read by any module-level code visible here.
epochs = 1000
# Cap on the length of an evaluation episode in main().
max_ep_len = 1000
# Worker
# CPUs handed to ray.init and number of remote rollout workers.
n_cpu = n_workers = 2
# Number of outer iterations (sync weights -> rollout -> update) in main().
total_steps = 1000
# Evaluate / print every this many outer iterations (and on the first one).
evaluate_every = 50
print_every = 10
# Environment steps collected by each worker per rollout.
ep_len_rollout = 500
# Upper bound on the number of samples drawn per gradient step.
batch_size = 4096

# When True, the evaluation environment is rendered in 'human' mode.
RENDER_ON_EVAL = False

#### Functions for plotting

In [None]:
from matplotlib import pyplot as plt
import pickle

def save_plot_data(plot_dict,file_name):
    """
    Pickle ``plot_dict`` to ``plot_data/<file_name>.json``.

    Despite the ``.json`` extension the file holds pickle data, not JSON; the
    name is kept because the draw functions read the same path. The
    ``plot_data`` directory is created when missing.

    Args:
        plot_dict: Dictionary to persist.
        file_name: Base name of the output file (without extension).

    Returns:
        None.
    """
    import os

    # Opening the file fails with FileNotFoundError if the directory is missing.
    os.makedirs('plot_data', exist_ok=True)
    with open('plot_data/%s.json'%file_name,'wb') as fp:
        pickle.dump(plot_dict, fp)
    print("plot_data saved!")
    return None

def draw_subplt_graph(file_name1, file_name2):
    """
    Plot two saved return curves as stacked subplots and save the figure.

    Args:
        file_name1: Base name of the pickle whose keys are plotted on the
            'step' axis; also used as the name of the output image.
        file_name2: Base name of the pickle whose keys are plotted on the
            'time' axis.

    The image is written to ``plot_data/plot_images/<file_name1>.png``.
    """
    import os

    # The files are pickles despite the .json extension.
    # NOTE: pickle.load must only be used on files written by this program.
    with open('plot_data/%s.json'%file_name1,'rb') as fp:
        plot_dict1 = pickle.load(fp)
    with open('plot_data/%s.json'%file_name2,'rb') as fp2:
        plot_dict2 = pickle.load(fp2)
    plt.figure(figsize=(10,8))
    # Top subplot: ep_return vs. step.
    plt.subplot(2,1,1)
    # Materialise both dict views as lists so x and y are plain sequences
    # of equal length.
    plt.plot(list(plot_dict1.keys()),list(plot_dict1.values()),marker='o')
    plt.xlabel('step')
    plt.ylabel('ep_return')
    plt.title("hdim:%s gamma:[%.4f] lam:[%.4f] clip_ratio:[%.4f]\n epsilon:[%.4f] ep_len_rollout:[%d]\n"
              %(hdims,gamma,lam,clip_ratio,epsilon,ep_len_rollout))
    plt.grid(True, linestyle='--')

    # Bottom subplot: ep_return vs. time.
    plt.subplot(2,1,2)
    plt.plot(list(plot_dict2.keys()),list(plot_dict2.values()),marker='o')
    plt.xlabel('time')
    plt.xticks(rotation=45)
    plt.ylabel('ep_return')
    plt.grid(True, linestyle='--')

    # savefig does not create missing directories.
    os.makedirs('plot_data/plot_images', exist_ok=True)
    plt.savefig('plot_data/plot_images/%s.png'%file_name1,dpi=100)

def draw_graph(file_name):
    """
    Plot one saved return curve and save it as an image.

    Args:
        file_name: Base name of the pickle under ``plot_data/``; also used as
            the name of the output image in ``plot_data/plot_images/``.

    The curve is drawn into the top cell of a 2x1 subplot grid on the current
    figure; no new figure is created here.
    """
    import os

    # The file is a pickle despite the .json extension.
    # NOTE: pickle.load must only be used on files written by this program.
    with open('plot_data/%s.json'%file_name,'rb') as fp:
        plot_dict = pickle.load(fp)

    # ep_return vs. step
    plt.subplot(2,1,1)
    # Materialise both dict views as lists so x and y are plain sequences
    # of equal length.
    plt.plot(list(plot_dict.keys()),list(plot_dict.values()),marker='o')
    plt.xlabel('step')
    plt.ylabel('ep_return')
    plt.title("hdim:%s gamma:[%.4f] lam:[%.4f] clip_ratio:[%.4f]\n epsilon:[%.4f] ep_len_rollout:[%d]"
              %(hdims,gamma,lam,clip_ratio,epsilon,ep_len_rollout))
    plt.grid(True, linestyle='--')

    # savefig does not create missing directories.
    os.makedirs('plot_data/plot_images', exist_ok=True)
    plt.savefig('plot_data/plot_images/%s.png'%file_name,dpi=100)

#### Main Function

In [None]:
def main():
    """
    Distributed PPO training loop.

    Each outer iteration (1) pushes the central model's weights to the Ray
    workers, (2) gathers one rollout from every worker, (3) runs the policy
    and value updates on the combined data, and periodically prints progress
    and evaluates one episode. Evaluation returns are pickled and plotted at
    the end. Ray and the evaluation environment are released even when the
    loop raises.
    """
    print("Pytorch version:[%s]."%(torch.__version__))

    # Initialize PyBullet Ant Environment
    eval_env = get_eval_env()
    adim,odim = eval_env.action_space.shape[0],eval_env.observation_space.shape[0]
    print("Environment Ready. odim:[%d] adim:[%d]."%(odim,adim))

    # Initialize Workers
    ray.init(num_cpus=n_cpu)
    try:
        R = RolloutWorkerClass(seed=0)
        workers = [RayRolloutWorkerClass.remote(worker_id=i,ep_len_rollout=ep_len_rollout)
                for i in range(int(n_workers))]
        print("RAY initialized with [%d] cpus and [%d] workers."%
            (n_cpu, n_workers))
        time.sleep(1)
        # Loop
        start_time = time.time()
        n_env_step = 0  # number of environment steps
        plot_dict = {} # for visualization: iteration -> evaluation return
        plot_dict_time = {} # for visualization: elapsed time -> evaluation return
        for t in range(int(total_steps)):
            # 1. Synchronize worker weights (submitted before the rollout
            #    calls on the same actors; results are not awaited here)
            weights = R.get_weights()
            set_weights_list = [worker.set_weights.remote(weights) for worker in workers]
            # 2. Make rollout and accumulate to Buffers
            t_start = time.time()
            ops = [worker.rollout.remote() for worker in workers]
            rollout_vals = ray.get(ops)
            sec_rollout = time.time() - t_start
            # 3. Update
            t_start = time.time()  # tic
            # Every rollout is [obs, act, adv, ret, logp]; stack each field
            # across workers with a single concatenate per field.
            obs_bufs, act_bufs, adv_bufs, ret_bufs, logp_bufs = [
                np.concatenate(field_parts, axis=0)
                for field_parts in zip(*rollout_vals)]
            n_val_total = obs_bufs.shape[0]
            # Count the collected steps; the counter was never updated before,
            # so the evaluation log always reported 0 steps.
            n_env_step += n_val_total
            # Mini-batch type of update
            for pi_iter in range(int(train_pi_iters)):
                rand_idx = np.random.permutation(n_val_total)[:batch_size]
                buf_batches = [obs_bufs[rand_idx], act_bufs[rand_idx], adv_bufs[rand_idx],
                            ret_bufs[rand_idx], logp_bufs[rand_idx]]
                obs, act, adv, ret, logp_a_old = [torch.Tensor(x) for x in buf_batches]
                # Logged as "entropy": mean negative log-prob of the batch
                # actions under the data-collecting policy.
                ent = (-logp_a_old).mean()
                _, logp_a, _, _ = R.model.policy(obs, act)
                # PPO objectives
                ratio = (logp_a - logp_a_old).exp()
                min_adv = torch.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
                pi_loss = -(torch.min(ratio * adv, min_adv)).mean()
                R.train_pi.zero_grad(set_to_none=True)
                pi_loss.backward()
                R.train_pi.step()
                # a sample estimate for KL-divergence
                kl = torch.mean(logp_a_old - logp_a)
                if kl > 1.5 * target_kl:
                    break
            # Value gradient step
            for _ in range(int(train_v_iters)):
                rand_idx = np.random.permutation(n_val_total)[:batch_size]
                # Only observations and return targets are needed here.
                obs = torch.Tensor(obs_bufs[rand_idx])
                ret = torch.Tensor(ret_bufs[rand_idx])
                v = R.model.vf_mlp(obs).squeeze()
                v_loss = F.mse_loss(v, ret)
                R.train_v.zero_grad(set_to_none=True)
                v_loss.backward()
                R.train_v.step()
            sec_update = time.time() - t_start  # toc
            # Print
            if (t == 0) or (((t + 1) % print_every) == 0):
                print("[%d/%d] rollout:[%.1f]s pi_iter:[%d/%d] update:[%.1f]s kl:[%.4f] target_kl:[%.4f]." %
                    (t + 1, total_steps, sec_rollout, pi_iter, train_pi_iters, sec_update, kl, target_kl))
                print("   pi_loss:[%.4f], entropy:[%.4f]"%(pi_loss, ent))

            # Evaluate
            if (t == 0) or (((t + 1) % evaluate_every) == 0):
                ram_percent = psutil.virtual_memory().percent  # memory usage
                times = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
                print("[Eval. start] step:[%d/%d][%.1f%%] #step:[%.1e] time:[%s] ram:[%.1f%%]." %
                    (t + 1, total_steps, t / total_steps * 100,
                    n_env_step,times,ram_percent)
                    )
                o, d, ep_ret, ep_len = eval_env.reset(), False, 0, 0
                if RENDER_ON_EVAL:
                    _ = eval_env.render(mode='human')

                # Evaluation needs no gradients.
                with torch.no_grad():
                    while not (d or (ep_len == max_ep_len)):
                        a, _, _, _ = R.model.policy(torch.Tensor(o.reshape(1, -1)))
                        o, r, d, _ = eval_env.step(a.numpy()[0])
                        if RENDER_ON_EVAL:
                            _ = eval_env.render(mode='human')
                        ep_ret += r  # compute return
                        ep_len += 1
                print("[Evaluate] ep_ret:[%.4f] ep_len:[%d]" % (ep_ret, ep_len))
                plot_dict[t] = ep_ret #for visualization
                plot_dict_time[times] = ep_ret #for visualization

        # for visualization
        file_name = 'PPO_plt_data_2'
        file_name2 = 'PPO_plt_data_time_2'

        save_plot_data(plot_dict=plot_dict,file_name=file_name)
        save_plot_data(plot_dict=plot_dict_time,file_name=file_name2)
        draw_subplt_graph(file_name1=file_name,file_name2=file_name2)

        print("Done.")
    finally:
        # Close: always release the environment and the Ray runtime so a
        # failed run does not leave Ray initialised.
        eval_env.close()
        ray.shutdown()

#### Main Loop

In [None]:
# Run the Ray-based PPO training loop when this cell is executed.
main()