In [1]:
from a2c_ppo_acktr.model import Policy
from a2c_ppo_acktr.algo import PPO
from a2c_ppo_acktr.storage import RolloutStorage
from a2c_ppo_acktr import utils

import torch
import gym
import numpy as np
from collections import deque
from tqdm import tqdm

In [2]:
env = gym.make('CartPole-v0')
env.action_space.shape = (1,)
env.action_space.high = np.array([1])
env.action_space.low = np.array([0])



In [3]:
actor_critic = Policy(
        env.observation_space.shape,
        env.action_space,
        base_kwargs={'recurrent': False})

In [4]:
class args(object):
    eval_interval = None
    log_interval = 10
    use_gae = False
    num_env_steps = 10e6
    num_steps = 32
    clip_param = 0.2
    ppo_epoch = 4
    num_mini_batch = 32
    value_loss_coef = 0.5
    entropy_coef = 0.01
    lr = 7e-4
    eps = 1e-5
    max_grad_norm = 0.5
    gamma = 0.99
    gae_lambda = 0.95
    use_proper_time_limits = False

In [5]:
agent = PPO(
        actor_critic,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm)

In [6]:
memory = RolloutStorage(args.num_steps, 1,
                        env.observation_space.shape, env.action_space,
                        actor_critic.recurrent_hidden_state_size)

In [7]:
episode_rewards = deque(maxlen=10)

num_updates = int(args.num_env_steps) // args.num_steps

done = True
episode_reward = 0

for j in range(num_updates):
    if done:
        state = env.reset()
        memory.obs[0].copy_(torch.from_numpy(state).float())
        episode_reward = 0
    
    utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
    for step in range(args.num_steps):
        # Sample actions
        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                memory.obs[step], memory.recurrent_hidden_states[step],
                memory.masks[step])
            actual_action = 0
            if action > 0.5:
                actual_action = 1

        # Obser reward and next obs
        state, reward, done, info = env.step(actual_action)

        episode_reward += reward
        
        if done:
            episode_rewards.append(episode_reward)

        # If done then clean the history of observations.
        mask = torch.FloatTensor([[1 - float(done)]])
        bad_mask = torch.FloatTensor([[1 - float('bad_transition' in info.keys())]])
        memory.insert(torch.from_numpy(state).float(), recurrent_hidden_states, action,
                        action_log_prob, value, torch.FloatTensor([[reward]]), mask, bad_mask)
    
    with torch.no_grad():
        next_value = actor_critic.get_value(
            memory.obs[-1], memory.recurrent_hidden_states[-1],
            memory.masks[-1]).detach()

    memory.compute_returns(next_value, args.use_gae, args.gamma,
                           args.gae_lambda, args.use_proper_time_limits)
    
    value_loss, action_loss, dist_entropy = agent.update(memory)
    memory.after_update()
    
    if j % args.log_interval == 0 and len(episode_rewards) > 1:
        total_num_steps = (j + 1) * args.num_steps
        print(
            "Updates {}, num timesteps {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
            .format(j, total_num_steps,
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards), dist_entropy, value_loss,
                    action_loss))



Updates 0, num timesteps 32 
 Last 6 training episodes: mean/median reward 27.0/27.0, min/max reward 27.0/27.0

Updates 10, num timesteps 352 
 Last 10 training episodes: mean/median reward 38.0/38.0, min/max reward 38.0/38.0

Updates 20, num timesteps 672 
 Last 10 training episodes: mean/median reward 18.0/18.0, min/max reward 18.0/18.0

Updates 30, num timesteps 992 
 Last 10 training episodes: mean/median reward 24.0/24.0, min/max reward 24.0/24.0

Updates 40, num timesteps 1312 
 Last 10 training episodes: mean/median reward 21.0/21.0, min/max reward 21.0/21.0

Updates 50, num timesteps 1632 
 Last 10 training episodes: mean/median reward 27.6/29.0, min/max reward 19.0/32.0

Updates 60, num timesteps 1952 
 Last 10 training episodes: mean/median reward 52.0/52.0, min/max reward 52.0/52.0

Updates 70, num timesteps 2272 
 Last 10 training episodes: mean/median reward 74.0/74.0, min/max reward 74.0/74.0

Updates 80, num timesteps 2592 
 Last 10 training episodes: mean/median reward 

KeyboardInterrupt: 

In [8]:
while True:
    state = env.reset()
    episode_reward = 0
    env.render()
    while True:
        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                torch.from_numpy(state).float(), None)
            actual_action = 0
            if action > 0.5:
                actual_action = 1
        state, reward, done, _ = env.step(actual_action)
        episode_reward += reward
        env.render()
        if done:
            print(episode_reward)
            break

196.0
200.0
200.0
178.0
200.0
185.0
200.0
200.0
200.0
173.0
198.0
199.0
200.0
182.0
200.0
200.0
200.0
162.0
200.0
200.0
200.0
175.0
200.0
200.0
200.0


KeyboardInterrupt: 