In [1]:
from a2c_ppo_acktr.model import Policy
from a2c_ppo_acktr.algo import PPO, A2C_ACKTR
from a2c_ppo_acktr.storage import RolloutStorage, ExtendableStorage
from a2c_ppo_acktr import utils

import torch
import gym
import numpy as np
from collections import deque
from tqdm import tqdm
import random

In [2]:
# Build the CartPole environment, then attach `shape`, `high` and `low`
# attributes to its action space by hand.
# NOTE(review): presumably one of the storage/policy classes reads these
# attributes; the rollout code itself thresholds the sampled action to 0/1.
# Confirm which consumer needs them before changing the environment.
env = gym.make('CartPole-v0')
env.action_space.shape = (1,)
env.action_space.high = np.array([1])
env.action_space.low = np.array([0])



In [3]:
# Actor-critic network. The observation shape is read from the environment
# rather than hard-coded as (4,), so this cell keeps working if the env built
# above is swapped for one with a different observation size.
# The base network is explicitly configured as non-recurrent.
actor_critic = Policy(
        env.observation_space.shape,
        env.action_space,
        base_kwargs={'recurrent': False})

In [4]:
class args:
    """Static bag of hyper-parameters, read as ``args.<name>`` elsewhere in the notebook."""

    # --- rollout length and return computation ---
    num_env_steps = 1e7  # kept as a float; the training cell casts it with int()
    num_steps = 32
    memory_capacity = 32
    gamma = 0.99
    use_gae = False
    gae_lambda = 0.95
    use_proper_time_limits = False

    # --- optimiser settings handed to the agent constructor ---
    lr = 7e-5
    eps = 1e-5
    alpha = 0.99
    max_grad_norm = 0.05

    # --- loss weights handed to the agent constructor ---
    value_loss_coef = 0.5
    entropy_coef = 0.01

    # --- PPO-named settings (the agent constructed in this notebook is A2C_ACKTR) ---
    ppo_epoch = 4
    num_mini_batch = 32
    clip_param = 0.05

    # --- logging / evaluation cadence ---
    log_interval = 10
    eval_interval = None

In [5]:
# Learner wrapping the actor-critic network. The two loss coefficients are
# passed positionally; the optimiser settings and gradient-norm cap go by keyword.
# `acktr=False` presumably selects the plain A2C update rather than ACKTR;
# confirm against a2c_ppo_acktr.algo.
agent = A2C_ACKTR(
    actor_critic,
    args.value_loss_coef,
    args.entropy_coef,
    acktr=False,
    max_grad_norm=args.max_grad_norm,
    alpha=args.alpha,
    eps=args.eps,
    lr=args.lr,
)

In [6]:
# Reference rollout buffer plus the experimental ExtendableStorage that the
# later cells exercise alongside it.
# The observation shape is taken from the environment instead of a hard-coded
# (4,). The second argument (1) is presumably the number of parallel
# environments; confirm against RolloutStorage.
# Note: the training-loop cell rebuilds `memory` at the start of every update,
# so this instance is replaced before it is ever filled.
memory = RolloutStorage(args.num_steps, 1,
                        env.observation_space.shape, env.action_space,
                        actor_critic.recurrent_hidden_state_size)
my_memory = ExtendableStorage()

In [7]:
# Rolling window of the last 10 finished-episode returns, plus per-update loss
# histories (these are only appended to by the parked update step further down).
episode_rewards = deque(maxlen=10)
value_losses = []
action_losses = []
dist_entropies = []

num_updates = int(args.num_env_steps) // args.num_steps

# Start in the "done" state so the first iteration resets the environment.
done = True
episode_reward = 0

for j in range(num_updates):
    # Both buffers start empty for every update.
    memory = RolloutStorage(args.num_steps, 1,
                            env.observation_space.shape, env.action_space,
                            actor_critic.recurrent_hidden_state_size)
    my_memory.clear()
    if done:
        state = env.reset()
        episode_reward = 0
    # Fix: seed slot 0 of the freshly built buffer on EVERY update, not only
    # after a reset. Previously an episode that continued across an update
    # boundary left memory.obs[0] at its initial contents, so the first act()
    # call of the rollout did not see the environment's current observation.
    memory.obs[0].copy_(torch.from_numpy(state).float())

    #utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
    for step in range(args.num_steps):
        # Sample actions
        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                memory.obs[step], memory.recurrent_hidden_states[step],
                memory.masks[step])
            # The environment is stepped with a plain 0/1 integer.
            actual_action = int(action > 0.5)

        # Record the pre-step observation in the experimental storage, tagged
        # with a randomly chosen unit id (0 or 1).
        my_memory.insert(torch.from_numpy(state).float(), action, action_log_prob, value, unit_id=random.randint(0, 1))

        # Observe reward and next obs
        state, reward, done, info = env.step(actual_action)
        my_memory.reward(reward)

        episode_reward += reward

        # If done then clean the history of observations.
        mask = torch.FloatTensor([[1 - float(done)]])
        bad_mask = torch.FloatTensor([[1 - float('bad_transition' in info)]])
        memory.insert(torch.from_numpy(state).float(), recurrent_hidden_states, action,
                        action_log_prob, value, torch.FloatTensor([[reward]]), mask, bad_mask)
        if done:
            episode_rewards.append(episode_reward)
            break
    # Debug stop: only the first rollout is collected, so the cells below can
    # inspect both buffers. Everything after this `break` in the loop body is
    # unreachable; the update/eval step is additionally parked in a string literal.
    # NOTE(review): the parked evaluation code slices observations to their
    # first component and feeds act() a (1, 1) tensor, which does not match the
    # full observation used during the rollout above. Revisit before re-enabling.
    break
    
    """with torch.no_grad():
        next_value = actor_critic.get_value(
            memory.obs[-1], memory.recurrent_hidden_states[-1],
            memory.masks[-1]).detach()

    memory.compute_returns(next_value, args.use_gae, args.gamma,
                           args.gae_lambda, args.use_proper_time_limits)
    
    value_loss, action_loss, dist_entropy = agent.update(memory)
    value_losses.append(value_loss)
    action_losses.append(action_loss)
    dist_entropies.append(dist_entropy)
    memory.after_update()
    
    if j % args.log_interval == 0 and len(episode_rewards) > 1:
        eval_rewards = []
        done = True
        for i in tqdm(range(250), desc='Eval'):
            _done = False
            state = env.reset()[:1]
            eval_rewards.append(0)
            while not _done:
                _, action, _, _ = actor_critic.act(torch.from_numpy(state).float().view((1, 1)), None, None)
                action = int(action > 0.5)
                state, reward, _done, _ = env.step(action)
                state = state[:1]
                eval_rewards[-1] += reward
        total_num_steps = (j + 1) * args.num_steps
        print(
            "Updates {}, num timesteps {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\neval episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
            .format(j, total_num_steps,
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards), np.mean(eval_rewards),
                    np.median(eval_rewards), np.min(eval_rewards),
                    np.max(eval_rewards)))"""

In [8]:
# Value estimate for the last observation returned by the rollout loop; it is
# passed to the compute_returns calls in the following cells. The recurrent
# state and mask arguments are None, and the result is detached from autograd.
next_value = actor_critic.get_value(
            torch.from_numpy(state).float(), None,
            None).detach()

In [9]:
# Returns for the reference RolloutStorage. The flags are read from `args`
# (use_gae, gae_lambda, use_proper_time_limits) instead of repeating their
# values as literals, so this call cannot drift from the configuration cell.
memory.compute_returns(next_value, args.use_gae, args.gamma,
                       args.gae_lambda, args.use_proper_time_limits)

In [10]:
# Same bootstrap value and discount, applied to the experimental storage.
# NOTE(review): `done=True` is hard-coded here instead of passing the `done`
# flag left behind by the rollout loop. Confirm that this is intended.
my_memory.compute_returns(next_value, args.gamma, done=True)

In [11]:
# Shape check on the computed returns (recorded output: one more row than the
# 32 stored transitions).
my_memory.returns.shape

torch.Size([33, 1])

In [12]:
# Number of observations stored during the rollout.
len(my_memory.obs)

32

In [13]:
# Number of actions stored during the rollout.
len(my_memory.actions)

32

In [14]:
# Shape check on the value predictions (recorded output has the same shape as
# the returns).
my_memory.value_preds.shape

torch.Size([33, 1])

In [15]:
# Number of action log-probabilities stored during the rollout.
len(my_memory.action_log_probs)

32

In [16]:
# Advantage estimate: return minus value prediction, with the final row of
# both tensors dropped so one entry remains per stored transition.
my_advantages = (my_memory.returns[:-1] - my_memory.value_preds[:-1])

In [17]:
# Mini-batch generator over the experimental storage, asking for a single batch.
# Presumably it yields only the transitions inserted with unit_id=0; verify
# against ExtendableStorage.feed_forward_generator.
gen = my_memory.feed_forward_generator(my_advantages, unit_id=0, num_mini_batch=1)

In [24]:
# Display every stored observation, to compare by eye with the batch printed
# in the next cell.
my_memory.obs

[tensor([ 0.0147, -0.0495, -0.0177,  0.0128]),
 tensor([ 0.0137,  0.1458, -0.0174, -0.2854]),
 tensor([ 0.0167, -0.0490, -0.0231,  0.0018]),
 tensor([ 0.0157, -0.2438, -0.0231,  0.2871]),
 tensor([ 0.0108, -0.0484, -0.0173, -0.0128]),
 tensor([ 0.0098,  0.1470, -0.0176, -0.3109]),
 tensor([ 0.0128, -0.0479, -0.0238, -0.0238]),
 tensor([ 0.0118,  0.1476, -0.0243, -0.3239]),
 tensor([ 0.0148, -0.0472, -0.0308, -0.0390]),
 tensor([ 0.0138, -0.2418, -0.0315,  0.2438]),
 tensor([ 0.0090, -0.4365, -0.0267,  0.5264]),
 tensor([ 0.0003, -0.2410, -0.0161,  0.2254]),
 tensor([-0.0046, -0.0457, -0.0116, -0.0723]),
 tensor([-0.0055,  0.1496, -0.0131, -0.3686]),
 tensor([-0.0025,  0.3449, -0.0204, -0.6654]),
 tensor([ 0.0044,  0.1501, -0.0338, -0.3792]),
 tensor([ 0.0074, -0.0445, -0.0413, -0.0974]),
 tensor([ 0.0065, -0.2390, -0.0433,  0.1820]),
 tensor([ 0.0017, -0.4335, -0.0396,  0.4607]),
 tensor([-0.0069, -0.6281, -0.0304,  0.7406]),
 tensor([-0.0195, -0.4325, -0.0156,  0.4385]),
 tensor([-0.0

In [19]:
# Pull only the first mini-batch from the generator and print its observation
# tensor (element 0 of the yielded sample); `obs_batch` is inspected again below.
for sample in gen:
    obs_batch = sample[0]
    print(obs_batch)
    break

tensor([[-5.1845e-02, -4.3239e-01,  7.1041e-03,  4.3534e-01],
        [ 6.5296e-03, -2.3904e-01, -4.3287e-02,  1.8200e-01],
        [-3.2877e-02, -4.3221e-01, -4.0290e-03,  4.3149e-01],
        [ 1.1820e-02,  1.4759e-01, -2.4286e-02, -3.2390e-01],
        [ 1.4772e-02, -4.7178e-02, -3.0764e-02, -3.8976e-02],
        [ 1.2777e-02, -4.7865e-02, -2.3810e-02, -2.3803e-02],
        [ 1.5681e-02, -2.4381e-01, -2.3078e-02,  2.8709e-01],
        [-2.8133e-02, -2.3719e-01, -6.8485e-03,  1.4098e-01],
        [ 1.4735e-02, -4.9526e-02, -1.7664e-02,  1.2847e-02],
        [ 2.6118e-04, -2.4102e-01, -1.6138e-02,  2.2545e-01],
        [-4.7101e-02, -2.3720e-01,  4.2779e-03,  1.4131e-01],
        [ 1.6661e-02, -4.9025e-02, -2.3114e-02,  1.7859e-03],
        [ 8.9912e-03, -4.3650e-01, -2.6667e-02,  5.2641e-01],
        [ 1.3744e-02,  1.4584e-01, -1.7407e-02, -2.8536e-01],
        [ 1.7488e-03, -4.3352e-01, -3.9647e-02,  4.6072e-01],
        [ 9.8372e-03,  1.4700e-01, -1.7592e-02, -3.1089e-01],
        

In [21]:
# Batch size check: the recorded output has 18 rows, i.e. a subset of the 32
# stored transitions.
obs_batch.shape

torch.Size([18, 4])

In [None]:
# Watch the current policy play CartPole, one episode after another, printing
# each episode's return. Runs until the kernel is interrupted.
try:
    while True:
        state = env.reset()
        episode_reward = 0
        env.render()
        while True:
            with torch.no_grad():
                # act() is called with (observation, recurrent state, masks) at
                # every other call site in this notebook; the mask argument was
                # missing here. None is passed for both, as in the get_value cell.
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    torch.from_numpy(state).float(), None, None)
                # Same 0/1 thresholding as in the rollout loop.
                actual_action = int(action > 0.5)
            state, reward, done, _ = env.step(actual_action)
            episode_reward += reward
            env.render()
            if done:
                print(episode_reward)
                break
finally:
    # Release the render window when the loop is interrupted.
    env.close()

In [None]:
import matplotlib.pyplot as plt
n = 1000  # default smoothing window, in samples


def moving_average(a, k=n):
    """Return the mean of every full window of ``k`` consecutive samples of ``a``.

    The result is a float array with ``len(a) - k + 1`` entries, or an empty
    array when ``a`` holds fewer than ``k`` samples. The default for ``k`` is
    the value ``n`` had when this function was defined.
    """
    running_total = np.cumsum(a, dtype=float)
    window_total = running_total.copy()
    # Subtracting the running total from k samples earlier leaves the sum of
    # exactly the last k samples at each position.
    window_total[k:] = running_total[k:] - running_total[:-k]
    # The first k - 1 positions cover incomplete windows and are dropped.
    return window_total[k - 1:] / k
# Smoothed training curves, one figure per tracked metric. Replaces three
# copy-pasted plot calls (the last of which had no plt.show()) with one loop.
for _metric_name, _series in (
        ("action_loss", action_losses),
        ("value_loss", value_losses),
        ("dist_entropy", dist_entropies)):
    _smoothed = moving_average(_series)
    # The x-axis is derived from the smoothed series itself, so x and y always
    # have the same length (the old `[:1-n]` slice came out empty for n == 1).
    plt.plot(range(len(_smoothed)), _smoothed)
    plt.title("{} (moving average, window={})".format(_metric_name, n))
    plt.xlabel("update")
    plt.ylabel(_metric_name)
    plt.show()