In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

from pyvirtualdisplay import Display

# Start a non-visible 1024x768 virtual display (presumably so that rendering
# works on a machine without a physical screen).
display = Display(visible=0, size=(1024, 768))
display.start()

# Export the virtual display as DISPLAY, in the form ":<display>.<screen>".
os.environ["DISPLAY"] = "".join((":", str(display.display), ".", str(display.screen)))

# NOTE(review): moviepy is imported only after DISPLAY has been set; this
# ordering is kept on purpose - confirm before moving the import.
import moviepy.editor as mpy

In [3]:
from REINFORCE_helper import RunningVariance
from time import time
from REINFORCE_helper import BaseAgent
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam, SGD
import keras.backend as K
import numpy as np

Using TensorFlow backend.


In [4]:
class ReinforceAgent(BaseAgent):
    """Agent that builds the policy/critic networks and picks actions.

    The methods below read ``self.model``, ``self.observation``, ``self.nS``,
    ``self.nA`` and ``self.gamma``; none of them is defined in this class, so
    they are presumably provided by ``BaseAgent`` (confirm there).
    """

    def get_policy_model(self, lr=0.001, hidden_layer_neurons = 128, input_shape=[4], output_shape=2):
        """Build and compile the policy network.

        The network is one ReLU hidden layer followed by a softmax output
        layer, compiled with ``Adam(lr)`` and categorical cross-entropy.

        Args:
            lr: learning rate handed to ``Adam``.
            hidden_layer_neurons: width of the hidden layer.
            input_shape: input shape of the first ``Dense`` layer.
            output_shape: number of units of the softmax layer.

        Returns:
            The compiled ``Sequential`` model.
        """
        # Metric: the loss without the return multiplying it. Only the sign
        # of the targets is kept before computing the cross-entropy.
        def loss_metric(y_true, y_pred):
            return K.categorical_crossentropy(K.sign(y_true), y_pred)

        policy = Sequential([
            Dense(hidden_layer_neurons, input_shape=input_shape, activation='relu'),
            Dense(output_shape, activation='softmax'),
        ])
        # Open question left by the author: why does categorical_crossentropy
        # work fine here?
        policy.compile(Adam(lr), loss=['categorical_crossentropy'], metrics=[loss_metric])
        return policy

    def get_action(self, eval=False):
        """Choose an action for the current observation.

        Args:
            eval: when it is exactly ``False`` the action is sampled from the
                predicted probabilities; for any other value the most
                probable action is taken.

        Returns:
            Tuple ``(action, action_one_hot, p)``: the chosen action index,
            a float vector of length ``self.nA`` with a 1 at that index, and
            the raw output of ``self.model.predict``.
        """
        probs = self.model.predict([self.observation.reshape(1, self.nS)])
        greedy = eval is not False
        if greedy:
            action = np.argmax(probs[0])
        else:
            action = np.random.choice(self.nA, p=probs[0])
        one_hot = (np.arange(self.nA) == action).astype(float)
        return action, one_hot, probs

    def get_entropy(self, preds, epsilon=1e-12):
        """Return the mean row-wise entropy of ``preds`` divided by ``log(self.nA)``.

        Args:
            preds: 2-D array; the entropy is summed along axis 1.
            epsilon: constant added inside the logarithm to avoid ``log(0)``.
        """
        row_entropy = -np.sum(np.log(preds+epsilon)*preds, axis=1)
        return np.mean(row_entropy/np.log(self.nA))

    def get_critic_model(self, lr=0.001, hidden_layer_neurons = 128, input_shape=[4], output_shape=1):
        """Build and compile the critic network.

        The network is one tanh hidden layer followed by a linear output
        layer, compiled with ``Adam(lr)`` and mean squared error.

        Args:
            lr: learning rate handed to ``Adam``.
            hidden_layer_neurons: width of the hidden layer.
            input_shape: input shape of the first ``Dense`` layer.
            output_shape: number of units of the linear output layer.

        Returns:
            The compiled ``Sequential`` model.
        """
        critic = Sequential([
            Dense(hidden_layer_neurons, input_shape=input_shape, activation='tanh'),
            Dense(output_shape, activation='linear'),
        ])
        critic.compile(Adam(lr), loss=['mse'])
        return critic

    def get_discounted_rewards(self, r):
        """Take 1D float array of rewards and compute discounted reward.

        Args:
            r: rewards of one episode; converted to a float array first, in
                case a list is passed.

        Returns:
            Array shaped like ``r`` where entry ``t`` equals
            ``r[t] + self.gamma * result[t + 1]``.
        """
        rewards = np.array(r, dtype=float)
        returns = np.zeros_like(rewards)
        carried = 0
        # Walk backwards so each step reuses the return of the step after it.
        for t in range(rewards.size - 1, -1, -1):
            carried = carried * self.gamma + rewards[t]
            returns[t] = carried
        return returns

![actor_critic_alg.png](actor_critic_alg.png)

![adv_n_steps.png](adv_n_steps.png)

In [5]:
def compute_n_step_targets(rewards, values, gamma=0.999, n_steps = 5):
    """Compute n-step advantage estimates for one episode.

    For every time step ``t`` the result is::

        A_t = sum_{k=0}^{m-1} gamma**k * rewards[t+k]
              - values[t]
              + gamma**n_steps * values[t+n_steps]

    with ``m = min(n_steps, len(rewards) - t)``. ``values`` is padded with
    ``n_steps`` rows of zeros, so the bootstrap term is 0 whenever
    ``t + n_steps`` points past the last row of ``values``.

    Args:
        rewards: per-step rewards of the episode (length ``T``); a list or an
            array, flattened to 1-D.
        values: value estimates as a column, shape ``(rows, 1)`` with at
            least ``T`` rows (the caller in this notebook passes ``T + 1``:
            one per observation plus the last observation).
        gamma: discount factor.
        n_steps: number of rewards accumulated before bootstrapping. It may
            be larger than the episode length; the reward window is then
            simply cut at the end of the episode.

    Returns:
        ``np.ndarray`` of shape ``(T, 1)`` holding ``A_t`` for every step
        (an empty array when ``rewards`` is empty).
    """
    rewards = np.asarray(rewards, dtype=float).reshape(-1)
    ep_len = len(rewards)
    # Discounts are sized by n_steps, not by the episode length, so indexing
    # never runs out of range when n_steps > ep_len.
    discounts = np.power(float(gamma), np.arange(n_steps))
    bootstrap_discount = float(gamma) ** n_steps
    # Zero padding makes values[t + n_steps] well defined near the episode end.
    padded_values = np.vstack([values, np.zeros([n_steps, 1])])

    out = []
    for t in range(ep_len):
        # Rewards from t up to (at most) n_steps ahead; shorter at the tail.
        window = rewards[t:t + n_steps]
        n_step_return = (discounts[:len(window)] * window).sum()
        A_t = n_step_return - padded_values[t] + bootstrap_discount * padded_values[t + n_steps]
        out.append(A_t)
    return np.array(out)

In [6]:
# print(compute_n_step_targets(rewards, values, gamma=reinforce_agent.gamma,  n_steps = 1)[-10:].reshape(-1))
# print((rewards.reshape(-1,1) + reinforce_agent.gamma*values[1:] - values[:-1])[-10:].reshape(-1))

In [7]:
# print(compute_n_step_targets(rewards, values, gamma=reinforce_agent.gamma,  n_steps = len(rewards))[:10].reshape(-1))
# print((disc_sum_rews.reshape(-1, 1) - values[:-1])[:10].reshape(-1))

In [8]:
# Actor-critic training with n-step advantage estimates.
N_STEPS = 3  # horizon passed to compute_n_step_targets

reinforce_agent = ReinforceAgent('LunarLander-v2', n_experience_episodes=1, EPISODES=2000, epochs=1,
                                 lr=0.01, algorithm='ACTOR_CRITIC_N_STEPS', gif_to_board=True, batch_size=128)

initial_time = time()
running_variance = RunningVariance()
critic_model = reinforce_agent.get_critic_model(lr=0.001,
                                           hidden_layer_neurons=128,
                                           input_shape=[reinforce_agent.nS],
                                           output_shape=1)

try:
    while reinforce_agent.episode < reinforce_agent.EPISODES:
        obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, last_obs, time_steps = reinforce_agent.get_experience_episodes(return_ts=True)

        # Critic step: regress the value estimate onto the discounted returns.
        history_critic = critic_model.fit(obs, disc_sum_rews, verbose=0, epochs=1, batch_size=reinforce_agent.batch_size)

        # Value estimates for every visited observation plus the last one,
        # which is needed for the bootstrap term of the final steps.
        all_obs = np.vstack([obs, [last_obs]])
        values = critic_model.predict(all_obs)

        advantage = compute_n_step_targets(rewards, values, gamma=reinforce_agent.gamma, n_steps=N_STEPS)

        # Track the variance of the advantages for logging.
        for ad in advantage:
            running_variance.add(ad)

        # Actor step: the actions scaled by their advantage are the targets.
        # NOTE(review): assumes `actions` holds one-hot rows - confirm in
        # get_experience_episodes.
        pseudolabels = actions*advantage

        history_loss = reinforce_agent.model.fit(obs, pseudolabels, verbose=0, epochs=reinforce_agent.epochs, batch_size=reinforce_agent.batch_size)

        reinforce_agent.log_data(reinforce_agent.episode,
                          history_loss.history['loss'][0],
                          np.mean(ep_len),
                          reinforce_agent.get_entropy(preds),
                          running_variance.get_variance(),
                          history_loss.history['loss_metric'][0],
                          time() - initial_time, np.mean(ep_returns[-1]),
                          history_critic.history['loss'][0])
finally:
    # Close the writer even if the run is interrupted from the notebook
    # (e.g. KeyboardInterrupt) or a step raises; the exception still
    # propagates after the cleanup.
    reinforce_agent.writer.close()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
correr en linea de comando: tensorboard --logdir logs/
Episode: 50
Model on episode 51 improved from -inf to -739.8845748918739. Saved!


t:   1%|▏         | 14/1001 [00:00<00:07, 131.42it/s, now=None]

MoviePy - Building file /tmp/tmp1z1ijcbt.gif with imageio.




Episode: 102
Model on episode 103 improved from -739.8845748918739 to -192.20593634572998. Saved!


t:   4%|▍         | 14/328 [00:00<00:02, 135.66it/s, now=None]

MoviePy - Building file /tmp/tmp2uvneo2a.gif with imageio.




Episode: 206
Model on episode 207 improved from -192.20593634572998 to -178.52654533650679. Saved!


t:   4%|▍         | 14/314 [00:00<00:02, 133.68it/s, now=None]

MoviePy - Building file /tmp/tmpomd09qwh.gif with imageio.




Episode: 258
Model on episode 259 improved from -178.52654533650679 to -61.78141719184408. Saved!


t:   3%|▎         | 14/491 [00:00<00:03, 134.90it/s, now=None]

MoviePy - Building file /tmp/tmpb0uak1vj.gif with imageio.




Episode: 362
Model on episode 363 improved from -61.78141719184408 to 12.399358122732927. Saved!


t:   3%|▎         | 14/407 [00:00<00:02, 135.52it/s, now=None]

MoviePy - Building file /tmp/tmprneym5hj.gif with imageio.




Episode: 508

KeyboardInterrupt: 