In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. REINFORCE_helper, imported below) are reloaded before each cell
# executes and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Headless setup: start a virtual display so that code which needs an
# X server can run on a machine without a physical screen.
# Requirements:
#   pip install PyVirtualDisplay
#   sudo apt-get install xvfb
import os

from pyvirtualdisplay import Display

display = Display(visible=0, size=(1024, 768))
display.start()

# Export the display in the ":<display>.<screen>" form read by X clients.
os.environ["DISPLAY"] = ":%s.%s" % (display.display, display.screen)

In [3]:
from REINFORCE_helper import RunningVariance
from time import time
from REINFORCE_helper import BaseAgent, format_as_pandas
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.optimizers import Adam, SGD
import keras.backend as K
import numpy as np

Using TensorFlow backend.


In [4]:
class ReinforceAgent(BaseAgent):
    """REINFORCE agent for continuous actions using a fixed-std Gaussian policy.

    The policy network outputs a tanh-squashed mean action; Gaussian noise with
    standard deviation ``self.noise`` is added when acting in training mode.

    This class reads ``self.noise``, ``self.env``, ``self.scaler``,
    ``self.observation``, ``self.nS``, ``self.model`` and ``self.gamma`` but
    does not define them — presumably they are set up by ``BaseAgent``
    (REINFORCE_helper); confirm there.
    """

    def actor_critic_loss_continuous(self, advantage):
        """Build the policy-gradient loss for the fixed-std Gaussian policy.

        Args:
            advantage: Keras tensor (the model's second input) used to weight
                each sample's log-likelihood.

        Returns:
            A Keras-compatible ``loss(y_true, y_pred)`` closure, where
            ``y_true`` is the action that was taken and ``y_pred`` is the
            network output used as the Gaussian mean.
        """
        def loss(y_true, y_pred):
            var = K.square(self.noise)
            # Log-density of N(y_pred, var) evaluated at the taken action.
            # The policy gradient is grad(log pi) * advantage, so the loss must
            # weight the *log*-probability; working in log space also avoids
            # the underflow of exp() for actions far from the mean.
            log_prob = (- K.square(y_true - y_pred) / (2.0 * var)
                        - 0.5 * K.log(2.0 * np.pi * var))
            # Action dimensions are independent, so the joint log-probability
            # is the sum over the last axis (a no-op for 1-D action spaces).
            log_prob = K.sum(log_prob, axis=-1, keepdims=True)
            # NOTE(review): get_action() multiplies the network output by
            # env.action_space.high before sampling, while y_pred here is the
            # unscaled tanh output. If the `actions` passed to fit() are the
            # ones returned by get_action(), the two means disagree whenever
            # high != 1 (e.g. Pendulum, high = 2) — confirm and rescale.
            return -K.mean(log_prob * advantage)
        return loss

    def get_policy_model(self, lr=0.001, hidden_layer_neurons = 128, input_shape=[4], output_shape=2):
        """Create and compile the policy network.

        Args:
            lr: Learning rate passed to the Adam optimizer.
            hidden_layer_neurons: Width of each of the two ReLU hidden layers.
            input_shape: Shape of a single observation.
            output_shape: Number of units in the tanh output layer.

        Returns:
            A compiled Keras ``Model`` with inputs ``[state, advantage]`` and
            one output. The advantage input is not connected to the network;
            it is only consumed by the loss.
        """
        state_input = Input(shape=input_shape)
        advantage = Input(shape=(1,))
        x = Dense(hidden_layer_neurons, activation='relu')(state_input)
        x = Dense(hidden_layer_neurons, activation='relu')(x)
        out_actions = Dense(output_shape, activation='tanh')(x)

        model = Model(inputs=[state_input, advantage], outputs=[out_actions])
        model.compile(Adam(lr), loss=self.actor_critic_loss_continuous(advantage))
        return model

    def get_action(self, eval=False):
        """Choose an action for the current observation.

        Args:
            eval: When truthy, return the scaled network output unchanged;
                otherwise add Gaussian noise with std ``self.noise``.

        Returns:
            Tuple ``(action, action_one_hot, p)``. ``action`` and
            ``action_one_hot`` are the same array (there is no one-hot
            encoding for continuous actions); ``p`` is the network prediction
            for the single observation multiplied by ``action_space.high``.
        """
        # The model has two inputs; the advantage is irrelevant for inference,
        # so a zero placeholder of the right shape is supplied.
        dummy_adv = np.zeros((1, 1))
        obs = self.scaler.transform(self.observation.reshape(1, self.nS))
        p = self.model.predict([obs, dummy_adv])*self.env.action_space.high
        if eval:
            action = p[0]
        else:
            # Truthiness test (rather than `eval is False`) so that falsy
            # non-bool flags such as np.bool_(False) also explore.
            # The noisy action is not clipped to the action-space bounds here.
            action = p[0] + np.random.normal(loc=0, scale=self.noise, size=p[0].shape)
        action_one_hot = action

        return action, action_one_hot, p

    def get_entropy(self, preds, epsilon=1e-12):
        """Placeholder entropy value for logging.

        Always returns the constant 1; ``preds`` and ``epsilon`` are unused.
        The disabled formula below is a normalised categorical entropy, kept
        for reference only.
        """
        # entropy = np.mean(-np.sum(np.log(preds+epsilon)*preds, axis=1)/np.log(self.nA))
        return 1

    def get_discounted_rewards(self, r):
        """Compute the discounted cumulative reward for every time step.

        Args:
            r: 1-D sequence (list or array) of per-step rewards.

        Returns:
            Float array with the same shape as ``r`` whose element ``t`` is
            ``r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...`` with
            ``gamma = self.gamma``.
        """
        # Accept plain lists as well as arrays.
        r = np.array(r, dtype=float)
        discounted_r = np.zeros_like(r)
        running_add = 0
        # Walk backwards so each step reuses the return of the following step.
        for t in reversed(range(r.size)):
            running_add = running_add * self.gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r

In [None]:
# Alternative environment, kept for quick switching:
# reinforce_agent = ReinforceAgent('MountainCarContinuous-v0', n_experience_episodes=2, EPISODES=2000, epochs=1, lr=0.001, algorithm='REINFORCE_CAUSAL')
# Build the agent on Pendulum-v0. `noise` is not passed here, so the agent
# uses whatever default BaseAgent defines for it.
reinforce_agent = ReinforceAgent('Pendulum-v0', n_experience_episodes=2, EPISODES=2000, epochs=1, lr=0.001, algorithm='REINFORCE_CAUSAL')

Instructions for updating:
Colocations handled automatically by placer.


In [None]:
# Print the lower and upper bounds of the environment's action space;
# get_action() scales the network output by the upper bound.
print(reinforce_agent.env.action_space.low,
reinforce_agent.env.action_space.high)

[-2.] [2.]


In [None]:
# Print the layer-by-layer summary of the agent's policy network.
reinforce_agent.model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 3)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               512       
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 17,153
Trainable params: 17,153
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Collect one batch of experience. With return_ts=True the call yields one
# extra trailing value, unpacked here as `time_step`.
obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, last_obs, time_step = reinforce_agent.get_experience_episodes(return_ts=True)

In [None]:
# Inspect the first 10 actions from the collected batch.
print(actions[:10])

[[-0.80621183]
 [-1.12020307]
 [-0.94194067]
 [-0.59158756]
 [-0.88948428]
 [ 0.18423736]
 [ 0.04995022]
 [-1.74832955]
 [ 0.25475455]
 [-0.38058892]]


In [None]:
# Inspect the first 10 predictions from the collected batch, for comparison
# with the actions printed above.
print(preds[:10])

[[-0.37705782]
 [-0.35253835]
 [-0.32969052]
 [-0.29659286]
 [-0.24854067]
 [-0.21147202]
 [-0.16380957]
 [-0.13274129]
 [-0.12833874]
 [-0.13430135]]


In [None]:
# Display the observations collected in the batch.
obs

array([[-6.65765940e-01,  7.46160648e-01, -2.73168970e-01],
       [-6.71918286e-01,  7.40625288e-01,  1.65519743e-01],
       [-6.92135627e-01,  7.21767465e-01,  5.52958247e-01],
       ...,
       [-2.63224072e-01, -9.64734724e-01,  5.76068944e+00],
       [ 4.94307316e-04, -9.99999878e-01,  5.33713839e+00],
       [ 2.24875322e-01, -9.74387546e-01,  4.52641544e+00]])

In [None]:
# Agent under training: MountainCarContinuous-v0 with exploration noise 3.0.
reinforce_agent = ReinforceAgent(
    'MountainCarContinuous-v0',
    n_experience_episodes=1,
    eval_period=5,
    EPISODES=2000,
    epochs=1,
    lr=0.0001,
    algorithm='CONTINUOUS_CAUSAL',
    noise=3.0,
    gif_to_board=True,
)

# Pendulum alternative, kept for quick switching:
# reinforce_agent = ReinforceAgent('Pendulum-v0', n_experience_episodes=1,
#                                  EPISODES=10000, epochs=1, lr=0.0001, algorithm='CONTINUOUS_CAUSAL', noise=2.0)

initial_time = time()
running_variance = RunningVariance()

# Alternate between gathering experience and one fit on that batch until the
# agent's episode counter reaches its EPISODES limit.
while reinforce_agent.episode < reinforce_agent.EPISODES:
    (obs, actions, preds, disc_sum_rews,
     rewards, ep_returns, ep_len, last_obs) = reinforce_agent.get_experience_episodes()

    # Feed every discounted return of the batch into the variance tracker.
    for discounted_return in disc_sum_rews:
        running_variance.add(discounted_return)

    # The discounted returns enter the model as its second input (a column
    # vector); the taken actions are the fit targets.
    advantages = disc_sum_rews.reshape(-1, 1)
    history = reinforce_agent.model.fit(
        [obs, advantages],
        actions,
        batch_size=1,
        epochs=reinforce_agent.epochs,
        verbose=0,
    )
    epoch_loss = history.history['loss'][0]

    reinforce_agent.log_data(
        reinforce_agent.episode,
        epoch_loss,
        np.mean(ep_len),
        reinforce_agent.get_entropy(preds),
        running_variance.get_variance(),
        None,
        time() - initial_time,
        np.mean(ep_returns[-1]),
    )

correr en linea de comando: tensorboard --logdir logs/
Episode: 5
Model on episode 6 improved from -inf to -23.607904508678107. Saved!


t:   1%|▏         | 13/1000 [00:00<00:07, 129.97it/s, now=None]

MoviePy - Building file /tmp/tmpap8582rt.gif with imageio.




Episode: 12
Model on episode 13 improved from -23.607904508678107 to 36.39102148286267. Saved!


t:   3%|▎         | 14/473 [00:00<00:03, 133.52it/s, now=None]

MoviePy - Building file /tmp/tmpn82ighzr.gif with imageio.




Episode: 117

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Run one evaluation episode with the current policy. The commented variant
# additionally passes a GIF filename and an fps value.
# obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len = reinforce_agent.get_eval_episode('MountainCarContinuous.gif', fps=50)
obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len = reinforce_agent.get_eval_episode()

In [None]:
# Plot predictions against the actions taken, using the values returned by
# the get_eval_episode() call above. Labels and a legend are needed because
# the two curves are otherwise indistinguishable.
fig, ax = plt.subplots()
ax.plot(preds, label='preds')
ax.plot(actions, label='actions')
ax.set_xlabel('sample index')
ax.set_ylabel('action value')
ax.set_title('Evaluation episode: preds vs. actions')
ax.legend();  # trailing ';' suppresses the Legend repr in the cell output

In [None]:
# Collect a fresh batch of experience; this overwrites the variables produced
# by the evaluation episode above.
obs, actions, preds, disc_sum_rews, rewards, ep_returns, ep_len, last_obs = reinforce_agent.get_experience_episodes()

In [None]:
# Plot predictions against the actions taken, using the values returned by
# the get_experience_episodes() call above. Labels and a legend are needed
# because the two curves are otherwise indistinguishable.
fig, ax = plt.subplots()
ax.plot(preds, label='preds')
ax.plot(actions, label='actions')
ax.set_xlabel('sample index')
ax.set_ylabel('action value')
ax.set_title('Experience episodes: preds vs. actions')
ax.legend();  # trailing ';' suppresses the Legend repr in the cell output