### Entropy

In [1]:
import gym, time
import numpy as np
import matplotlib.pyplot as plt
from agent import Agent
from collections import Counter
import random
%matplotlib inline

#Environment
seed = 0
env = gym.make('CartPole-v0')
env.seed(seed)  # for comparison
num_states = env.observation_space.shape[0]
num_actions = env.action_space.n

#Agent
gamma, lr, tau = 0.99, 0.001, 0.0001
agent = Agent(num_states, num_actions, lr, gamma, seed_num = seed)
agent.memory_size = 10**4
agent.batchsize = 32
learning_start = 2000
agent.tau = tau

#Train
EPISODES = 20
scores = []
t1 = time.time()
for e in range(1,EPISODES+1):
    state = env.reset()
    reward_sum = 0
    done = False
    steps = 0
    actions = []
    
    while not done:
        #env.render()
        state = np.reshape(state, [1, num_states])  #reshape for keras
        action_onehot = agent.act(state)
        action_scalar = np.dot(action_onehot,range(num_actions))
        actions.append(action_scalar)
        next_state, reward, done, _ = env.step(action_scalar)
        reward_sum += reward
        agent.remember(state[0], action_onehot, reward, next_state, done)
        state = next_state
        
        if len(agent.memory) > learning_start:
            agent.train_models()
            agent.actor.gumbel_temperature = max(0.999*agent.actor.gumbel_temperature, 0.1)
        steps += 1
    
    #Learn & print results
    scores.append(reward_sum)
    if e % 10 == 0:
        t2 = time.time()
        print '(episode, score, T (mins)) = ' + str((e,reward_sum, (t2-t1)/60.0))
        print 'actions counts = ' + str(Counter(actions)) + '\n'

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


[33mWARN: gym.spaces.Box autodetected dtype as <type 'numpy.float32'>. Please provide explicit dtype.[0m
Tensor("dense_3/BiasAdd:0", shape=(?, 2), dtype=float32)
(episode, score, T (mins)) = (10, 27.0, 0.0022733171780904135)
actions counts = Counter({1: 18, 0: 9})

(episode, score, T (mins)) = (20, 16.0, 0.0035917162895202637)
actions counts = Counter({1: 11, 0: 5})



In [3]:
#Do experience replay
if len(agent.memory) < agent.batchsize:
    minibatch = agent.memory
else:
    minibatch = random.sample(agent.memory,agent.batchsize)


#Actor update
states, actions = agent.extract_from_batch(minibatch)
grad_actions = agent.critic.find_action_grads([states,actions])[0]
agent.actor.learn(states,grad_actions)
agent.soft_update_target_network(agent.actor)

#Critic update
agent.critic.learn(minibatch)
agent.soft_update_target_network(agent.critic)

In [20]:
import numpy as np
import tensorflow as tf
from keras.initializers import RandomUniform
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Reshape, LSTM, Lambda, BatchNormalization, GaussianNoise, Flatten
from keras import backend as K
from keras.optimizers import Adam 

def GumbelNoise(logits):
    """ Adds gumbels noise to the logits
        I generate the gumbel noise by 
        applying the inverse CDF to uniform
        noise. 

        The inverse CDF of the gumbel is
        -log( -log(x) )

    """
    U = K.random_uniform(K.shape(logits), 0, 1)
    y = logits - K.log(-K.log(U + 1e-20) + 1e-20) # logits + gumbel noise
    return y

num_states = env.observation_space.shape[0]
num_actions = env.action_space.n
gumbel_temp = 0.1

#Usual  2-layer MLP with parameter noise (should I leave out the parameter noise?)
inp = Input(shape = (num_states,))
x = Dense(64, activation='relu')(inp)
x = GaussianNoise(1.0)(x)
x = Dense(64, activation='relu')(x)
x = GaussianNoise(1.0)(x)
logits = Dense(num_actions, kernel_initializer=RandomUniform())(x)

# Now do the softmax gumbel trick: (which outputs a one-hot vector)
# Apply softmax to (g_i + logits) / temperate
# where g_i is gumbel noise, and temperature is a 
# softness par (when small, almost exactly a one-hot vec)
z = Lambda(GumbelNoise)(logits)    #add noise
z = Lambda(lambda x: x / gumbel_temp)(z) #divide by temperature
out = Dense(num_actions, activation='softmax')(z)  #then softmax
model = Model(inp, out)

In [38]:
#Inputs
state_pl = model.input
action_grads_pl = K.placeholder(shape=(None,1))  

#Find grad_(pars) mu(state)
mu_pl = model.output     
entropy = K.sum(model.output * K.log(model.output + 1e-10), axis=1)
#loss = entropy - mu_pl 
#loss = 1000*entropy
pars = model.trainable_weights
#pars_grad_mu = tf.gradients(loss, pars, action_grads_pl)
func = K.function(inputs = [state_pl, action_grads_pl], outputs = [entropy, loss])

[entropy, loss] = func([states, grad_actions])
entropy, loss

#opt = Adam(lr)
#loss = pars_grad_mu  #placeholder, I won't use it
#updates = opt.get_updates(loss = loss, params = pars, grads = pars_grad_mu)

#func =  K.function(inputs = [state_pl, action_grads_pl], outputs = [], updates = updates)

InvalidArgumentError: Incompatible shapes: [32] vs. [32,2]
	 [[Node: sub_59 = Sub[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](Sum_9, dense_40/Softmax)]]

In [39]:
func([states, grad_actions])

InvalidArgumentError: Incompatible shapes: [32] vs. [32,2]
	 [[Node: sub_59 = Sub[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](Sum_9, dense_40/Softmax)]]