### Introduction

Here I'm developing code from the A2C algorithm. I'm following the code from 

https://github.com/germain-hug/Deep-RL-Keras/blob/master/A2C/

In [40]:
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam 
from keras.optimizers import RMSprop
from keras import backend as K
from keras.utils import to_categorical


#Sub classes
from actor import Actor
from critic import Critic


class Agent:
       
    def __init__(self,input_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.actions = range(output_dim)  
        self.lr = 0.001
        self.gamma = 0.999
        
        
        #Make actor and critic models
        self.actor = Actor(input_dim,output_dim,self.lr)
        self.critic = Critic(input_dim,output_dim, self.lr)
        
        self.train_actor = self.actor.optimizer()
        self.train_critic = self.critic.optimizer()
            
    
    def train_models(self,state,action,reward,next_state):
                
        #Put into right format (as tensors)
        action_onehot = to_categorical(action,self.output_dim)  #easier to work with one-hot
        actions = np.array([action_onehot]) 
        rewards = np.array([reward])
            
        #Compute inputs for the optimizers
        value_state = self.critic.model.predict(state)
        value_next_state = self.critic.model.predict(next_state)
        
        advantages = reward + self.gamma*value_next_state[0] - value_state[0]
        
        #Do the training
        self.train_actor([state,actions,advantages])
        self.train_critic([state,advantages])
        
        
    def find_discounted_return(self,rewards):
        R = np.zeros_like(rewards)
        rolling_sum = 0
        for t in reversed(range(len(R))):
            rolling_sum = rolling_sum*self.gamma + rewards[t]
            R[t] = rolling_sum
        return np.array(R)
    
        

    def act(self, state):
        """ Choose action according to softmax """
        
        probs =  self.actor.model.predict(state)[0]
        action = np.random.choice(self.actions, p=probs)
        return action

### Test

In [42]:
import numpy as np
import matplotlib.pyplot as plt
import gym
%matplotlib inline


EPISODES = 1000

env = gym.make('CartPole-v1')
num_states = env.observation_space.shape[0]
num_actions = env.action_space.n
agent = Agent(num_states, num_actions)
agent.gamma = 0.999
# agent.load("./save/cartpole-dqn.h5")
done = False

scores = []

for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, num_states])
    reward_sum = 0
    for time in range(500):
        
        #Do main step
        # env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10
        reward_sum += reward
        next_state = np.reshape(next_state, [1, num_states])
        
        #Learn
        agent.train_models(state,action,reward,next_state)
        state = next_state
        if done:
            break
    
    #Learn & print results
    scores.append(reward_sum)
    if e % 50 == 0:
        print '(episode, score) = ' + str((e,reward_sum))

[33mWARN: gym.spaces.Box autodetected dtype as <type 'numpy.float32'>. Please provide explicit dtype.[0m
(episode, score) = (0, 4.0)
(episode, score) = (50, 18.0)
(episode, score) = (100, 11.0)
(episode, score) = (150, 136.0)
(episode, score) = (200, 38.0)
(episode, score) = (250, 20.0)
(episode, score) = (300, 159.0)
(episode, score) = (350, 45.0)
(episode, score) = (400, 28.0)
(episode, score) = (450, 489.0)
(episode, score) = (500, 114.0)
(episode, score) = (550, 187.0)
(episode, score) = (600, 94.0)
(episode, score) = (650, 3.0)
(episode, score) = (700, 239.0)
(episode, score) = (750, 146.0)
(episode, score) = (800, 141.0)
(episode, score) = (850, 117.0)
(episode, score) = (900, 3.0)
(episode, score) = (950, 18.0)


Huh -- it seems to be worse!