# Introduction

Developing PPO here. Have to figure out how to do multiple updates.

In [54]:
""" REINFORCE agent with baseline """


import numpy as np
import tensorflow as tf
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras import backend as K
from keras.utils import to_categorical


class Agent:
    
    def __init__(self, input_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = 16
        self.lr = 0.001
        self.gamma = 0.99
        self.EPOCHS = 2
        
        #Agents memory
        self.states = []
        self.actions = []
        self.rewards = []
        self.states_next = []
    
        self.actor = Actor(input_dim, output_dim, self.lr)
        self.critic = Critic(input_dim, output_dim, self.lr)
    
        
    def act(self,state):
        probs = self.actor.model.predict(state)[0]
        actions = list(range(self.output_dim))
        action = np.random.choice(actions, p = probs)
        return action
    
    
    def remember(self, state, action, reward, state_next):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.states_next.append(state_next)
    
    def learn(self):
        
        #Sample 
        S = np.array(self.states)
        A = np.array(self.actions)
        R = np.array(self.rewards)
        S1 = np.array(self.states_next)
        
        #Change A to one-hot
        A_onehot = to_categorical(A, self.output_dim)
        
        #Find advantage
        G = self.find_discounted_return(R)
        V = self.critic.model.predict(S)
        V.resize(len(V))  #spits out a tensor
        
        V1 = self.critic.model.predict(S1)
        V1.resize(len(V1))

        #Adv = R + self.gamma*V1 - V
        Adv = G - V
        
        #Learn: do first epoch, r(theta) = 1 for this
        pi_old = self.actor.model.predict(S)             #pi(s_t)
        pi_old = K.sum(A_onehot*pi_old, axis=1)  #pi(s_t, a_t)
        self.actor.train(S,A_onehot,Adv,pi_old)
        self.critic.train(S,G)
        
        #Then iterate
        for i in range(self.EPOCHS-1):
            pi_curr = self.actor.model.predict(S)             #pi(s_t)
            pi_curr = K.sum(A_onehot*pi_curr, axis=1)  #pi(s_t, a_t)
            
            self.actor.train(S,A_onehot,Adv,pi_old)
            self.critic.train(S,G)
            
            pi_old = pi_curr
            

        #Clear memory
        self.states = []
        self.actions = []
        self.rewards = []
        self.states_next = []
        
        
    def find_discounted_return(self,R):
        R_discounted = np.zeros_like(R)
        running_total = 0
        for t in reversed(range(len(R_discounted))):
            running_total = running_total*self.gamma + R[t]
            R_discounted[t] = running_total
        R_discounted -= np.mean(R_discounted)
        R_discounted /= np.std(R_discounted)
        return R_discounted
    
    
    
#-------------------------------------------------------------------------------------------------------------------

    
    
class Critic:
    def __init__(self,input_dim, output_dim, lr):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.lr = lr
        self.hidden_dim = 32
        self.model = self._build_model()
        self.opt = self.optimizer()
        
    def train(self,S,G):
        self.opt([S,G])
        
        
    def _build_model(self):
        model = Sequential()
        model.add(Dense(self.hidden_dim, input_dim = self.input_dim, activation = 'relu'))
        model.add(Dense(self.hidden_dim, activation = 'relu'))
        model.add(Dense(1, activation = 'linear'))
        model.compile(optimizer=Adam(lr=self.lr), loss='mse')
        return model
    
    
    def optimizer(self):
        """
           L = E_t()
        """
        
        #Placeholders
        S_pl = self.model.input
        V_pl = self.model.output
        G_pl = K.placeholder(name='discounted_return', shape=(None,))
        
        #loss
        loss = K.mean( K.square(V_pl - G_pl) )
        
        #Get updates
        opt = Adam(self.lr)
        pars = self.model.trainable_weights
        updates = opt.get_updates(loss = loss, params = pars)
        
        return K.function(inputs=[S_pl, G_pl], outputs = [], updates = updates)
    
    
#-------------------------------------------------------------------------------------------------------------------

    

class Actor:
    def __init__(self,input_dim, output_dim, lr):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.lr = lr
        self.hidden_dim = 32
        self.alpha = 0.1  #entropy hyperparameter
        self.model = self._build_model()
        self.opt = self.optimizer()
        
        
    def train(self,S, A_onehot, adv, pi_old):
        self.opt([S,A_onehot, adv, pi_old])
        
        
    def _build_model(self):
        model = Sequential()
        model.add(Dense(self.hidden_dim, input_dim = self.input_dim, activation = 'relu'))
        model.add(Dense(self.hidden_dim, activation = 'relu'))
        model.add(Dense(self.output_dim, activation = 'softmax'))
        return model
    
    
    def optimizer(self):
        """
        gradL = - E_{t} * ( Adv(t)*grad_{\theta} log(\pi(s_t, a_t)) )
        
        where E_{t} is the average over an episode
        
        """
        
        #Placeholders
        state_pl = self.model.input
        action_onehot_pl = K.placeholder(name='action_onehot', shape=(None,self.output_dim))
        adv_pl = K.placeholder(name='advantage', shape=(None,))
        pi_old_pl = K.placeholder(name='pi_old', shape=(None,self.output_dim))
        
        #Set up loss
        pi_pl = self.model.output
        pi_new = K.sum(action_onehot_pl*pi_pl, axis=1)
        
        r_vec = pi_new / pi_old_pl
        loss_vec = -r_vec*K.stop_gradient(adv_pl)
        loss_0 = K.mean(loss_vec)
        
        #Add entropy to loss
        entropy = K.mean(pi_pl*K.log(pi_pl))
        
        #Total loss
        loss = loss_0 + self.alpha*entropy
        
        #Get updates
        opt = Adam(self.lr)
        pars = self.model.trainable_weights
        updates = opt.get_updates(loss = loss, params = pars)
        
        return K.function(inputs=[state_pl, action_onehot_pl, adv_pl, pi_old_pl], outputs = [], updates = updates)

In [55]:
import gym
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#Setup
env = gym.make('CartPole-v0')
input_dim, output_dim = env.observation_space.shape[0], env.action_space.n
agent = Agent(input_dim, output_dim)

EPISODES = 10**2
scores = []
for e in range(1,EPISODES+1):
    state = env.reset()
    state = np.reshape(state,[1,  input_dim])
    reward_sum = 0
    done = False
    for time in range(500):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, input_dim])
        agent.remember(state[0], action, reward, next_state[0])
        state = next_state
        reward_sum += reward
        if done:
            break
    agent.learn()
    scores.append(reward_sum)
    if e % 10 == 0:
        print('episode, reward = {}, {}'.format(e,reward_sum))
plt.plot(scores)

InvalidArgumentError: Expected multiples argument to be a vector of length 2 but got length 1
	 [[{{node gradients_42/Mean_66_grad/Tile}}]]

### Multiple updates