### Solving control problem by using Actor Critic algorithm

#### import the neccessary package

In [1]:
import gym
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.layers.merge import Add, Multiply
from keras.optimizers import Adam
import keras.backend as K

import tensorflow as tf

import random
from collections import deque

Using TensorFlow backend.


#### load the environment

In [2]:
env = gym.make('Pendulum-v0').env
env.reset()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


array([-0.17865889,  0.98391107,  0.33466896])

NameError: name 'is_int' is not defined

#### Define Actor Model 

In [9]:
def create_actor_model():    
    state_input = Input(shape=env.observation_space.shape)
    h1 = Dense(24, activation='relu') (state_input)
    h2 = Dense(48, activation='relu') (h1)
    h3 = Dense(24, activation='relu') (h2)
    output = Dense(env.action_space.shape[0], activation='linear') (h3)
    
    model = Model(input= state_input, output = output)
    adam = Adam(lr=0.001)
    model.compile(loss = 'mse', optimizer =adam)
    return state_input, model

In [10]:
actor_state_input, actor_model = create_actor_model()
_, target_actor_model = create_actor_model()

  


In [14]:
actor_critic_grad = tf.placeholder(tf.float32, [None, env.action_space.shape[0]])
actor_model_weights = actor_model.trainable_weights

actor_grads = tf.gradients(actor_model.output, actor_model_weights, -actor_critic_grad)
grads = zip(actor_grads, actor_model_weights)
optimize = tf.train.AdamOptimizer(learning_rate=0.001).apply_gradients(grads)

In [15]:
actor_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 3)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 24)                96        
_________________________________________________________________
dense_2 (Dense)              (None, 48)                1200      
_________________________________________________________________
dense_3 (Dense)              (None, 24)                1176      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 25        
Total params: 2,497
Trainable params: 2,497
Non-trainable params: 0
_________________________________________________________________


#### Define critic model

In [126]:
def create_critic_model():
    state_input = Input(shape=env.observation_space.shape)
    state_h1 = Dense(24, activation='relu') (state_input)
    state_h2 = Dense(48) (state_h1)
    
    action_input = Input(shape=env.action_space.shape)
    action_h1 = Dense(48) (action_input)
    
    merged = Add() ([state_h2, action_h1])
    merged_h1 = Dense(24, activation='relu') (merged)
    output = Dense(1, activation='relu') (merged_h1)
    
    model = Model([state_input, action_input],output)
    
    adam = Adam(lr = 0.001)
    model.compile(loss = "mse", optimizer = adam)
    
    return state_input, action_input, model

In [127]:
critic_state_input, critic_action_input, critic_model = create_critic_model()
_,_,target_critic_model = create_critic_model()

In [128]:
critic_grads = tf.gradients(critic_model.output, critic_action_input)


In [129]:
critic_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           (None, 3)            0                                            
__________________________________________________________________________________________________
dense_39 (Dense)                (None, 24)           96          input_15[0][0]                   
__________________________________________________________________________________________________
input_16 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
dense_40 (Dense)                (None, 48)           1200        dense_39[0][0]                   
__________________________________________________________________________________________________
dense_41 (

#### Training both actor and critic

In [130]:
def train():
    batch_size = 32
    
    if len(memory) < batch_size:
        return
    
    rewards = []
    
    samples = random.sample(memory, batch_size)
    
    _train_critic(samples)
    _train_actor(samples)

In [131]:
def _train_critic(samples):
    
    for sample in samples:
        cur_state, action, reward, new_state, done = sample
        
        if not done:
            target_action = target_actor_model.predict(new_state)
            
            future_reward = target_critic_model.predict([new_state, target_action]) [0][0]
            
            reward += gamma * future_reward
            
            critic_model.fit([cur_state, action], reward,verbose=0)

In [132]:
def _train_actor(samples):
    for sample in samples:
        cur_state, action, reward, new_state, _ = sample
        
        predicted_action = actor_model.predict(cur_state)
        
        with tf.Session() as sess:
            grads = sess.run(critic_grads, feed_dict= {critic_state_input: cur_state, critic_action_input: predicted_action})[0]
            
            sess.run(optimize, feed_dict= {actor_state_input:cur_state, actor_critic_grad:grads})

In [133]:
actor_model.save_weights('./rand_actor')
critic_model.save_weights('./rand_critic')

In [134]:
episode = 1
max_episode_length = 500
gamma = 0.9

memory = deque(maxlen = 2000)

In [135]:
for i in range(episode):
    
    #replace the experience replay 
    if ((i+1) % 25) == 0:
        episode_experience = episode_experience[-2000:]
        
        #replace old network with new network
        actor_model.save_weights('./actor_weight')
        
        target_actor_model.load_weights('./actor_weight')
        
        actor_model.load_weights('./rand_actor')
        
        #replace old network with new network
        critic_model.save_weights('./critic_weight')
        
        target_critic_model.load_weights('./critic_weight')
        
        critic_model.load_weights('./rand_critic')
        
    
    epsilon = 1 / (np.log(i+10))
    
    S = env.reset()
    S = S.reshape((1,env.observation_space.shape[0]))
    S = S[0]
    
    A = env.action_space.sample()
    A = A.reshape((1,env.action_space.shape[0]))

    for j in range(max_episode_length):
        
        S_new, R, done, _ = env.step([A])
        
        S_new = S_new.reshape((1,env.observation_space.shape[0]))
        
        if R < -.5:
            R = -1
        elif done:
            R = 1
        else :
            R = -0.5
       
        
        #chose action which is maximum value 
        A_new = actor_model.predict(S_new)
        A_new = A_new.reshape((1,env.action_space.shape[0]))
        
        
        #epsilon - greedy 
        rand = np.random.uniform([0])
        
        if rand[0] > epsilon:
            A_new = A_new
        else :
            A_new = np.random.normal(0,1)
        
        memory.append([S,A,R,S_new,done])
        
        train()
            
        S = S_new
        A = A_new
        
        if done:
            env.reset()
            
            S = env.reset()
            S = S[0]
    
            A = np.random.normal(0,1)
    

AttributeError: 'float' object has no attribute 'ndim'

In [136]:
samples = random.sample(memory,32)

In [137]:
for sample in samples:
    cur_state, action, reward, new_state, done = sample
        
    if not done:
        target_action = target_actor_model.predict(new_state)
            
        future_reward = target_critic_model.predict([new_state, target_action]) [0][0]
            
        reward += gamma * future_reward
        
        critic_model.fit([cur_state, action], reward,verbose=0)

AttributeError: 'float' object has no attribute 'ndim'

In [138]:
target_critic_model.fit([cur_state,action],np.array(reward),epochs=1)

AttributeError: 'float' object has no attribute 'ndim'

In [122]:
target_critic_model.predict([cur_state,action])[0][0]

-0.04812424