In [1]:
import gym
import sys
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, concatenate
from collections import deque
import matplotlib.pyplot as plt

In [2]:
class Actor(tf.keras.Model):
    def __init__(self, state_size, action_size):
        super(Actor, self).__init__()
        self.fc = Dense(64, activation='relu')
        self.out= Dense(action_size, activation='softmax', kernel_initializer=tf.keras.initializers.RandomUniform(-1e-3,1e-3))
        # self.build(input_shape=[(None,) + state_size])
        # self.summary()

    def call(self, x):
        x      = self.fc(x)
        policy = self.out(x)
        return policy

class Critic(tf.keras.Model):
    def __init__(self, state_size, action_size):
        super(Critic, self).__init__()
        # self.s1 = Dense(16, activation='relu')
        # self.a1 = Dense(16, activation='relu')
        self.fc1= Dense(32, activation='relu')
        self.fc2= Dense(16, activation='relu')
        self.out= Dense(1, kernel_initializer=tf.keras.initializers.RandomUniform(-1e-3,1e-3))

        # self.build(input_shape=[(None,) + state_size, (None,) + action_size])
        # self.summary()

    def call(self,x):
        states = x[0]
        actions = np.transpose([x[1]])
        x = tf.concat([states,actions],axis=1)
        x = self.fc1(x)
        x = self.fc2(x)
        q = self.out(x)
        # x      = tf.keras.layers.concatenate([states, actions], axis=1)
        # x      = self.fc(x)
        # q_value= self.out(x)
        return q

In [3]:
class DDPGAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size= action_size
        
        # Hyper params for learning
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.tau = 0.001
        
        # Experience Replay
        self.batch_size = 64
        self.train_start = 1000
        self.memory = deque(maxlen=10000)
        
        self.actor         = Actor(self.state_size, self.action_size)
        self.critic        = Critic(self.state_size, self.action_size)
        self.target_actor  = Actor(self.state_size, self.action_size)
        self.target_critic = Critic(self.state_size, self.action_size)
        
        self.optimizer = tf.keras.optimizers.Adam(lr=self.learning_rate)
        
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def update_target_model(self):
        # new_target_actor_weight  = self.tau * self.array(self.actor.get_weight())  + (1 - self.tau) * self.array(self.target_actor.get_weight())
        # new_target_critic_weight = self.tau * self.array(self.critic.get_weight()) + (1 - self.tau) * self.array(self.target_critic.get_weight())

        # self.target_actor.set_weight( new_target_actor_weight )
        # self.target_critic.set_weight(new_target_critic_weight)
        
    # def update_target(self,tau=1.0):
        # tau = max(0.0, min(tau, 1.0))
        tau = self.tau
        for (target_net, net) in zip(   self.actor.trainable_variables,
                                        self.target_actor.trainable_variables):
            target_net.assign(tau * target_net + (1.0 - tau) * net)
        for (target_net, net) in zip(   self.critic.trainable_variables,
                                        self.target_critic.trainable_variables):
            target_net.assign(tau * target_net + (1.0 - tau) * net)

    def get_action(self, state):
        policy = self.actor(state)
        policy = np.array(policy[0])
        return np.random.choice(self.action_size, 1, p=policy)[0]
        
    def train_model(self, state, action, reward, next_state, done):
        # Train from Experience Replay
        if len(self.memory) < self.train_start:
            return 0.0
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        mini_batch = random.sample(self.memory, self.batch_size)
        
        states      = np.array([sample[0][0] for sample in mini_batch])
        actions     = np.array([sample[1] for sample in mini_batch])
        rewards     = np.array([sample[2] for sample in mini_batch])
        next_states = np.array([sample[3][0] for sample in mini_batch])
        dones       = np.array([sample[4] for sample in mini_batch])
        
        critic_params = self.critic.trainable_variables
        with tf.GradientTape() as tape:
            # critic network
            print(states)
            print(actions)
            # print(np.transpose([actions]))
            # print(tf.concat([states,actions])],axis=1))
            critic_values  = self.critic([states, actions])
            # Target actor network
            target_actions = self.target_actor(next_states)
            print(target_actions)
            # Target critic network
            target_critic_values  = self.target_critic([next_states, target_actions])
            # Target Value
            target_value = rewards + (1 - dones) * target_critic_values[0]
            # TD-error and Critic Loss
            critic_loss = tf.reduce_mean(tf.square(critic_values - target_value))
            
        critic_grads = tape.gradient(critic_loss, critic_params)
        self.optimizer.apply_gradients(zip(critic_grads, critic_params))

        actor_params  = self.actor.trainable_variables
        with tf.GradientTape() as tape:
            # actor network
            actions_new = self.actor(states)
            # critic network
            critic_values  = self.critic([states, actions_new])
            # actor network
            actor_loss = -tf.reduce_mean(critic_values)
            
        actor_grads = tape.gradient(actor_loss, actor_params)
        self.optimizer.apply_gradients(zip(actor_grads, actor_params))

        return critic_loss
    
        

In [4]:
%matplotlib tk
if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DDPGAgent(state_size, action_size)

    scores, episodes, losses = [], [], []
    score_avg = 0
    
    end = False
    
    fig = plt.figure(1)
    fig.clf()
    
    num_episode = 2000
    for e in range(num_episode):
        done = False
        score = 0
        loss_list = []
        
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        
        while not done:
            # env.render()

            action = agent.get_action(state)

            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            score += reward
            reward = 0.1 if not done or score == 500 else -1

            agent.remember(state, action, reward, next_state, done)

            loss = agent.train_model(state, action, reward, next_state, done)
            loss_list.append(loss)

            state = next_state
            if done:
                agent.update_target_model()
                
                score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
                print('epi: {:3d} | score avg {:3.2f} | mem length: {:4d} | epsilon: {:.4f} | loss: {:.4f}'
                      .format(e, score_avg, len(agent.memory), agent.epsilon, np.mean(loss_list)))

                scores.append(score_avg)
                episodes.append(e)
                losses.append(np.mean(loss_list))
                plt.subplot(211)
                plt.plot(episodes, scores, 'b')
                plt.xlabel('episode')
                plt.ylabel('average score')
                plt.title('cartpole DDPG')
                
                plt.subplot(212)
                plt.plot(episodes, losses, 'b')
                plt.xlabel('episode')
                plt.ylabel('loss')
                
                plt.savefig('./save_model/cartpole_ddpg.png')

                if score_avg > 400:
                    agent.model.save_weights('./save_model/cartpole_ddpg', save_format='tf')
                    end = True
                    break
        if end == True:
            np.save('./save_model/cartpole_ddpg_epi',episodes)
            np.save('./save_model/cartpole_ddpg_score',scores)
            np.save('./save_model/cartpole_ddpg_loss',losses)
            env.close()
            print("End")
            break

epi:   0 | score avg 19.00 | mem length:   19 | epsilon: 1.0000 | loss: 0.0000
epi:   1 | score avg 21.30 | mem length:   61 | epsilon: 1.0000 | loss: 0.0000
epi:   2 | score avg 20.67 | mem length:   76 | epsilon: 1.0000 | loss: 0.0000
epi:   3 | score avg 20.10 | mem length:   91 | epsilon: 1.0000 | loss: 0.0000
epi:   4 | score avg 20.49 | mem length:  115 | epsilon: 1.0000 | loss: 0.0000
epi:   5 | score avg 21.14 | mem length:  142 | epsilon: 1.0000 | loss: 0.0000
epi:   6 | score avg 21.13 | mem length:  163 | epsilon: 1.0000 | loss: 0.0000
epi:   7 | score avg 20.62 | mem length:  179 | epsilon: 1.0000 | loss: 0.0000
epi:   8 | score avg 20.35 | mem length:  197 | epsilon: 1.0000 | loss: 0.0000
epi:   9 | score avg 20.62 | mem length:  220 | epsilon: 1.0000 | loss: 0.0000
epi:  10 | score avg 20.76 | mem length:  242 | epsilon: 1.0000 | loss: 0.0000
epi:  11 | score avg 20.28 | mem length:  258 | epsilon: 1.0000 | loss: 0.0000
epi:  12 | score avg 19.15 | mem length:  267 | epsi

InvalidArgumentError: ConcatOp : Ranks of all input tensors should match: shape[0] = [64,4] vs. shape[1] = [2,64,1] [Op:ConcatV2] name: concat

In [None]:
    # num_episode = 1000
    # for e in range(num_episode,2*num_episode):
    #     done = False
    #     score = 0
    #     loss_list = []
        
    #     state = env.reset()
    #     state = np.reshape(state, [1, state_size])
        
    #     while not done:
    #         env.render()

    #         action = agent.get_action(state)

    #         next_state, reward, done, info = env.step(action)
    #         next_state = np.reshape(next_state, [1, state_size])

    #         score += reward
    #         reward = 0.1 if not done or score == 500 else -1

    #         loss = agent.train_model(state, action, reward, next_state, done)
    #         loss_list.append(loss)

    #         state = next_state
    #         if done:
                
    #             score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
    #             print('epi: {:3d} | score avg {:3.2f} | loss: {:.4f}'.format(e, score_avg, np.mean(loss_list)))

    #             scores.append(score_avg)
    #             episodes.append(e)
    #             losses.append(np.mean(loss_list))
    #             plt.subplot(211)
    #             plt.plot(episodes, scores, 'b')
    #             plt.xlabel('episode')
    #             plt.ylabel('average score')
    #             plt.title('cartpole A2C')
                
    #             plt.subplot(212)
    #             plt.plot(episodes, losses, 'b')
    #             plt.xlabel('episode')
    #             plt.ylabel('loss')
                
    #             plt.savefig('./save_model/cartpole_a2c.png')

    #             if score_avg > 400:
    #                 agent.model.save_weights('./save_model/cartpole_a2c', save_format='tf')
    #                 end = True
    #                 break
    #     if end == True:
    #         np.save('./save_model/cartpole_a2c_epi',episodes)
    #         np.save('./save_model/cartpole_a2c_score',scores)
    #         np.save('./save_model/cartpole_a2c_loss',losses)
    #         env.close()
    #         print("End")
    #         break