In [None]:
import tensorflow as tf
# List the physical devices TensorFlow can see and show them in the cell output.
physical_devices = tf.config.list_physical_devices()
print(physical_devices)
# Disabled: would enable on-demand memory growth for the device at index 1.
# NOTE(review): index 1 presumably is a GPU on the author's machine — confirm before enabling.
# tf.config.experimental.set_memory_growth(physical_devices[1], True)

In [None]:
import numpy as np
import math
import random
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from collections import deque
import gym
import os
from tensorflow.keras.optimizers import Adam

In [None]:
# Create the environment and derive the network input/output widths from it.
env = gym.make("CartPole-v1")
obs = env.reset()
# Read the observation width from the declared observation space instead of
# len(obs): the return shape of reset() differs between gym versions
# (newer releases return an (observation, info) tuple), while the space does not.
input_size = env.observation_space.shape[0]
# Number of discrete actions; used as the actor's output width.
output_size = env.action_space.n


In [None]:
# Hyperparameters read as module-level globals by the Agent class and the training loop.
tau = 0.005              # blend factor for the soft target-network update
gamma = 0.99             # discount applied to the bootstrapped next-state value
l2_decay = 1e-2          # NOTE(review): not referenced anywhere else in the visible notebook
actor_lr = 1e-4          # Adam learning rate for the actor
critic_lr = 1e-3         # Adam learning rate for the critic
memory_size = 1_000_000  # replay-buffer capacity; an int because it is an element count
minibatch_size = 128     # transitions sampled per learning step

num_episodes = 256       # number of training episodes


In [None]:
class Agent:
    """DDPG-style actor-critic agent with a softmax-relaxed discrete action.

    The actor maps a state to `output_size` logits; the softmax of those
    logits is the "action" vector fed to the critic, which scores the
    concatenated (state, action) pair with a single Q-value.

    Reads the module-level globals `input_size`, `output_size`, `actor_lr`,
    `critic_lr`, `memory_size`, `minibatch_size`, `gamma` and `tau`.

    Attributes:
        actor, critic: online networks updated by gradient steps in `learn`.
        stable_actor, stable_critic: target networks, separate models that
            slowly track the online ones via `update_target_weights`.
        memory: bounded replay buffer of
            (state, action, reward, next_state, done) tuples.
    """

    def __init__(self):
        self.actor = self.build_actor()
        self.critic = self.build_critic()

        # `learning_rate` is the supported keyword; the `lr` alias is
        # deprecated and rejected by newer Keras releases.
        self.actor.compile(optimizer = Adam(learning_rate = actor_lr))
        self.critic.compile(optimizer = Adam(learning_rate = critic_lr))

        # Target networks must be distinct model objects. Aliasing the online
        # models would make the soft update a no-op and the TD targets would
        # chase the weights being trained.
        self.stable_actor = self.build_actor()
        self.stable_critic = self.build_critic()
        self.stable_actor.set_weights(self.actor.get_weights())
        self.stable_critic.set_weights(self.critic.get_weights())

        # A bounded deque drops the oldest transition automatically when full.
        self.memory = deque(maxlen = int(memory_size))

    def build_actor(self):
        """Return a fresh state -> action-logits network (256-64-output_size)."""
        inputs = Input(shape = (input_size,))
        x = Dense(256, activation = 'relu')(inputs)
        x = Dense(64, activation = 'relu')(x)
        x = Dense(output_size, activation = "linear")(x)
        model = Model(inputs = inputs, outputs = x)
        return model

    def build_critic(self):
        """Return a fresh (state ++ action) -> Q-value network (256-64-1)."""
        inputs = Input(shape = (input_size + output_size,))
        x = Dense(256, activation = 'relu')(inputs)
        x = Dense(64, activation = 'relu')(x)
        x = Dense(1, activation = 'linear')(x)
        model = Model(inputs = inputs, outputs = x)
        return model

    def remember(self,state,action,reward,next_state,done):
        """Append one transition to the replay buffer.

        When the buffer is at capacity the oldest transition is evicted by
        the deque itself, so no manual shuffling or popping is needed.
        """
        self.memory.append((state,action,reward,next_state,done))

    @tf.function
    def act(self, state, test = False):
        """Return the softmax action vector for a batch of states.

        Args:
            state: batch of states, shape (batch, input_size).
            test: Python bool; when False, Gaussian noise (stddev 0.1) is
                added to the logits for exploration.

        Returns:
            Tensor of shape (batch, output_size) whose rows sum to 1.
        """
        act_value = self.actor(state)
        if not test:
            # tf.shape() is used so the noise shape is valid even when the
            # static batch dimension is unknown during tracing.
            act_value += tf.random.normal(shape = tf.shape(act_value), mean=0.0, stddev=0.1)
        act_value = tf.nn.softmax(act_value)
        return act_value

    def learn(self):
        """Run one critic update, one actor update and a soft target update.

        Samples `minibatch_size` transitions uniformly from the replay
        buffer. Returns without doing anything if the buffer holds fewer
        transitions than one minibatch.
        """
        if len(self.memory) < minibatch_size:
            return

        # random.sample is already uniform, so the buffer needs no shuffling.
        minibatch = random.sample(self.memory, minibatch_size)
        batch_len = len(minibatch)

        states, actions, rewards, next_states, dones = zip(*minibatch)
        states = np.asarray(states, dtype = np.float32)
        next_states = np.asarray(next_states, dtype = np.float32)
        # Each stored action is the (1, output_size) output of `act`;
        # flatten to (batch, output_size).
        actions = np.stack([np.asarray(a, dtype = np.float32) for a in actions])
        actions = actions.reshape(batch_len, -1)
        # Column vectors so they line up with the critic's (batch, 1) output.
        # A flat (batch,) vector would silently broadcast to (batch, batch).
        rewards = tf.convert_to_tensor(
            np.asarray(rewards, dtype = np.float32).reshape(-1, 1))
        not_done = tf.convert_to_tensor(
            1.0 - np.asarray(dones, dtype = np.float32).reshape(-1, 1))

        # TD target from the target networks, computed outside any tape so it
        # is a constant for the critic update. Terminal transitions do not
        # bootstrap.
        next_actions = tf.nn.softmax(self.stable_actor(next_states))
        next_q_values = self.stable_critic(tf.concat([next_states, next_actions], axis = 1))
        target_q_values = rewards + gamma * not_done * next_q_values

        # Critic: minimise the mean squared TD error.
        with tf.GradientTape() as tape:
            pred_q_values = self.critic(tf.concat([states, actions], axis = 1))
            critic_loss = tf.reduce_mean(tf.square(target_q_values - pred_q_values))
        critic_grads = tape.gradient(critic_loss, self.critic.trainable_weights)
        self.critic.optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_weights))

        # Actor: maximise the critic's value of the actor's own actions.
        with tf.GradientTape() as tape:
            policy_actions = tf.nn.softmax(self.actor(states))
            actor_loss = -self.critic(tf.concat([states, policy_actions], axis = 1))
            actor_loss = tf.reduce_mean(actor_loss)
        actor_grads = tape.gradient(actor_loss, self.actor.trainable_weights)
        self.actor.optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_weights))

        self.update_target_weights()

    def update_target_weights(self):
        """Soft-update both targets: target = tau * online + (1 - tau) * target."""
        network_pairs = (
            (self.actor, self.stable_actor),
            (self.critic, self.stable_critic),
        )
        for online, target in network_pairs:
            blended = [
                weight * tau + target_weight * (1-tau)
                for weight, target_weight in zip(online.get_weights(), target.get_weights())
            ]
            target.set_weights(blended)

    def load(self, name):
        """Load weights from `name + "actor.hdf5"` / `name + "critic.hdf5"`.

        The target networks are loaded from the same files as the online
        networks.
        """
        self.actor.load_weights(name + "actor.hdf5")
        self.critic.load_weights(name + "critic.hdf5")
        self.stable_actor.load_weights(name + "actor.hdf5")
        self.stable_critic.load_weights(name + "critic.hdf5")

    def save(self, name):
        """Save the online actor and critic weights under the `name` prefix."""
        self.actor.save_weights(name + "actor.hdf5")
        self.critic.save_weights(name + "critic.hdf5")

In [None]:
# Instantiate the agent; Agent.__init__ builds and compiles the actor and critic networks.
agent = Agent()

In [None]:
# Training: play `num_episodes` episodes, learning after every step once the
# replay buffer holds more than four minibatches of transitions.
score_history = []
best_score = env.reward_range[0]
warmup_transitions = 4 * minibatch_size

for ep in range(num_episodes):
    state, done, score = env.reset(), False, 0
    while not done:
        # The agent expects a batch dimension, so wrap the single state.
        batched_state = np.expand_dims(state, axis=0)
        action = agent.act(batched_state)
        # The environment takes a discrete action: the most probable index.
        chosen_action = np.squeeze(np.argmax(action, axis=1))
        next_state, reward, done, _ = env.step(chosen_action)
        score += reward
        # The full softmax vector (not the index) is what gets stored.
        agent.remember(state, action, reward, next_state, done)
        if len(agent.memory) > warmup_transitions:
            agent.learn()
        state = next_state

    score_history.append(score)
    # Running average over the most recent 100 episodes.
    avg_score = np.mean(score_history[-100:])

    # Checkpoint whenever a single episode beats the best score so far.
    if score > best_score:
        best_score = score
        agent.save("DDPG_")
    print(f"Episode: {ep}, Score: {score}, Avg Score: {avg_score}")

In [None]:
# Reload the weights written by agent.save("DDPG_") in the training cell
# (files DDPG_actor.hdf5 and DDPG_critic.hdf5).
agent.load("DDPG_")

In [None]:
#Testing
# Play one rendered episode with the trained policy and report its length and score.
done = False
state = env.reset()
score=0
t = 0
try:
    while not done:
        env.render()
        # test=True turns off the exploration noise, so the evaluation
        # reflects the learned policy rather than a perturbed one.
        action = agent.act(np.expand_dims(state, axis=0), test=True)
        next_state, reward, done, _ = env.step(np.squeeze(np.argmax(action, axis = 1)))
        score+= reward
        state = next_state
        t += 1

    print(f"Episode ended in {t} steps with score of {score}")
finally:
    # Release the render window even if the episode is interrupted.
    env.close()