In [None]:
!pip install --upgrade pip
!pip install mujoco
!pip install gymnasium[mujoco]
!pip install --upgrade tensorflow
!pip install highway-env

In [None]:
from highway_env.envs.common.action import ContinuousAction

In [None]:
import warnings

# Disable all warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
import gymnasium as gym
import random
from collections import deque
import numpy as np
import pdb
import datetime
import wandb
import os
import pdb

from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, BatchNormalization, Lambda, Dropout, Conv2D, Flatten
from tensorflow.keras.losses import mean_squared_error
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.initializers import glorot_normal, RandomUniform
from tensorflow.keras.regularizers import l2

tf.keras.utils.disable_interactive_logging()

In [None]:
# Gymnasium environment id; passed to gym.make() and used in the wandb run name.
ENV_NAME = "parking-v0"
# L2 kernel regularizer (coefficient 0.01) attached to the critic's Conv2D/Dense layers.
KER_REG  = l2(0.01)
# Shape of a single state fed to the actor and critic networks: a 128x128 image with
# 3 channels (CONFIG requests 128x128 grayscale frames with stack_size 3, and
# Agent.train transposes each observation with axes (1, 2, 0)).
OBSERVATION_SHAPE = (128, 128, 3)

In [None]:
# Authenticate with Weights & Biases without embedding the API key in the notebook.
# The key is read from the WANDB_API_KEY environment variable; when the variable is
# not set, key=None is passed and wandb uses its own credential lookup instead.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
# Module-level run handle; the training loop logs rewards and losses through it.
run = wandb.init(project="Deep-Learning-Project", name=f"{ENV_NAME} DDPG", resume=None)

## Replay Buffer

In [None]:
class ReplayBuffer:
    """Bounded FIFO store of (state, action, reward, next_state, done) transitions."""

    def __init__(self, buffer_size):
        """Create an empty buffer that keeps at most `buffer_size` transitions."""
        self.buffer_size = buffer_size
        # A deque with maxlen drops its oldest entry when a new one is appended at capacity.
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        """Append a single transition to the buffer."""
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Draw `batch_size` distinct stored transitions uniformly at random.

        Returns:
            A 5-tuple of numpy arrays (states, actions, rewards, next_states, dones),
            each stacking the corresponding field of the drawn transitions.

        Raises:
            ValueError: propagated from `random.sample` when `batch_size` is larger
                than the number of stored transitions.
        """
        drawn = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one column per field.
        states, actions, rewards, next_states, dones = (
            np.asarray(column) for column in zip(*drawn)
        )
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

    def is_full(self):
        """Return True once the buffer holds exactly `buffer_size` transitions."""
        return len(self) == self.buffer_size

    def clear(self):
        """Remove every stored transition."""
        self.buffer.clear()

In [None]:
def custom_initializer(fan_in):
    """Return a uniform Keras initializer over [-1/sqrt(fan_in), 1/sqrt(fan_in)].

    Args:
        fan_in: value whose inverse square root sets the half-width of the range.

    Returns:
        A `tf.keras.initializers.RandomUniform` instance with symmetric bounds.
    """
    bound = 1.0 / np.sqrt(fan_in)
    uniform_range = {"minval": -bound, "maxval": bound}
    return tf.keras.initializers.RandomUniform(**uniform_range)


## Actor Network

In [None]:
class Actor:
  """Policy network of the DDPG agent: maps an image state to a tanh-squashed action."""

  def __init__(self, actor_lr, observation_shape, action_shape, action_bound):
    """Build the policy model and its Adam optimizer.

    Args:
      actor_lr: learning rate for the Adam optimizer.
      observation_shape: shape of one state, used as the model's input shape.
      action_shape: shape of the action space; only its first entry is used, as
        the number of output units.
      action_bound: stored on the instance; the model built here does not use it.
    """
    self.observation_shape = observation_shape
    self.action_shape = list(action_shape)[0]
    self.opt = Adam(learning_rate=actor_lr)
    self.action_bound = action_bound
    self.model = self.create_model()

  def create_model(self):
    """Assemble the convolutional policy network as a Keras Sequential model.

    Layout: input batch-norm, three 3x3 'same'-padded conv layers (32 filters, ReLU),
    flatten, two 200-unit ReLU dense layers, and a tanh output layer with one unit
    per action dimension. Every hidden layer is followed by batch normalization.
    """
    policy = Sequential()

    policy.add(Input(shape=self.observation_shape))
    policy.add(BatchNormalization())

    # Initializer fan-ins per conv layer: 27 = 3*3*3 and 288 = 3*3*32, i.e. kernel
    # area times input channels when the observation has 3 channels.
    for conv_fan_in in (27, 288, 288):
      policy.add(Conv2D(32, (3, 3), activation='relu', padding='same',
                        kernel_initializer=custom_initializer(conv_fan_in)))
      policy.add(BatchNormalization())

    policy.add(Flatten())

    # NOTE(review): the first dense layer is initialized with fan_in=288 although its
    # input is the flattened feature map — confirm whether that value is intended.
    for dense_fan_in in (288, 200):
      policy.add(Dense(200, activation='relu',
                       kernel_initializer=custom_initializer(dense_fan_in)))
      policy.add(BatchNormalization())

    # Very small output weights keep the initial tanh outputs close to zero.
    policy.add(Dense(self.action_shape, activation='tanh',
                     kernel_initializer=RandomUniform(minval=-0.0003, maxval=0.0003)))

    return policy


  def get_weights(self):
    """Return the model's weights as a list of arrays."""
    return self.model.get_weights()


  def set_weights(self, weights):
    """Load `weights` (same layout as `get_weights`) into the model."""
    return self.model.set_weights(weights)


  def predict(self, states):
    """Run the policy in inference mode on a batch of states."""
    return self.model(states, training=False)


  def train(self, states, critic):
    """Take one optimizer step that increases the critic's value of the policy's actions.

    Args:
      states: batch of states.
      critic: object exposing `predict(states, actions)` that returns Q-values.

    Returns:
      The actor loss, i.e. the negated mean Q-value of the predicted actions.
    """
    with tf.GradientTape() as tape:
      proposed_actions = self.model(states, training=True)
      q_values = critic.predict(states, proposed_actions)
      actor_loss = -tf.math.reduce_mean(q_values)

    policy_variables = self.model.trainable_variables
    policy_grads = tape.gradient(actor_loss, policy_variables)
    self.opt.apply_gradients(zip(policy_grads, policy_variables))
    return actor_loss

## Critic Network

In [None]:
class Critic:
  """Q-network of the DDPG agent: scores a (state, action) pair with a single value."""

  def __init__(self, critic_lr, observation_shape, action_shape):
    """Build the Q-value model and its Adam optimizer.

    Args:
      critic_lr: learning rate for the Adam optimizer.
      observation_shape: shape of one state (input of the convolutional branch).
      action_shape: shape of one action (input of the action branch).
    """
    self.observation_shape = observation_shape
    self.action_shape = action_shape
    # Not used to build the model; kept so the attribute remains available to any
    # external code that reads it.
    self.input_shape = tuple(x+y for x, y in zip(observation_shape, action_shape))
    self.opt = Adam(learning_rate=critic_lr)
    self.model = self.create_model()

  def create_model(self):
    """Assemble the two-branch Q-network with the Keras functional API.

    The state branch is three 3x3 conv layers followed by a 100-unit dense layer;
    the action branch is a 100-unit dense layer. Both branches are concatenated and
    passed through a 200-unit dense layer to a single linear output.

    NOTE(review): compared with the previous graph, the action batch-normalization
    is now really part of the model and the merge is a Concatenate layer, so
    checkpoints written before this change may not restore cleanly — verify.
    """

    # State branch
    state_input = Input(self.observation_shape)
    normalized_input = BatchNormalization()(state_input)

    s0 = Conv2D(32, (3, 3), activation='relu', kernel_initializer=custom_initializer(27), kernel_regularizer=KER_REG)(normalized_input)
    s0 = BatchNormalization()(s0)
    s1 = Conv2D(32, (3, 3), activation='relu', kernel_initializer=custom_initializer(288), kernel_regularizer=KER_REG)(s0)
    s1 = BatchNormalization()(s1)
    s2 = Conv2D(32, (3, 3), activation='relu', kernel_initializer=custom_initializer(288), kernel_regularizer=KER_REG)(s1)
    s2 = BatchNormalization()(s2)

    flatten = Flatten()(s2)

    fc = Dense(100, activation='relu', kernel_initializer=custom_initializer(150), kernel_regularizer=KER_REG)(flatten)
    fc = BatchNormalization()(fc)

    # Action branch. The Input tensor keeps its own name: it is what the Model must
    # receive as an input, so it must not be overwritten by the normalized tensor.
    action_input = Input(self.action_shape)
    normalized_action = BatchNormalization()(action_input)

    a1 = Dense(100, activation='relu', kernel_initializer=custom_initializer(100), kernel_regularizer=KER_REG)(normalized_action)
    a1 = BatchNormalization()(a1)

    # Merge with a Keras layer: raw TensorFlow ops such as tf.concat cannot be applied
    # to symbolic Keras tensors in Keras 3.
    fc1 = tf.keras.layers.Concatenate(axis=-1)([fc, a1])
    fc1 = BatchNormalization()(fc1)

    fc2 = Dense(200, activation='relu', kernel_initializer=custom_initializer(100), kernel_regularizer=KER_REG)(fc1)

    output = Dense(1, activation='linear', kernel_initializer=RandomUniform(minval=-0.0003, maxval=0.0003))(fc2)

    return tf.keras.Model([state_input, action_input], output)


  def get_weights(self):
    """Return the model's weights as a list of arrays."""
    return self.model.get_weights()


  def set_weights(self, weights):
    """Load `weights` (same layout as `get_weights`) into the model."""
    return self.model.set_weights(weights)


  def q_grads(self, states, actions):
    """Return the gradient of the Q-values with respect to `actions`.

    `actions` is explicitly watched on the tape, so it is expected to be a tensor.
    """
    with tf.GradientTape() as tape:
        tape.watch(actions)
        q_values = self.model([states, actions], training=False)
    grads = tape.gradient(q_values, actions)
    return grads


  def predict(self, states, actions):
    """Evaluate Q(states, actions) in inference mode."""
    input_data = [states, actions]
    return self.model(input_data, training=False)


  def train(self, states, actions, target_critic):
    """Take one optimizer step on the mean squared error against `target_critic`.

    Args:
      states: batch of states.
      actions: batch of actions taken in those states.
      target_critic: target Q-values, one per sample.

    Returns:
      The mean squared error between the targets and the predicted Q-values.
    """
    with tf.GradientTape() as tape:
      # Drop the trailing unit axis of the (batch, 1) output so that it lines up with
      # the per-sample targets.
      policy_critic = tf.squeeze(self.model([states, actions], training=True), axis=-1)

      # NOTE(review): the kernel_regularizer penalties are collected by Keras in
      # self.model.losses and are not added to this loss, so they currently do not
      # influence the update — confirm whether that is intended.
      loss = mean_squared_error(target_critic, policy_critic)

    gradient = tape.gradient(loss, self.model.trainable_variables)
    self.opt.apply_gradients(zip(gradient, self.model.trainable_variables))
    return loss

In [None]:
# highway-env configuration shared by every environment created in this notebook:
# continuous actions and stacked grayscale image observations.
CONFIG = {
    "action": {
        "type": "ContinuousAction",
    },
    "observation": {
       "type": "GrayscaleObservation",
       "observation_shape": (128, 128),
       "stack_size": 3,
       "weights": [0.2989, 0.5870, 0.1140],  # weights for RGB conversion
    },
    "render_agent": False,
}

env = gym.make(ENV_NAME)
# gym.make() returns the environment wrapped in Gymnasium wrappers. configure() is
# defined on the underlying highway-env environment, and Gymnasium 1.x wrappers no
# longer forward unknown attributes, so reach the environment through `.unwrapped`.
env.unwrapped.configure(CONFIG)
# Used by Agent to read the shape of the action space via `.space()`.
# NOTE(review): the constructor receives the environment id string rather than an
# environment object — confirm this is acceptable for the installed highway-env.
continuous_env = ContinuousAction(ENV_NAME)

class Agent:
  """DDPG agent: owns the environment, replay buffer, actor/critic networks and their
  target copies, checkpointing, and the training loop."""

  def __init__(self, env_name, actor_lr, critic_lr, batch_size, buffer_size, tau, gamma, train_start, start_steps):
    """Create the environment, networks, replay buffer and checkpoint managers.

    Args:
      env_name: Gymnasium environment id; also used as the checkpoint root folder.
      actor_lr: learning rate of the actor optimizer.
      critic_lr: learning rate of the critic optimizer.
      batch_size: number of transitions sampled per training step.
      buffer_size: capacity of the replay buffer.
      tau: interpolation factor of the soft target-network update.
      gamma: discount factor.
      train_start: number of global steps to collect before training begins.
      start_steps: number of initial global steps that use uniformly random actions
        (skipped when a checkpoint was restored).
    """

    self.env = gym.make(env_name)
    # configure() is defined on the underlying highway-env environment; Gymnasium 1.x
    # wrappers no longer forward unknown attributes, so go through `.unwrapped`.
    self.env.unwrapped.configure(CONFIG)

    self.observation_shape = OBSERVATION_SHAPE
    self.action_shape = continuous_env.space().shape

    self.tau = tau
    self.gamma = gamma
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.train_start = train_start
    self.start_steps = start_steps

    # Set to True by restore() when at least one checkpoint was loaded.
    self.continue_training = False

    # Initialize Replay Buffer
    self.replay_buffer = ReplayBuffer(buffer_size)

    bound = 1

    # Initialize actor and critic networks
    self.actor = Actor(actor_lr, self.observation_shape, self.action_shape, bound)
    self.critic = Critic(critic_lr, self.observation_shape, self.action_shape)

    # Initialize target actor and target critic networks as copies of the online ones
    self.target_actor = Actor(actor_lr, self.observation_shape, self.action_shape, bound)
    self.target_critic = Critic(critic_lr, self.observation_shape, self.action_shape)
    self.target_actor.set_weights(self.actor.get_weights())
    self.target_critic.set_weights(self.critic.get_weights())

    # Create checkpoint for each object
    self.actor_cpt = tf.train.Checkpoint(model=self.actor.model)
    self.critic_cpt = tf.train.Checkpoint(model=self.critic.model)
    self.t_actor_cpt = tf.train.Checkpoint(model=self.target_actor.model)
    self.t_critic_cpt = tf.train.Checkpoint(model=self.target_critic.model)

    # Paths for pretrained weights
    actor_cpt_dir = f"./{env_name}/actor"
    critic_cpt_dir = f"./{env_name}/critic"
    target_actor_cpt_dir = f"./{env_name}/target_actor"
    target_critic_cpt_dir = f"./{env_name}/target_critic"

    self.actor_cpt_manager = tf.train.CheckpointManager(self.actor_cpt, actor_cpt_dir, max_to_keep=5)
    self.critic_cpt_manager = tf.train.CheckpointManager(self.critic_cpt, critic_cpt_dir, max_to_keep=5)
    self.t_actor_cpt_manager = tf.train.CheckpointManager(self.t_actor_cpt, target_actor_cpt_dir, max_to_keep=5)
    self.t_critic_cpt_manager = tf.train.CheckpointManager(self.t_critic_cpt, target_critic_cpt_dir, max_to_keep=5)

    # Load pretrained weights if any
    self.init_weights()


  def checkpoint_name(self, ckpt_dir):
    """Return the checkpoint file prefix ("ckpt") inside `ckpt_dir`."""
    return os.path.join(ckpt_dir, "ckpt")


  def restore(self, ckpt, manager):
    """Restore `ckpt` from the manager's latest checkpoint, if one exists.

    A successful restore also sets `continue_training`, which makes the training
    loop skip the initial random-action phase.
    """
    if(manager.latest_checkpoint):
        print("Restoring from latest checkpoint...")
        ckpt.restore(manager.latest_checkpoint)
        self.continue_training = True
    else:
        print("Starting from scratch...")


  def init_weights(self):
    """Try to restore all four networks from their checkpoint directories."""
    self.restore(self.actor_cpt, self.actor_cpt_manager)
    self.restore(self.critic_cpt, self.critic_cpt_manager)
    self.restore(self.t_actor_cpt, self.t_actor_cpt_manager)
    self.restore(self.t_critic_cpt, self.t_critic_cpt_manager)


  def save_weights(self):
    """Write a new checkpoint for each of the four networks."""
    self.actor_cpt_manager.save()
    self.critic_cpt_manager.save()
    self.t_actor_cpt_manager.save()
    self.t_critic_cpt_manager.save()


  def update_target(self):
    """Soft-update both target networks towards the online networks.

    For every weight array: theta_target = tau * theta + (1 - tau) * theta_target.
    """
    blended_actor = [
        self.tau * online + (1 - self.tau) * target
        for online, target in zip(self.actor.get_weights(), self.target_actor.get_weights())
    ]
    blended_critic = [
        self.tau * online + (1 - self.tau) * target
        for online, target in zip(self.critic.get_weights(), self.target_critic.get_weights())
    ]

    self.target_actor.set_weights(blended_actor)
    self.target_critic.set_weights(blended_critic)


  def compute_target_critic_value(self, next_states, rewards, dones, target_actor, target_critic):
    """Compute the critic's regression targets.

    y = reward + gamma * (1 - done) * Q_target(next_state, pi_target(next_state))

    Args:
      next_states: batch of successor states.
      rewards: batch of rewards.
      dones: batch of termination flags (boolean or numeric).
      target_actor: network providing `predict(states)`.
      target_critic: network providing `predict(states, actions)`.

    Returns:
      One target value per sample.
    """

    # Action resulting from target policy
    predicted_actions = target_actor.predict(next_states)

    # Target critic value
    target_critic_prediction = tf.squeeze(target_critic.predict(next_states, predicted_actions))

    # Cast explicitly so boolean flags sampled from the buffer take part in float32
    # tensor arithmetic; terminal transitions get no bootstrapped value.
    not_done = 1.0 - tf.cast(dones, tf.float32)

    y = rewards + self.gamma * not_done * target_critic_prediction

    return y


  def ou_noise(self, x, rho=0.15, mu=0, sigma=0.2, dt=0.001, dim=1):
    """Advance the exploration noise `x` by one step:
    x + rho * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1), with `dim` independent draws."""
    return x + rho * (mu-x) * dt + sigma * np.sqrt(dt) * np.random.randn(dim)


  def explore(self, state, bg_noise):
    """Choose a noisy action for `state`.

    Args:
      state: a single observation (without batch axis).
      bg_noise: previous value of the exploration noise.

    Returns:
      (action, noise): the policy action plus noise, clipped to [-1, 1], and the
      updated noise value to pass to the next call.
    """
    bound = 1

    # Get exploration noise
    noise = self.ou_noise(bg_noise, dim=self.action_shape[0])

    # Predict action for current state. The observation is cast to float32 so the
    # network sees the same dtype here as during training in replay().
    state_batch = np.asarray([state], dtype=np.float32)
    action = self.actor.model(state_batch, training=False)[0]
    action = np.clip(action + noise, -bound, bound)

    # Return sum of action + noise
    return action, noise


  def replay(self):
    """Sample one batch and update critic, actor and target networks.

    Returns:
      (actor_loss, critic_loss) of this update.
    """
    states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)

    # Convert to tensor
    states = tf.convert_to_tensor(states, dtype=tf.float32)
    actions = tf.convert_to_tensor(actions, dtype=tf.float32)
    rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
    next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)

    # Compute target critic
    y = self.compute_target_critic_value(next_states, rewards, dones, self.target_actor, self.target_critic)

    # Update Critic
    critic_loss = self.critic.train(states, actions, y)

    # Update Actor
    actor_loss = self.actor.train(states, self.critic)

    # Update Target Actor and Critic
    self.update_target()

    return actor_loss, critic_loss


  def train(self):
    """Run episodes until one million environment steps have been taken.

    Each step is stored in the replay buffer and logged to the module-level wandb
    `run`. After `train_start` global steps, the networks are updated every 5th
    step of an episode and checkpointed every 20th step of an episode.
    """
    global_steps = 0
    while(global_steps < 1e6):

      # Receive initial observation state
      # NOTE(review): the same seed is passed on every reset, which re-seeds the
      # environment each episode — confirm that identical episode starts are intended.
      state = self.env.reset(seed=42)[0]
      # Move the first axis of the observation to the last position.
      state = np.transpose(state, axes=(1, 2, 0))

      episode_reward, done, truncated = 0, False, False
      steps = 0

      bg_noise = np.zeros(self.action_shape[0])

      while not (done or truncated):

        # Take action according to the current policy and exploration noise
        if(global_steps < self.start_steps and not self.continue_training):
          # Uniform-random action selection, before running real policy. Helps exploration.
          action = self.env.action_space.sample()
        else:
          action, bg_noise = self.explore(state, bg_noise)

        # Execute action a and observe reward and transition
        next_state, reward, done, truncated, info = self.env.step(action)
        next_state = np.transpose(next_state, axes=(1, 2, 0))

        # Add the (state, action, reward, next_state, done) to the replay buffer.
        # Only `done` is stored, so truncated episodes still bootstrap.
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Update current state
        state = next_state

        # Update episodic reward
        episode_reward += reward

        global_steps += 1
        steps += 1

        run.log({"Reward": reward})

        training_started = global_steps > self.train_start
        # Sampling needs at least one full batch of stored transitions.
        enough_samples = len(self.replay_buffer) >= self.batch_size

        if(training_started and enough_samples and (steps % 5 == 0)):
          actor_loss, critic_loss = self.replay()
          run.log({"Actor Loss": actor_loss, "Critic Loss": critic_loss})

        if(training_started and steps % 20 == 0):
          print("Saving models...")
          self.save_weights()
          print("Done !")

      # Log Episodic Reward to WandB
      run.log({"Episode length": steps, "Episode Reward": episode_reward})

In [None]:
def train_agent(env_name=ENV_NAME, actor_lr=0.0001, critic_lr=0.001,
                batch_size=16, buffer_size=1000000, gamma=0.99, tau=0.001, 
                train_start=1000, start_steps=1000):
    """Build an Agent with the given hyperparameters and train it under '/GPU:0'.

    A KeyboardInterrupt raised while building or training the agent is caught and
    reported with a message, so the cell can be stopped manually without a traceback.
    """
    # Positional order expected by Agent.__init__ (tau comes before gamma there).
    agent_args = (env_name, actor_lr, critic_lr, batch_size, buffer_size,
                  tau, gamma, train_start, start_steps)
    try:
        with tf.device('/GPU:0'):
            Agent(*agent_args).train()
    except KeyboardInterrupt:
        print("Terminating training process...")

In [None]:
# Start training with the default hyperparameters declared in train_agent's signature.
train_agent()