In [None]:
import tensorflow as tf
# List every device TensorFlow can see (no device-type filter is applied).
physical_devices = tf.config.list_physical_devices()

print(physical_devices)
# If you have a GPU, uncomment the line below so TensorFlow allocates GPU memory
# on demand instead of reserving it all up front.
# NOTE(review): index 1 assumes the GPU is the second entry of the list printed
# above - check the output and adjust the index before uncommenting.
# tf.config.experimental.set_memory_growth(physical_devices[1], True) 

In [None]:
from copy import copy
import gym
from gym_pybullet_drones.envs.BaseAviary import DroneModel, Physics
from collections import OrderedDict
from Base import MyAviary

import matplotlib.pyplot as plt
import numpy as np
import math
import random
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, LayerNormalization
from collections import deque
import os
from tensorflow.keras.optimizers import Adam

In [None]:
# Width of the state vector: used as the state input shape of the actor and
# critics and as the state width of the replay memory.
input_size=18
# Width of the action vector: the actor's output size, the critics' action
# input shape and the action width of the replay memory.
output_size=4

In [None]:
tau = 0.005            # soft target update rate: target = tau * online + (1 - tau) * target
gamma = 0.99             # discount factor

actor_lr = 1e-4    # learning rate of the actor optimizer
critic_lr = 1e-4   # learning rate of the optimizer shared by both critics

memory_size = int(1e6)   # capacity of the replay memory, in transitions
minibatch_size = 256     # transitions sampled per learning step

num_episodes = 10_000   # max episodes for training
num_steps=1_000          # max number of steps in an episode

noise_std=0.2      # stddev of the Gaussian noise added to actions for exploration
start_after = 2_000     # the training loop samples random actions until global_step reaches this value
update_after = 1_000    # the training loop calls agent.learn() only once global_step exceeds this value


In [None]:
# TensorBoard for monitoring the training process.
# Each run writes to its own timestamped directory under logs/train_DDPG/.

import datetime
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = "logs/train_DDPG/" + current_time
# Summary writer used by Agent.learn() and by the training loop to log scalars.
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
# IPython magics: load the TensorBoard notebook extension and open it on this run's log directory.
%load_ext tensorboard
%tensorboard --logdir {train_log_dir}

In [None]:
class ReplayMemory:
    """Fixed-capacity ring buffer of transitions backed by preallocated float32 arrays."""

    def __init__(self, input_size, output_size, size):
        """Allocate storage for `size` transitions.

        Args:
            input_size: width of a state vector.
            output_size: width of an action vector.
            size: maximum number of transitions kept before old ones are overwritten.
        """
        self.max_size = size
        self.size = 0        # number of valid transitions currently stored
        self.pointer = 0     # index of the slot the next transition is written to

        self.states = np.zeros((size, input_size), dtype=np.float32)
        self.actions = np.zeros((size, output_size), dtype=np.float32)
        self.rewards = np.zeros(size, dtype=np.float32)
        self.next_states = np.zeros((size, input_size), dtype=np.float32)
        self.dones = np.zeros(size, dtype=np.float32)

    def remember(self, state, action, reward, next_state, done):
        """Write one transition at the current slot, overwriting the oldest entry once full."""
        slot = self.pointer
        self.states[slot] = state
        self.actions[slot] = action
        self.rewards[slot] = reward
        self.next_states[slot] = next_state
        self.dones[slot] = done

        # Advance the write position, wrapping around at capacity.
        self.pointer = (slot + 1) % self.max_size
        if self.size < self.max_size:
            self.size += 1

    def sample(self, minibatch_size = 32):
        """Return a uniformly sampled (with replacement) minibatch.

        Returns:
            Tuple (states, actions, rewards, next_states, dones), each indexed
            by the same randomly drawn positions among the stored transitions.
        """
        picks = np.random.randint(0, self.size, size=minibatch_size)
        columns = (self.states, self.actions, self.rewards, self.next_states, self.dones)
        return tuple(column[picks] for column in columns)
                
    

In [None]:
class Agent:
    """Actor-critic agent with twin critics, target networks and delayed actor updates.

    The agent owns an actor, two critics, a target network for each of them, one
    Adam optimizer for the actor and one shared by both critics, and a
    ReplayMemory. Hyperparameters (tau, gamma, actor_lr, critic_lr, memory_size,
    minibatch_size, noise_std, input_size, output_size) are read from
    module-level globals, and losses are logged through the module-level
    train_summary_writer.
    """

    def __init__(self):
        """Build the networks, their targets, the optimizers and the replay memory."""
        self.actor = self.build_actor()
        self.critic_1 = self.build_critic()
        self.critic_2 = self.build_critic()

        # Each target is a separately built network initialised with the online
        # weights. A shallow copy.copy() of a Keras model may share its layers
        # and variables with the source, which would make the target track the
        # online network exactly and turn the soft update into a no-op.
        self.target_actor = self._build_target(self.build_actor, self.actor)
        self.target_critic_1 = self._build_target(self.build_critic, self.critic_1)
        self.target_critic_2 = self._build_target(self.build_critic, self.critic_2)

        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.actor_optimizer = Adam(learning_rate=actor_lr)
        self.critic_optimizer = Adam(learning_rate=critic_lr)

        self.memory = ReplayMemory(input_size, output_size, memory_size)
        self.learn_count = 0          # number of critic updates performed so far
        self.noise_std = noise_std    # stddev of the exploration noise used by act()
        self.actor_update_itr = 2     # actor/targets are updated every this many critic updates

    @staticmethod
    def _build_target(builder, source):
        """Build a fresh network with `builder` and copy the weights of `source` into it."""
        target = builder()
        target.set_weights(source.get_weights())
        return target

    @staticmethod
    def _soft_update(target, source):
        """Blend `source` weights into `target`: target <- tau * source + (1 - tau) * target."""
        blended = [
            weight * tau + target_weight * (1 - tau)
            for weight, target_weight in zip(source.get_weights(), target.get_weights())
        ]
        target.set_weights(blended)

    def build_actor(self):
        """Return the policy network: state -> action in [-1, 1] (tanh output).

        Two hidden Dense layers (400 and 300 units) with tanh activations; the
        output layer is initialised uniformly in [-3e-4, 3e-4].
        """
        final_init = tf.random_uniform_initializer(minval=-3e-4, maxval=3e-4)

        # The shape is passed as a tuple, which every Keras version accepts.
        inputs = Input(shape = (input_size,))
        x = Dense(400)(inputs)
        x = tf.nn.tanh(x)
        x = Dense(300)(x)
        x = tf.nn.tanh(x)
        x = Dense(output_size, kernel_initializer=final_init)(x)
        x = tf.nn.tanh(x)

        model = Model(inputs = inputs, outputs = x)

        return model

    def build_critic(self):
        """Return a Q-network: [state, action] -> scalar value (linear output).

        State and action are concatenated along the feature axis and passed
        through two hidden Dense layers (400 and 300 units) with tanh activations.
        """
        final_init = tf.random_uniform_initializer(minval=-3e-4, maxval=3e-4)

        input1 = Input(shape = (input_size,))
        input2 = Input(shape = (output_size,))
        x = Dense(400)(tf.concat([input1, input2], axis =1))
        x = tf.nn.tanh(x)
        x = Dense(300)(x)
        x = tf.nn.tanh(x)
        x = Dense(1, activation = 'linear', kernel_initializer=final_init)(x)

        model = Model(inputs = [input1, input2], outputs = x)

        return model

    def remember(self,state,action,reward,next_state,done):
        """Store one transition in the replay memory."""
        # ReplayMemory exposes remember(), not append().
        self.memory.remember(state, action, reward, next_state, done)

    @tf.function
    def act(self, state, test=False):
        """Return the actor's action for a batch of states.

        Args:
            state: batch of states, one row per state.
            test: when True the deterministic actor output is returned; otherwise
                Gaussian noise (stddev self.noise_std, clipped to [-0.5, 0.5]) is
                added and the result is clipped to [-1, 1].
        """
        act_value = self.actor(state)
        if test:
            return act_value
        else:
            # tf.shape keeps this valid even when the batch size is not static.
            noise = tf.random.normal(shape = tf.shape(act_value), mean=0.0, stddev = self.noise_std)
            act_value += tf.clip_by_value(noise, -0.5, 0.5)
            return tf.clip_by_value(act_value, -1, 1)

    @tf.function
    def critic_learn(self, states, actions, rewards, next_states, dones):
        """Run one gradient step on both critics and return the summed MSE loss.

        The regression target is
            rewards + gamma * (1 - dones) * min(target_Q1, target_Q2)
        evaluated at the target actor's action for next_states, perturbed by
        clipped Gaussian noise (stddev 0.2, clipped to [-0.5, 0.5]).
        """
        # Make rewards/dones column vectors so they broadcast against the (batch, 1) Q-values.
        rewards= tf.expand_dims(rewards, axis=1)
        dones = tf.expand_dims(dones, axis=1)

        next_actions = self.target_actor(next_states)
        smoothing_noise = tf.random.normal(shape = tf.shape(next_actions), mean = 0.0, stddev = 0.2)
        next_actions += tf.clip_by_value(smoothing_noise, -0.5, 0.5)
        next_actions = tf.clip_by_value(next_actions, -1, 1)
        next_q_1 = self.target_critic_1([next_states, next_actions])
        next_q_2 = self.target_critic_2([next_states, next_actions])
        next_values = tf.math.minimum(next_q_1, next_q_2)
        # Computed outside the tape, so no gradient flows into the target networks.
        target_q_values = rewards + gamma * (1-dones) * next_values

        # Both critics are updated with a single gradient/apply call: each term of
        # the loss depends on one critic only, so the gradients are the same as
        # computing them separately, no persistent tape has to be kept alive, and
        # the shared optimizer advances exactly one step per update.
        critic_weights = self.critic_1.trainable_weights + self.critic_2.trainable_weights
        with tf.GradientTape() as tape:
            pred_q_1 = self.critic_1([states, actions])
            pred_q_2 = self.critic_2([states, actions])
            critic_loss = tf.reduce_mean((target_q_values - pred_q_1)**2) + tf.reduce_mean((target_q_values - pred_q_2)**2)
        critic_grads = tape.gradient(critic_loss, critic_weights)
        self.critic_optimizer.apply_gradients(zip(critic_grads, critic_weights))
        return critic_loss

    @tf.function
    def actor_learn(self, states, actions, rewards, next_states, dones):
        """Run one gradient step on the actor and return its loss.

        The loss is the negated mean of critic_1's value for the actor's own
        actions. Only `states` is used; the other arguments are accepted so the
        call mirrors critic_learn().
        """
        with tf.GradientTape() as tape:
            actions_pred = self.actor(states)
            actor_loss = self.critic_1([states, actions_pred])
            actor_loss = -tf.reduce_mean(actor_loss)
        actor_grads = tape.gradient(actor_loss, self.actor.trainable_weights)
        self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_weights))
        return actor_loss

    def learn(self):
        """Sample a minibatch and update the critics; periodically update actor and targets.

        Does nothing until the replay memory holds at least minibatch_size
        transitions. The actor and the target networks are updated only on every
        actor_update_itr-th call that performs a critic update.
        """
        # Use this instance's memory rather than the module-level `agent` global.
        if self.memory.size<minibatch_size:
            return
        states, actions, rewards, next_states, dones = self.memory.sample(minibatch_size)
        critic_loss = self.critic_learn(states, actions, rewards, next_states, dones)
        self.learn_count += 1

        with train_summary_writer.as_default():
            tf.summary.scalar("Charts/critic_loss", critic_loss, step=self.learn_count)

        if self.learn_count%self.actor_update_itr != 0:
            return

        actor_loss = self.actor_learn(states, actions, rewards, next_states, dones)

        with train_summary_writer.as_default():
            tf.summary.scalar("Charts/actor_loss", actor_loss, step=self.learn_count)

        self.update_target_weights()

    def update_target_weights(self):
        """Soft-update all three target networks towards their online networks."""
        self._soft_update(self.target_actor, self.actor)
        self._soft_update(self.target_critic_1, self.critic_1)
        self._soft_update(self.target_critic_2, self.critic_2)

    def load(self, name):
        """Load weights saved by save() into the online networks and their targets.

        Args:
            name: file-name prefix; "actor.hdf5", "critic_1.hdf5" and
                "critic_2.hdf5" are appended to it.
        """
        self.actor.load_weights(name + "actor.hdf5")
        self.critic_1.load_weights(name + "critic_1.hdf5")
        self.critic_2.load_weights(name + "critic_2.hdf5")

        # Targets start from the same saved weights as the online networks.
        self.target_actor.load_weights(name + "actor.hdf5")
        self.target_critic_1.load_weights(name + "critic_1.hdf5")
        self.target_critic_2.load_weights(name + "critic_2.hdf5")

    def save(self, name):
        """Save the online networks' weights (target networks are not saved).

        Args:
            name: file-name prefix; "actor.hdf5", "critic_1.hdf5" and
                "critic_2.hdf5" are appended to it.
        """
        self.actor.save_weights(name + "actor.hdf5")
        self.critic_1.save_weights(name + "critic_1.hdf5")
        self.critic_2.save_weights(name + "critic_2.hdf5")

In [None]:
# Create the agent: builds the actor, both critics, their target networks,
# the optimizers and the replay memory.
agent = Agent()

In [None]:
# Rates handed to the environment: `freq` receives simulation_freq, and
# control_freq determines how many physics steps are aggregated per action.
simulation_freq = 240
control_freq = 48
# Number of physics steps executed in one env.step call.
physics_step = simulation_freq // control_freq

In [None]:
# Sampling bounds for the initial pose (read by init_pos_att).
xy_l, xy_h = -1.0, 1.0    # range of the initial x and y coordinates
z_l, z_h = 0.1, 2         # z_l is the initial height; z_h is not referenced in the visible cells

# Initial roll/pitch/yaw range: +/- 8 degrees expressed in radians.
rpy_l = -8 * np.pi/180
rpy_h =  8 * np.pi/180

# Not referenced in the visible cells.
change_target = 500


In [None]:
def init_pos_att():
    """Draw a random initial pose.

    Returns:
        (position, attitude): position is a (1, 3) array whose x and y are
        uniform in [xy_l, xy_h) and whose z equals z_l; attitude is a (1, 3)
        array with each entry uniform in [rpy_l, rpy_h).
    """
    start_xy = np.random.uniform(low=xy_l, high=xy_h, size=(1,2))
    start_z = z_l * np.ones(shape=(1,1))
    start_xyz = np.concatenate([start_xy, start_z], axis =1)
    # Drawn after the xy sample, so the random stream is consumed in the same order.
    start_rpy = np.random.uniform(low=rpy_l, high=rpy_h, size=(1,3))
    return start_xyz, start_rpy

def generate_target():
    """Draw a random target position.

    Returns:
        Array of shape (3,): x and y uniform in [-1.0, 1.0), z uniform in [1, 2).
    """
    xy_bounds = (-1.0, 1.0)
    z_bounds = (1, 2)

    # x/y are drawn first, then z, as two separate draws from the global NumPy RNG.
    target_xy = np.random.uniform(low=xy_bounds[0], high=xy_bounds[1], size=(2,))
    target_z = np.random.uniform(low=z_bounds[0], high=z_bounds[1], size=(1,))
    return np.concatenate([target_xy, target_z], axis =0)



## Training
For training, first uncomment the cell below and then run it. It will take some time depending on the machine being used.

In [None]:
# #Training 
# best_score = -np.inf
# best_test_score = -np.inf
# score_history = deque(maxlen=100)
# position_err = np.inf
# global_step = 1

# target_position= generate_target()
# print("Target:", target_position)


# for ep in range(1, 1+num_episodes):
    
#     INIT_COOR, INIT_ORIENT = init_pos_att()
    
  
#     env = MyAviary(drone_model=DroneModel.CF2X,
#                  initial_xyzs=INIT_COOR,
#                  initial_rpys=INIT_ORIENT,
#                  physics=Physics.PYB,
#                  freq = simulation_freq,
#                  aggregate_phy_steps=physics_step,
#                  gui=False,
#                  record=False)       
          
    
    
#     env.reset()
    
#     env._addTarget(target_position, visual = False) 
    
#     state = env._computeObs()
#     next_state = state
    
#     done = False
#     score = 0
#     step=0

#     for step in range(num_steps): 
#         position_err = np.linalg.norm(next_state[0:3])
#         if position_err<0.05:
#             with train_summary_writer.as_default():
#                 tf.summary.scalar("Charts/position_err", position_err, global_step)
            
#             target_position = generate_target()
#             env._addTarget(target_position, visual = False) 
#             print("New Target:", target_position)
           
#         if global_step<start_after:
#             action = env.action_space.sample()
#         else:
#             action = agent.act(np.expand_dims(state, axis = 0))
#             action = np.squeeze(action) 

#         next_state, reward, done, _ =  env.step(action)
#         score += reward
#         global_step += 1
 
        
        
#         agent.memory.remember(state, action, reward, next_state, done)
#         state = next_state
        
#         if global_step>update_after:
#             agent.learn()
            
#         if done:            
#             env.close()
#             break
         
#     if not done:
#         env.close()

#     with train_summary_writer.as_default():
#         tf.summary.scalar("Charts/score", score, ep)
#         tf.summary.scalar("Charts/episode_length", step, ep)
#         tf.summary.scalar("Charts/exploration", agent.noise_std, ep)
        
        
#     score_history.append(score)
#     avg_score = np.mean(score_history)
#     print("\n")
#     print(f"Episode: {ep}, Len: {step}, Score: {score}, Avg Score: {avg_score}")  
#     print("\n")
#     if avg_score > best_score:
#         best_score = avg_score  # keep the running average, matching the comparison above
#         agent.save("DDPG_") 
        
#     if ep%25==0:
#         agent.save("Latest_")
        
#         test_target_position= generate_target()
#         print(f"Testing")
#         print("Test Target", test_target_position)
#         for _ in range(2):
#             print("\n")
            
#             INIT_COOR, INIT_ORIENT  = init_pos_att()
            
#             env = MyAviary(drone_model=DroneModel.CF2X,
#                  initial_xyzs=INIT_COOR,
#                  initial_rpys=INIT_ORIENT,
#                  physics=Physics.PYB,
#                  freq = simulation_freq,
#                  aggregate_phy_steps=physics_step,
#                  gui=False,
#                  record=False)
            
            
            
#             env.reset()
            
#             env._addTarget(test_target_position, visual = False)
            
#             state = env._computeObs()
#             next_state = state
#             done = False
#             test_score=0

#             for test_step in range(20_000):

#                 if np.linalg.norm(next_state[0:3])<0.05:
#                     test_target_position = generate_target()
#                     env._addTarget(test_target_position, visual = False)
#                     print("New Test Target", test_target_position)
                
#                 action = agent.act(np.expand_dims(state, axis =0), test=True)
#                 next_state, reward, done, _ = env.step(np.squeeze(action))
                
#                 test_score+= reward
                
#                 state=next_state
                
#                 if done:
#                     env.close()
#                     break
                
#             print("\n")
#             print(f"Episode ended with length {test_step} and a score of {test_score}")
#             print("\n")
            
#         if test_score>best_test_score:
#             best_test_score = test_score
#             agent.save("BestTest_")

## Testing
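Run the cells below to evaluate the trained agent. They load the saved `BestTest_` weights (`BestTest_actor.hdf5`, `BestTest_critic_1.hdf5`, `BestTest_critic_2.hdf5`), so those files must be present in the working directory.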

In [None]:
# Restore the actor, both critics and their targets from BestTest_actor.hdf5,
# BestTest_critic_1.hdf5 and BestTest_critic_2.hdf5.
agent.load("BestTest_")

In [None]:
    # Testing: fly one episode with the deterministic policy (no exploration
    # noise), with the GUI and recording enabled. Every time the position error
    # drops below 0.06 the step and error are logged and a new target is drawn.
    time_arr = []            # steps at which a target was reached
    position_err_arr = []    # position error measured at those steps
    target_position= generate_target()
    done = False
    print(f"Target: {target_position}")
    INIT_COOR, INIT_ORIENT = init_pos_att()

    env = MyAviary(drone_model=DroneModel.CF2X,
                 initial_xyzs=INIT_COOR,
                 initial_rpys=INIT_ORIENT,
                 physics=Physics.PYB,
                 freq = simulation_freq,
                 aggregate_phy_steps=physics_step,
                 gui=True,
                 record=True)

    env.reset()
    env._addTarget(target_position)
    state = env._computeObs()
    next_state =  state

    score=0

    for t in range(1, 25_000):
        # NOTE(review): the first three observation entries are presumably the
        # position error relative to the target - confirm against MyAviary._computeObs.
        position_err = np.linalg.norm(next_state[0:3])

        if position_err<0.06:
            time_arr.append(t)
            position_err_arr.append(position_err)
            plt.scatter(t, position_err)
            print("Position Error", position_err)
            target_position = generate_target()
            env._addTarget(target_position)

        action = agent.act(np.expand_dims(state, axis =0), test=True)
        next_state, reward, done, _ = env.step(np.squeeze(action))
        score+= reward
        state = next_state
        # `t` is the 1-based step counter supplied by the for loop. It is not
        # incremented by hand: doing so made the printed step and the final step
        # count one too high and inconsistent with the values stored in time_arr.
        print(f"Step:{t}, Action:{action}, Position Error:{position_err}, Reward:{reward}")
        if done:
            env.close()
            break

    print()
    print(f"Episode ended in {t} steps with score of {score} \n")
    print()
    plt.plot(time_arr, position_err_arr)
    plt.show()

In [None]:
# Scatter plot of the position error logged each time a target was reached
# during the test episode, against the step at which it was logged.
plt.scatter(time_arr, position_err_arr)
plt.ylabel("Position Error")
plt.xlabel("Timesteps")
plt.show()

In [None]:
# Close the test environment.
# NOTE(review): the test loop already calls env.close() when the episode ends
# with done=True, so this can be a second close on the same environment -
# confirm that MyAviary.close() tolerates being called twice.
env.close()